Blame - fs/io-wq.c - hafnium/third_party/linux

blob: 3d5fc76b92d0143f121fc91b43dc8fbefeca92df [file] [log] [blame]

Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Basic worker thread pool for io_uring
				4	*
				5	* Copyright (C) 2019 Jens Axboe
				6	*
				7	*/
				8	#include <linux/kernel.h>
				9	#include <linux/init.h>
				10	#include <linux/errno.h>
				11	#include <linux/sched/signal.h>
				12	#include <linux/mm.h>
				13	#include <linux/sched/mm.h>
				14	#include <linux/percpu.h>
				15	#include <linux/slab.h>
				16	#include <linux/kthread.h>
				17	#include <linux/rculist_nulls.h>
				18	#include <linux/fs_struct.h>
				19	#include <linux/task_work.h>
				20	#include <linux/blk-cgroup.h>
				21	#include <linux/audit.h>
				22	#include <linux/cpu.h>
				23
				24	#include "../kernel/sched/sched.h"
				25	#include "io-wq.h"
				26
					/* Idle timeout passed to schedule_timeout() by the worker loop */
				27	#define WORKER_IDLE_TIMEOUT (5 * HZ)
				28
					/* Bit flags kept in io_worker->flags */
				29	enum {
				30	IO_WORKER_F_UP = 1, /* up and active */
				31	IO_WORKER_F_RUNNING = 2, /* account as running */
				32	IO_WORKER_F_FREE = 4, /* worker on free list */
				33	IO_WORKER_F_FIXED = 8, /* static idle worker */
				34	IO_WORKER_F_BOUND = 16, /* is doing bounded work */
				35	};
				36
					/* Bit numbers used with set_bit()/test_bit() on io_wq->state */
				37	enum {
				38	IO_WQ_BIT_EXIT = 0, /* wq exiting */
				39	IO_WQ_BIT_CANCEL = 1, /* cancel work on list */
				40	IO_WQ_BIT_ERROR = 2, /* error on setup */
				41	};
				42
					/* Bit flags kept in io_wqe->flags */
				43	enum {
				44	IO_WQE_FLAG_STALLED = 1, /* stalled on hash */
				45	};
				46
				47	/*
				48	 * One for each thread in a wqe pool
				49	 */
				50	struct io_worker {
				51	refcount_t ref; /* taken by io_worker_get(), dropped by io_worker_release() */
				52	unsigned flags; /* IO_WORKER_F_* */
				53	struct hlist_nulls_node nulls_node; /* entry on wqe->free_list */
				54	struct list_head all_list; /* entry on wqe->all_list */
				55	struct task_struct *task; /* kthread running io_wqe_worker() */
				56	struct io_wqe *wqe; /* owning per-node pool */
				57
				58	struct io_wq_work *cur_work; /* work being run; accessed under ->lock */
				59	spinlock_t lock; /* protects ->cur_work */
				60
				61	struct rcu_head rcu; /* used by kfree_rcu() on exit */
				62	struct mm_struct *mm; /* mm currently adopted via kthread_use_mm(), if any */
				63	#ifdef CONFIG_BLK_CGROUP
				64	struct cgroup_subsys_state *blkcg_css; /* blkcg currently associated, if any */
				65	#endif
				66	const struct cred *cur_creds; /* creds currently overridden to */
				67	const struct cred *saved_creds; /* creds to revert to in __io_worker_unuse() */
				68	struct files_struct *restore_files; /* worker's own ->files, saved at start */
				69	struct nsproxy *restore_nsproxy; /* worker's own ->nsproxy, saved at start */
				70	struct fs_struct *restore_fs; /* worker's own ->fs, saved at start */
				71	};
				72
					/*
					 * Hashed work is spread over 2^IO_WQ_HASH_ORDER buckets. Each bucket has
					 * one bit in the unsigned long io_wqe->hash_map, so the order follows
					 * BITS_PER_LONG.
					 */
				73	#if BITS_PER_LONG == 64
				74	#define IO_WQ_HASH_ORDER 6
				75	#else
				76	#define IO_WQ_HASH_ORDER 5
				77	#endif
				78
				79	#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
				80
					/* Worker accounting for one class (bound or unbound) of a wqe */
				81	struct io_wqe_acct {
				82	unsigned nr_workers; /* workers currently accounted to this class */
				83	unsigned max_workers; /* limit checked before asking for a new worker */
				84	atomic_t nr_running; /* workers currently accounted as running */
				85	};
				86
					/* Indices into io_wqe->acct[] */
				87	enum {
				88	IO_WQ_ACCT_BOUND,
				89	IO_WQ_ACCT_UNBOUND,
				90	};
				91
				92	/*
				93	 * Per-node worker thread pool
				94	 */
				95	struct io_wqe {
				96	struct {
				97	raw_spinlock_t lock; /* taken around work_list, hash_map and flags updates */
				98	struct io_wq_work_list work_list; /* pending work */
				99	unsigned long hash_map; /* one bit per hash bucket that is currently running */
				100	unsigned flags; /* IO_WQE_FLAG_* */
				101	} ____cacheline_aligned_in_smp;
				102
				103	int node; /* NUMA node used for allocation and CPU binding */
				104	struct io_wqe_acct acct[2]; /* indexed by IO_WQ_ACCT_BOUND / IO_WQ_ACCT_UNBOUND */
				105
				106	struct hlist_nulls_head free_list; /* idle workers, walked under RCU */
				107	struct list_head all_list; /* every worker of this pool, walked under RCU */
				108
				109	struct io_wq *wq; /* owning io_wq */
				110	struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; /* last pending work per hash bucket */
				111	};
				112
				113	/*
				114	 * Per io_wq state
				115	 */
				116	struct io_wq {
				117	struct io_wqe **wqes; /* per-node pools, indexed by node id */
				118	unsigned long state; /* IO_WQ_BIT_* bits */
				119
				120	free_work_fn *free_work; /* called on a work item after it has run */
				121	io_wq_work_fn *do_work; /* runs a work item, returns linked work or NULL */
				122
				123	struct task_struct *manager; /* thread running io_wq_manager() */
				124	struct user_struct *user; /* ->processes is adjusted for unbound workers */
				125	refcount_t refs; /* manager plus one per created worker */
				126	struct completion done; /* completed by the manager and on last ->refs drop */
				127
				128	struct hlist_node cpuhp_node; /* NOTE(review): presumably CPU hotplug linkage; not used in this section */
				129
				130	refcount_t use_refs; /* not referenced in this section of the file */
				131	};
				132
					/* NOTE(review): presumably the dynamic CPU hotplug state id; its users are not in this section */
				133	static enum cpuhp_state io_wq_online;
				134
					/*
					 * Try to take a reference on @worker. Returns false if the refcount has
					 * already dropped to zero, in which case the worker must not be used.
					 */
				135	static bool io_worker_get(struct io_worker *worker)
				136	{
				137	return refcount_inc_not_zero(&worker->ref);
				138	}
				139
					/*
					 * Drop a reference taken with io_worker_get(). Whoever drops the last
					 * reference wakes the worker task, which sleeps in io_worker_exit()
					 * until the count reaches zero.
					 */
				140	static void io_worker_release(struct io_worker *worker)
				141	{
				142		if (!refcount_dec_and_test(&worker->ref))
				143			return;
						wake_up_process(worker->task);
				144	}
				145
				146	/*
				147	 * Note: drops the wqe->lock if returning true! The caller must re-acquire
				148	 * the lock in that case. Some callers need to restart handling if this
				149	 * happens, so we can't just re-acquire the lock on behalf of the caller.
					 *
					 * Puts the worker thread back into its own context after it ran work on
					 * behalf of a submitter: reverts credentials, restores files/nsproxy and
					 * fs, drops the borrowed mm, clears the blkcg association and resets the
					 * RLIMIT_FSIZE soft limit to RLIM_INFINITY.
				150	 */
				151	static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
				152	{
				153		bool dropped_lock = false;
				154
				155		if (worker->saved_creds) {
				156			revert_creds(worker->saved_creds);
				157			worker->cur_creds = worker->saved_creds = NULL;
				158		}
				159
				160		if (current->files != worker->restore_files) {
				161			__acquire(&wqe->lock);
				162			raw_spin_unlock_irq(&wqe->lock);
				163			dropped_lock = true;
				164
				165			task_lock(current);
				166			current->files = worker->restore_files;
				167			current->nsproxy = worker->restore_nsproxy;
				168			task_unlock(current);
				169		}
				170
				171		if (current->fs != worker->restore_fs)
				172			current->fs = worker->restore_fs;
				173
				174		/*
				175		 * If we have an active mm, we need to drop the wq lock before unusing
				176		 * it. If we do, return true and let the caller retry the idle loop.
				177		 */
				178		if (worker->mm) {
				179			if (!dropped_lock) {
				180				__acquire(&wqe->lock);
				181				raw_spin_unlock_irq(&wqe->lock);
				182				dropped_lock = true;
				183			}
				184			__set_current_state(TASK_RUNNING);
				185			kthread_unuse_mm(worker->mm);
				186			mmput(worker->mm);
				187			worker->mm = NULL;
				188		}
				189
				190	#ifdef CONFIG_BLK_CGROUP
				191		if (worker->blkcg_css) {
				192			kthread_associate_blkcg(NULL);
				193			worker->blkcg_css = NULL;
				194		}
				195	#endif
				196		if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
				197			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
				198		return dropped_lock;
				199	}
				200
					/*
					 * Pick the accounting class for @work: unbound if the work carries
					 * IO_WQ_WORK_UNBOUND, bound otherwise.
					 */
				201	static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
				202							   struct io_wq_work *work)
				203	{
				204		if (work->flags & IO_WQ_WORK_UNBOUND)
				205			return &wqe->acct[IO_WQ_ACCT_UNBOUND];
				206
				207		return &wqe->acct[IO_WQ_ACCT_BOUND];
				208	}
				209
					/*
					 * Pick the accounting class @worker currently belongs to, based on its
					 * IO_WORKER_F_BOUND flag.
					 */
				210	static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
				211							  struct io_worker *worker)
				212	{
				213		if (worker->flags & IO_WORKER_F_BOUND)
				214			return &wqe->acct[IO_WQ_ACCT_BOUND];
				215
				216		return &wqe->acct[IO_WQ_ACCT_UNBOUND];
				217	}
				218
					/*
					 * Tear down the calling worker: wait for outstanding references, undo
					 * the running/process accounting, unlink it from the wqe lists, restore
					 * its own task context and free it after an RCU grace period. Drops the
					 * worker's reference on wq->refs and completes wq->done if it was the
					 * last one.
					 */
				219	static void io_worker_exit(struct io_worker *worker)
				220	{
				221	struct io_wqe *wqe = worker->wqe;
				222	struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
				223
				224	/*
				225	 * If we're not at zero, someone else is holding a brief reference
				226	 * to the worker. Wait for that to go away.
				227	 */
				228	set_current_state(TASK_INTERRUPTIBLE);
				229	if (!refcount_dec_and_test(&worker->ref))
				230	schedule();
				231	__set_current_state(TASK_RUNNING);
				232
				233	preempt_disable();
				234	current->flags &= ~PF_IO_WORKER;
				235	if (worker->flags & IO_WORKER_F_RUNNING)
				236	atomic_dec(&acct->nr_running);
				237	if (!(worker->flags & IO_WORKER_F_BOUND))
				238	atomic_dec(&wqe->wq->user->processes);
				239	worker->flags = 0;
				240	preempt_enable();
				241
				242	raw_spin_lock_irq(&wqe->lock);
				243	hlist_nulls_del_rcu(&worker->nulls_node);
				244	list_del_rcu(&worker->all_list);
					/* __io_worker_unuse() returns true if it dropped wqe->lock; retake it */
				245	if (__io_worker_unuse(wqe, worker)) {
				246	__release(&wqe->lock);
				247	raw_spin_lock_irq(&wqe->lock);
				248	}
				249	acct->nr_workers--;
				250	raw_spin_unlock_irq(&wqe->lock);
				251
				252	kfree_rcu(worker, rcu);
				253	if (refcount_dec_and_test(&wqe->wq->refs))
				254	complete(&wqe->wq->done);
				255	}
				256
					/*
					 * True when the pool has pending work and is not marked as stalled on
					 * hashed work.
					 */
				257	static inline bool io_wqe_run_queue(struct io_wqe *wqe)
				258		__must_hold(wqe->lock)
				259	{
				260		bool pending = !wq_list_empty(&wqe->work_list);
				261		bool stalled = wqe->flags & IO_WQE_FLAG_STALLED;
				262
				263		return pending && !stalled;
				264	}
				265
				266	/*
				267	 * Check head of free list for an available worker. If one isn't available,
				268	 * caller must wake up the wq manager to create one.
					 *
					 * Returns true if a free worker was found and woken, false if the free
					 * list is empty or the head worker's refcount had already hit zero.
				269	 */
				270	static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
				271	__must_hold(RCU)
				272	{
				273	struct hlist_nulls_node *n;
				274	struct io_worker *worker;
				275
				276	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
				277	if (is_a_nulls(n))
				278	return false;
				279
				280	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
					/* hold a reference across the wakeup so worker->task stays usable */
				281	if (io_worker_get(worker)) {
				282	wake_up_process(worker->task);
				283	io_worker_release(worker);
				284	return true;
				285	}
				286
				287	return false;
				288	}
				289
				290	/*
				291	 * We need a worker. If we find a free one, we're good. If not, and we're
				292	 * below the max number of workers, wake up the manager to create one.
				293	 */
				294	static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
				295	{
				296		bool ret;
				297
				298		/*
				299		 * Most likely an attempt to queue unbounded work on an io_wq that
				300		 * wasn't setup with any unbounded workers.
				301		 */
				302		if (unlikely(!acct->max_workers))
				303			pr_warn_once("io-wq is not configured for unbound workers");
				304
				305		rcu_read_lock();
				306		ret = io_wqe_activate_free_worker(wqe);
				307		rcu_read_unlock();
				308
				309		if (!ret && acct->nr_workers < acct->max_workers)
				310			wake_up_process(wqe->wq->manager);
				311	}
				312
					/* Count @worker as running in the accounting class it belongs to */
				313	static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker)
				314	{
				315		struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
				316
				317		atomic_inc(&acct->nr_running);
				318	}
				319
					/*
					 * Stop counting @worker as running. If it was the last running worker of
					 * its class and runnable work is pending, get another worker going.
					 */
				320	static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
				321		__must_hold(wqe->lock)
				322	{
				323		struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
				324
				325		if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
				326			io_wqe_wake_worker(wqe, acct);
				327	}
				328
					/*
					 * One-time setup run by the worker thread itself: allow SIGINT, mark the
					 * task as an io worker, remember its own files/nsproxy/fs so they can be
					 * restored later, and count it as running.
					 */
				329	static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
				330	{
				331		allow_kernel_signal(SIGINT);
				332
				333		current->flags |= PF_IO_WORKER;
				334
				335		worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
				336		worker->restore_files = current->files;
				337		worker->restore_nsproxy = current->nsproxy;
				338		worker->restore_fs = current->fs;
				339		io_wqe_inc_running(wqe, worker);
				340	}
				341
				342	/*
				343	 * Worker will start processing some work. Move it to the busy list, if
				344	 * it's currently on the freelist
				345	 */
				346	static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
				347				     struct io_wq_work *work)
				348		__must_hold(wqe->lock)
				349	{
				350		bool worker_bound, work_bound;
				351
				352		if (worker->flags & IO_WORKER_F_FREE) {
				353			worker->flags &= ~IO_WORKER_F_FREE;
				354			hlist_nulls_del_init_rcu(&worker->nulls_node);
				355		}
				356
				357		/*
				358		 * If worker is moving from bound to unbound (or vice versa), then
				359		 * ensure we update the running accounting.
				360		 */
				361		worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
				362		work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
				363		if (worker_bound != work_bound) {
						/* leave the old class before flipping the flag, join the new one after */
				364			io_wqe_dec_running(wqe, worker);
				365			if (work_bound) {
				366				worker->flags |= IO_WORKER_F_BOUND;
				367				wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
				368				wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
				369				atomic_dec(&wqe->wq->user->processes);
				370			} else {
				371				worker->flags &= ~IO_WORKER_F_BOUND;
				372				wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
				373				wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
				374				atomic_inc(&wqe->wq->user->processes);
				375			}
				376			io_wqe_inc_running(wqe, worker);
				377		}
				378	}
				379
				380	/*
				381	 * No work, worker going to sleep. Move to freelist, and unuse mm if we
				382	 * have one attached. Dropping the mm may potentially sleep, so we drop
				383	 * the lock in that case and return success. Since the caller has to
				384	 * retry the loop in that case (we changed task state), we don't regrab
				385	 * the lock if we return success.
				386	 */
				387	static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
				388		__must_hold(wqe->lock)
				389	{
				390		if (!(worker->flags & IO_WORKER_F_FREE)) {
				391			worker->flags |= IO_WORKER_F_FREE;
				392			hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
				393		}
				394
				395		return __io_worker_unuse(wqe, worker);
				396	}
				397
					/* Extract the hash bucket stored in the upper bits of work->flags */
				398	static inline unsigned int io_get_work_hash(struct io_wq_work *work)
				399	{
				400	return work->flags >> IO_WQ_HASH_SHIFT;
				401	}
				402
					/*
					 * Dequeue the next runnable work item from the pool. Unhashed work can
					 * always run. Hashed work can only run if no other worker currently owns
					 * its hash bucket; in that case the bucket is claimed in hash_map and the
					 * whole run of same-hash items is cut from the list, with the first item
					 * returned. Returns NULL if nothing can run right now.
					 */
				403	static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
				404		__must_hold(wqe->lock)
				405	{
				406		struct io_wq_work_node *node, *prev;
				407		struct io_wq_work *work, *tail;
				408		unsigned int hash;
				409
				410		wq_list_for_each(node, prev, &wqe->work_list) {
				411			work = container_of(node, struct io_wq_work, list);
				412
				413			/* not hashed, can run anytime */
				414			if (!io_wq_is_hashed(work)) {
				415				wq_list_del(&wqe->work_list, node, prev);
				416				return work;
				417			}
				418
				419			/* hashed, can run if not already running */
				420			hash = io_get_work_hash(work);
				421			if (!(wqe->hash_map & BIT(hash))) {
				422				wqe->hash_map |= BIT(hash);
				423				/* all items with this hash lie in [work, tail] */
				424				tail = wqe->hash_tail[hash];
				425				wqe->hash_tail[hash] = NULL;
				426				wq_list_cut(&wqe->work_list, &tail->list, prev);
				427				return work;
				428			}
				429		}
				430
				431		return NULL;
				432	}
				433
					/*
					 * Make the worker run on the mm of @work's identity. Any mm the worker
					 * was using before is dropped first. If a reference on the new mm cannot
					 * be taken, the work is flagged IO_WQ_WORK_CANCEL instead.
					 */
				434	static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
				435	{
				436		if (worker->mm) {
				437			kthread_unuse_mm(worker->mm);
				438			mmput(worker->mm);
				439			worker->mm = NULL;
				440		}
				441
				442		if (mmget_not_zero(work->identity->mm)) {
				443			kthread_use_mm(work->identity->mm);
				444			worker->mm = work->identity->mm;
				445			return;
				446		}
				447
				448		/* failed grabbing mm, ensure work gets cancelled */
				449		work->flags |= IO_WQ_WORK_CANCEL;
				450	}
				451
					/*
					 * Associate the worker thread with the blkcg of @work's identity when the
					 * work asks for it (IO_WQ_WORK_BLKCG) and it differs from the one the
					 * worker already has. Compiles to nothing without CONFIG_BLK_CGROUP.
					 */
				452	static inline void io_wq_switch_blkcg(struct io_worker *worker,
				453					      struct io_wq_work *work)
				454	{
				455	#ifdef CONFIG_BLK_CGROUP
				456		if ((work->flags & IO_WQ_WORK_BLKCG) &&
				457		    worker->blkcg_css != work->identity->blkcg_css) {
				458			kthread_associate_blkcg(work->identity->blkcg_css);
				459			worker->blkcg_css = work->identity->blkcg_css;
				460		}
				462	#endif
				463	}
				464
					/*
					 * Override the worker's credentials with those of @work's identity. The
					 * credentials in place before the first override are remembered in
					 * ->saved_creds; on later switches the reference returned by
					 * override_creds() is dropped instead.
					 */
				465	static void io_wq_switch_creds(struct io_worker *worker,
				466				       struct io_wq_work *work)
				467	{
				468		const struct cred *prev_creds = override_creds(work->identity->creds);
				469
				470		worker->cur_creds = work->identity->creds;
				471		if (!worker->saved_creds) {
				472			worker->saved_creds = prev_creds;
				473			return;
				474		}
						/* creds set by previous switch */
						put_cred(prev_creds);
				475	}
				476
					/*
					 * Take on the parts of the submitter's context that @work asks for via
					 * its flags: files/nsproxy, fs, mm, credentials, RLIMIT_FSIZE and blkcg.
					 * With CONFIG_AUDIT the audit login uid and session id are copied too.
					 */
				477	static void io_impersonate_work(struct io_worker *worker,
				478					struct io_wq_work *work)
				479	{
				480		if ((work->flags & IO_WQ_WORK_FILES) &&
				481		    current->files != work->identity->files) {
				482			task_lock(current);
				483			current->files = work->identity->files;
				484			current->nsproxy = work->identity->nsproxy;
				485			task_unlock(current);
				486			if (!work->identity->files) {
				487				/* failed grabbing files, ensure work gets cancelled */
				488				work->flags |= IO_WQ_WORK_CANCEL;
				489			}
				490		}
				491		if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
				492			current->fs = work->identity->fs;
				493		if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
				494			io_wq_switch_mm(worker, work);
				495		if ((work->flags & IO_WQ_WORK_CREDS) &&
				496		    worker->cur_creds != work->identity->creds)
				497			io_wq_switch_creds(worker, work);
				498		if (work->flags & IO_WQ_WORK_FSIZE)
				499			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
				500		else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
				501			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
				502		io_wq_switch_blkcg(worker, work);
				503	#ifdef CONFIG_AUDIT
				504		current->loginuid = work->identity->loginuid;
				505		current->sessionid = work->identity->sessionid;
				506	#endif
				507	}
				508
					/*
					 * Publish @work (may be NULL) as the worker's current work item. The
					 * store is done under worker->lock, the same lock io_wq_worker_cancel()
					 * takes before looking at ->cur_work.
					 */
				509	static void io_assign_current_work(struct io_worker *worker,
				510	struct io_wq_work *work)
				511	{
				512	if (work) {
				513	/* flush pending signals before assigning new work */
				514	if (signal_pending(current))
				515	flush_signals(current);
				516	cond_resched();
				517	}
				518
					/* reset audit ids; io_impersonate_work() sets them per work item */
				519	#ifdef CONFIG_AUDIT
				520	current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
				521	current->sessionid = AUDIT_SID_UNSET;
				522	#endif
				523
				524	spin_lock_irq(&worker->lock);
				525	worker->cur_work = work;
				526	spin_unlock_irq(&worker->lock);
				527	}
				528
				529	static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
				530
					/*
					 * Pull work off the pool and run it until nothing runnable is left.
					 * Entered with wqe->lock held, returns with it released. For hashed work
					 * the whole chain cut by io_get_next_work() is run back to back; linked
					 * work returned by ->do_work() is either run directly (if unhashed and
					 * the chain is done) or re-queued.
					 */
				531	static void io_worker_handle_work(struct io_worker *worker)
				532		__releases(wqe->lock)
				533	{
				534		struct io_wqe *wqe = worker->wqe;
				535		struct io_wq *wq = wqe->wq;
				536
				537		do {
				538			struct io_wq_work *work;
				539	get_next:
				540			/*
				541			 * If we got some work, mark us as busy. If we didn't, but
				542			 * the list isn't empty, it means we stalled on hashed work.
				543			 * Mark us stalled so we don't keep looking for work when we
				544			 * can't make progress, any work completion or insertion will
				545			 * clear the stalled flag.
				546			 */
				547			work = io_get_next_work(wqe);
				548			if (work)
				549				__io_worker_busy(wqe, worker, work);
				550			else if (!wq_list_empty(&wqe->work_list))
				551				wqe->flags |= IO_WQE_FLAG_STALLED;
				552
				553			raw_spin_unlock_irq(&wqe->lock);
				554			if (!work)
				555				break;
				556			io_assign_current_work(worker, work);
				557
				558			/* handle a whole dependent link */
				559			do {
				560				struct io_wq_work *old_work, *next_hashed, *linked;
							/*
							 * NOTE(review): taken for unhashed work too; whether this can
							 * ever equal -1U depends on IO_WQ_HASH_SHIFT, which is defined
							 * outside this file - confirm the check below.
							 */
				561				unsigned int hash = io_get_work_hash(work);
				562
				563				next_hashed = wq_next_work(work);
				564				io_impersonate_work(worker, work);
				565				/*
				566				 * OK to set IO_WQ_WORK_CANCEL even for uncancellable
				567				 * work, the worker function will do the right thing.
				568				 */
				569				if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
				570					work->flags |= IO_WQ_WORK_CANCEL;
				571
				572				old_work = work;
				573				linked = wq->do_work(work);
				574
				575				work = next_hashed;
				576				if (!work && linked && !io_wq_is_hashed(linked)) {
				577					work = linked;
				578					linked = NULL;
				579				}
							/* publish the next item before the old one is freed */
				580				io_assign_current_work(worker, work);
				581				wq->free_work(old_work);
				582
				583				if (linked)
				584					io_wqe_enqueue(wqe, linked);
				585
				586				if (hash != -1U && !next_hashed) {
				587					raw_spin_lock_irq(&wqe->lock);
				588					wqe->hash_map &= ~BIT_ULL(hash);
				589					wqe->flags &= ~IO_WQE_FLAG_STALLED;
				590					/* skip unnecessary unlock-lock wqe->lock */
				591					if (!work)
				592						goto get_next;
				593					raw_spin_unlock_irq(&wqe->lock);
				594				}
				595			} while (work);
				596
				597			raw_spin_lock_irq(&wqe->lock);
				598		} while (1);
				599	}
				600
					/*
					 * Thread function of a worker. Loops until the wq is exiting: runs
					 * pending work, otherwise goes idle and sleeps for up to
					 * WORKER_IDLE_TIMEOUT. A worker whose sleep times out exits unless it is
					 * the fixed worker. On wq exit any work still queued is run before the
					 * worker tears itself down. Always returns 0.
					 */
				601	static int io_wqe_worker(void *data)
				602	{
				603		struct io_worker *worker = data;
				604		struct io_wqe *wqe = worker->wqe;
				605		struct io_wq *wq = wqe->wq;
				606
				607		io_worker_start(wqe, worker);
				608
				609		while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
				610			set_current_state(TASK_INTERRUPTIBLE);
				611	loop:
				612			raw_spin_lock_irq(&wqe->lock);
				613			if (io_wqe_run_queue(wqe)) {
				614				__set_current_state(TASK_RUNNING);
							/* returns with wqe->lock released */
				615				io_worker_handle_work(worker);
				616				goto loop;
				617			}
				618			/* drops the lock on success, retry */
				619			if (__io_worker_idle(wqe, worker)) {
				620				__release(&wqe->lock);
				621				goto loop;
				622			}
				623			raw_spin_unlock_irq(&wqe->lock);
				624			if (signal_pending(current))
				625				flush_signals(current);
				626			if (schedule_timeout(WORKER_IDLE_TIMEOUT))
				627				continue;
				628			/* timed out, exit unless we're the fixed worker */
				629			if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
				630			    !(worker->flags & IO_WORKER_F_FIXED))
				631				break;
				632		}
				633
				634		if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
				635			raw_spin_lock_irq(&wqe->lock);
				636			if (!wq_list_empty(&wqe->work_list))
				637				io_worker_handle_work(worker);
				638			else
				639				raw_spin_unlock_irq(&wqe->lock);
				640		}
				641
				642		io_worker_exit(worker);
				643		return 0;
				644	}
				645
				646	/*
				647	 * Called when a worker is scheduled in. Mark us as currently running.
					 * Does nothing if the worker is not up yet or is already accounted as
					 * running.
				648	 */
				649	void io_wq_worker_running(struct task_struct *tsk)
				650	{
				651		struct io_worker *worker = kthread_data(tsk);
				652		struct io_wqe *wqe = worker->wqe;
				653
				654		if (!(worker->flags & IO_WORKER_F_UP))
				655			return;
				656		if (worker->flags & IO_WORKER_F_RUNNING)
				657			return;
				658		worker->flags |= IO_WORKER_F_RUNNING;
				659		io_wqe_inc_running(wqe, worker);
				660	}
				661
				662	/*
				663	 * Called when worker is going to sleep. If there are no workers currently
				664	 * running and we have work pending, wake up a free one or have the manager
				665	 * set one up.
					 * Does nothing if the worker is not up or is not accounted as running.
				666	 */
				667	void io_wq_worker_sleeping(struct task_struct *tsk)
				668	{
				669	struct io_worker *worker = kthread_data(tsk);
				670	struct io_wqe *wqe = worker->wqe;
				671
				672	if (!(worker->flags & IO_WORKER_F_UP))
				673	return;
				674	if (!(worker->flags & IO_WORKER_F_RUNNING))
				675	return;
				676
				677	worker->flags &= ~IO_WORKER_F_RUNNING;
				678
					/* io_wqe_dec_running() inspects the work list, so it needs wqe->lock */
				679	raw_spin_lock_irq(&wqe->lock);
				680	io_wqe_dec_running(wqe, worker);
				681	raw_spin_unlock_irq(&wqe->lock);
				682	}
				683
					/*
					 * Create and start one worker for @wqe in accounting class @index
					 * (IO_WQ_ACCT_BOUND or IO_WQ_ACCT_UNBOUND). The thread is bound to the
					 * CPUs of the wqe's node and starts out on the free list. The first
					 * bound worker of a pool is marked as the fixed worker. Takes a
					 * reference on wq->refs for the new worker.
					 *
					 * Returns false if the worker or its thread could not be allocated.
					 */
				684	static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
				685	{
				686		struct io_wqe_acct *acct = &wqe->acct[index];
				687		struct io_worker *worker;
				688
				689		worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
				690		if (!worker)
				691			return false;
				692
				693		refcount_set(&worker->ref, 1);
				694		worker->nulls_node.pprev = NULL;
				695		worker->wqe = wqe;
				696		spin_lock_init(&worker->lock);
				697
				698		worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
				699					"io_wqe_worker-%d/%d", index, wqe->node);
				700		if (IS_ERR(worker->task)) {
				701			kfree(worker);
				702			return false;
				703		}
				704		kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
				705
				706		raw_spin_lock_irq(&wqe->lock);
				707		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
				708		list_add_tail_rcu(&worker->all_list, &wqe->all_list);
				709		worker->flags |= IO_WORKER_F_FREE;
				710		if (index == IO_WQ_ACCT_BOUND)
				711			worker->flags |= IO_WORKER_F_BOUND;
				712		if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
				713			worker->flags |= IO_WORKER_F_FIXED;
				714		acct->nr_workers++;
				715		raw_spin_unlock_irq(&wqe->lock);
				716
				717		if (index == IO_WQ_ACCT_UNBOUND)
				718			atomic_inc(&wq->user->processes);
				719
				720		refcount_inc(&wq->refs);
				721		wake_up_process(worker->task);
				722		return true;
				723	}
				724
					/*
					 * Decide whether the manager should create another worker of class
					 * @index: only when no worker is free, runnable work is pending and the
					 * class is still below its max_workers limit.
					 */
				725	static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
				726		__must_hold(wqe->lock)
				727	{
				728		struct io_wqe_acct *acct = &wqe->acct[index];
				729
				730		/* if we have available workers or no work, no need */
				731		if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
				732			return false;
				733		return acct->nr_workers < acct->max_workers;
				734	}
				735
					/*
					 * io_wq_for_each_worker() callback: send SIGINT to the worker's task.
					 * @data is unused. Returns false so the iteration continues.
					 */
				736	static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
				737	{
				738		send_sig(SIGINT, worker->task, 1);
				739		return false;
				740	}
				741
				742	/*
				743	 * Iterate the passed in list and call the specific function for each
				744	 * worker that isn't exiting
					 *
					 * The walk uses list_for_each_entry_rcu(), and the callers in this file
					 * hold rcu_read_lock() around it. A reference is held on each worker
					 * while @func runs. Iteration stops as soon as @func returns true; that
					 * value is returned (false if no callback returned true).
				745	 */
				746	static bool io_wq_for_each_worker(struct io_wqe *wqe,
				747					  bool (*func)(struct io_worker *, void *),
				748					  void *data)
				749	{
				750		struct io_worker *worker;
				751		bool ret = false;
				752
				753		list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
				754			if (io_worker_get(worker)) {
				755				/* no task if node is/was offline */
				756				if (worker->task)
				757					ret = func(worker, data);
				758				io_worker_release(worker);
				759				if (ret)
				760					break;
				761			}
				762		}
				763
				764		return ret;
				765	}
				766
					/*
					 * io_wq_for_each_worker() callback: wake the worker's task. @data is
					 * unused. Returns false so the iteration continues.
					 */
				767	static bool io_wq_worker_wake(struct io_worker *worker, void *data)
				768	{
				769		wake_up_process(worker->task);
				770		return false;
				771	}
				772
				773	/*
				774	 * Manager thread. Tasked with creating new workers, if we need them.
					 *
					 * First creates one bound worker per online node; if that fails the
					 * ERROR and EXIT bits are set and the thread bails out. Otherwise it
					 * completes wq->done and then, about once a second, checks every online
					 * node and creates bound/unbound workers where io_wqe_need_worker() says
					 * so, until asked to stop. Always returns 0.
				775	 */
				776	static int io_wq_manager(void *data)
				777	{
				778	struct io_wq *wq = data;
				779	int node;
				780
				781	/* create fixed workers */
				782	refcount_set(&wq->refs, 1);
				783	for_each_node(node) {
				784	if (!node_online(node))
				785	continue;
				786	if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
				787	continue;
				788	set_bit(IO_WQ_BIT_ERROR, &wq->state);
				789	set_bit(IO_WQ_BIT_EXIT, &wq->state);
				790	goto out;
				791	}
				792
				793	complete(&wq->done);
				794
				795	while (!kthread_should_stop()) {
				796	if (current->task_works)
				797	task_work_run();
				798
				799	for_each_node(node) {
				800	struct io_wqe *wqe = wq->wqes[node];
				801	bool fork_worker[2] = { false, false };
				802
				803	if (!node_online(node))
				804	continue;
				805
					/* decide under the lock, create after dropping it */
				806	raw_spin_lock_irq(&wqe->lock);
				807	if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
				808	fork_worker[IO_WQ_ACCT_BOUND] = true;
				809	if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
				810	fork_worker[IO_WQ_ACCT_UNBOUND] = true;
				811	raw_spin_unlock_irq(&wqe->lock);
				812	if (fork_worker[IO_WQ_ACCT_BOUND])
				813	create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
				814	if (fork_worker[IO_WQ_ACCT_UNBOUND])
				815	create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
				816	}
				817	set_current_state(TASK_INTERRUPTIBLE);
				818	schedule_timeout(HZ);
				819	}
				820
				821	if (current->task_works)
				822	task_work_run();
				823
				824	out:
					/* drop the manager's own reference; the last holder completes wq->done */
				825	if (refcount_dec_and_test(&wq->refs)) {
				826	complete(&wq->done);
				827	return 0;
				828	}
				829	/* if ERROR is set and we get here, we have workers to wake */
				830	if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
				831	rcu_read_lock();
				832	for_each_node(node)
				833	io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
				834	rcu_read_unlock();
				835	}
				836	return 0;
				837	}
				838
					/*
					 * Early admission check for @work. Bound work is always accepted.
					 * Unbound work is accepted if a worker of its class is running or a free
					 * worker exists. Otherwise it is refused when the user's process count
					 * has reached acct->max_workers and the caller has neither
					 * CAP_SYS_RESOURCE nor CAP_SYS_ADMIN.
					 */
				839	static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
				840				    struct io_wq_work *work)
				841	{
				842		bool free_worker;
				843
				844		if (!(work->flags & IO_WQ_WORK_UNBOUND))
				845			return true;
				846		if (atomic_read(&acct->nr_running))
				847			return true;
				848
				849		rcu_read_lock();
				850		free_worker = !hlist_nulls_empty(&wqe->free_list);
				851		rcu_read_unlock();
				852		if (free_worker)
				853			return true;
				854
				855		if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
				856		    !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
				857			return false;
				858
				859		return true;
				860	}
				861
					/*
					 * Run @work, and every linked work ->do_work() hands back, with
					 * IO_WQ_WORK_CANCEL set, freeing each item after it has run.
					 */
				862	static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
				863	{
				864		struct io_wq *wq = wqe->wq;
				865
				866		do {
				867			struct io_wq_work *old_work = work;
				868
				869			work->flags |= IO_WQ_WORK_CANCEL;
				870			work = wq->do_work(work);
				871			wq->free_work(old_work);
				872		} while (work);
				873	}
				874
					/*
					 * Add @work to the pending list. Unhashed work goes to the tail. Hashed
					 * work is placed right after the last pending item of the same hash
					 * bucket (or at the tail if there is none) and becomes the new
					 * hash_tail[] for that bucket.
					 */
				875	static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
				876	{
				877		unsigned int hash;
				878		struct io_wq_work *tail;
				879
				880		if (!io_wq_is_hashed(work)) {
				881	append:
				882			wq_list_add_tail(&work->list, &wqe->work_list);
				883			return;
				884		}
				885
				886		hash = io_get_work_hash(work);
				887		tail = wqe->hash_tail[hash];
				888		wqe->hash_tail[hash] = work;
				889		if (!tail)
				890			goto append;
				891
				892		wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
				893	}
				894
					/*
					 * Queue @work on @wqe. Work that fails the admission check is run in
					 * cancelled state right away instead of being queued. After queueing, a
					 * worker is woken if the work is marked IO_WQ_WORK_CONCURRENT or no
					 * worker of its class is currently running.
					 */
				895	static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
				896	{
				897		struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
				898		bool do_wake;
				899		unsigned long flags;
				900
				901		/*
				902		 * Do early check to see if we need a new unbound worker, and if we do,
				903		 * if we're allowed to do so. This isn't 100% accurate as there's a
				904		 * gap between this check and incrementing the value, but that's OK.
				905		 * It's close enough to not be an issue, fork() has the same delay.
				906		 */
				907		if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
				908			io_run_cancel(work, wqe);
				909			return;
				910		}
				911
				912		raw_spin_lock_irqsave(&wqe->lock, flags);
				913		io_wqe_insert_work(wqe, work);
				914		wqe->flags &= ~IO_WQE_FLAG_STALLED;
				915		do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
				916				!atomic_read(&acct->nr_running);
				917		raw_spin_unlock_irqrestore(&wqe->lock, flags);
				918
				919		if (do_wake)
				920			io_wqe_wake_worker(wqe, acct);
				921	}
				922
					/* Queue @work on the pool of the NUMA node the caller is running on */
				923	void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
				924	{
				925		struct io_wqe *wqe = wq->wqes[numa_node_id()];
				926
				927		io_wqe_enqueue(wqe, work);
				928	}
				929
				930	/*
				931	 * Work items that hash to the same value will not be done in parallel.
				932	 * Used to limit concurrent writes, generally hashed by inode.
					 *
					 * Marks @work as hashed and stores the bucket derived from @val in the
					 * bits of work->flags above IO_WQ_HASH_SHIFT.
				933	 */
				934	void io_wq_hash_work(struct io_wq_work *work, void *val)
				935	{
				936		unsigned int bit;
				937
				938		bit = hash_ptr(val, IO_WQ_HASH_ORDER);
				939		work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
				940	}
				941
					/*
					 * Put the whole wq into cancel mode and send SIGINT to every worker on
					 * every node.
					 */
				942	void io_wq_cancel_all(struct io_wq *wq)
				943	{
				944		int nid;
				945
				946		/* work picked up from now on is run with IO_WQ_WORK_CANCEL set */
				947		set_bit(IO_WQ_BIT_CANCEL, &wq->state);
				948
				949		/* the worker lists are walked under RCU */
				950		rcu_read_lock();
				951		for_each_node(nid)
				952			io_wq_for_each_worker(wq->wqes[nid],
				953					      io_wqe_worker_send_sig, NULL);
				954		rcu_read_unlock();
				955	}
				956
					/* Match criteria and result counters for one cancellation request */
				957	struct io_cb_cancel_data {
				958	work_cancel_fn *fn; /* returns true for work that should be cancelled */
				959	void *data; /* opaque argument passed to ->fn */
				960	int nr_running; /* matches found among currently running work */
				961	int nr_pending; /* matches removed from the pending list */
				962	bool cancel_all; /* keep going after the first match */
				963	};
				964
					/*
					 * io_wq_for_each_worker() callback: if the worker's current work is
					 * cancellable and matches, signal the worker with SIGINT and count the
					 * match. Returns true, stopping the iteration, once something matched
					 * and the request is not cancel_all.
					 */
				965	static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
				966	{
				967		struct io_cb_cancel_data *match = data;
				968		unsigned long flags;
				969
				970		/*
				971		 * Hold the lock to avoid ->cur_work going out of scope, caller
				972		 * may dereference the passed in work.
				973		 */
				974		spin_lock_irqsave(&worker->lock, flags);
				975		if (worker->cur_work &&
				976		    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
				977		    match->fn(worker->cur_work, match->data)) {
				978			send_sig(SIGINT, worker->task, 1);
				979			match->nr_running++;
				980		}
				981		spin_unlock_irqrestore(&worker->lock, flags);
				982
				983		return match->nr_running && !match->cancel_all;
				984	}
				985
					/*
					 * Unlink pending @work from the list; @prev is its predecessor node or
					 * NULL. If @work was the recorded tail of its hash bucket, the tail moves
					 * back to the predecessor when that has the same hash, and is cleared
					 * otherwise.
					 */
				986	static inline void io_wqe_remove_pending(struct io_wqe *wqe,
				987						 struct io_wq_work *work,
				988						 struct io_wq_work_node *prev)
				989	{
				990		unsigned int bucket = io_get_work_hash(work);
				991
				992		if (io_wq_is_hashed(work) && wqe->hash_tail[bucket] == work) {
				993			struct io_wq_work *new_tail = NULL;
				994
				995			if (prev) {
				996				new_tail = container_of(prev, struct io_wq_work, list);
				997				if (io_get_work_hash(new_tail) != bucket)
				998					new_tail = NULL;
				999			}
				1000			wqe->hash_tail[bucket] = new_tail;
						}
				1001		wq_list_del(&wqe->work_list, &work->list, prev);
				1002	}
				1003
				1004	static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
				1005	struct io_cb_cancel_data *match)
				1006	{
				1007	struct io_wq_work_node node, prev;
				1008	struct io_wq_work *work;
				1009	unsigned long flags;
				1010
				1011	retry:
				1012	raw_spin_lock_irqsave(&wqe->lock, flags);
				1013	wq_list_for_each(node, prev, &wqe->work_list) {
				1014	work = container_of(node, struct io_wq_work, list);
				1015	if (!match->fn(work, match->data))
				1016	continue;
				1017	io_wqe_remove_pending(wqe, work, prev);
				1018	raw_spin_unlock_irqrestore(&wqe->lock, flags);
				1019	io_run_cancel(work, wqe);
				1020	match->nr_pending++;
				1021	if (!match->cancel_all)
				1022	return;
				1023
				1024	/* not safe to continue after unlock */
				1025	goto retry;
				1026	}
				1027	raw_spin_unlock_irqrestore(&wqe->lock, flags);
				1028	}
				1029
				1030	static void io_wqe_cancel_running_work(struct io_wqe *wqe,
				1031	struct io_cb_cancel_data *match)
				1032	{
				1033	rcu_read_lock();
				1034	io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
				1035	rcu_read_unlock();
				1036	}
				1037
				1038	enum io_wq_cancel io_wq_cancel_cb(struct io_wq wq, work_cancel_fn cancel,
				1039	void *data, bool cancel_all)
				1040	{
				1041	struct io_cb_cancel_data match = {
				1042	.fn = cancel,
				1043	.data = data,
				1044	.cancel_all = cancel_all,
				1045	};
				1046	int node;
				1047
				1048	/*
				1049	* First check pending list, if we're lucky we can just remove it
				1050	* from there. CANCEL_OK means that the work is returned as-new,
				1051	* no completion will be posted for it.
				1052	*/
				1053	for_each_node(node) {
				1054	struct io_wqe *wqe = wq->wqes[node];
				1055
				1056	io_wqe_cancel_pending_work(wqe, &match);
				1057	if (match.nr_pending && !match.cancel_all)
				1058	return IO_WQ_CANCEL_OK;
				1059	}
				1060
				1061	/*
				1062	* Now check if a free (going busy) or busy worker has the work
				1063	* currently running. If we find it there, we'll return CANCEL_RUNNING
				1064	* as an indication that we attempt to signal cancellation. The
				1065	* completion will run normally in this case.
				1066	*/
				1067	for_each_node(node) {
				1068	struct io_wqe *wqe = wq->wqes[node];
				1069
				1070	io_wqe_cancel_running_work(wqe, &match);
				1071	if (match.nr_running && !match.cancel_all)
				1072	return IO_WQ_CANCEL_RUNNING;
				1073	}
				1074
				1075	if (match.nr_running)
				1076	return IO_WQ_CANCEL_RUNNING;
				1077	if (match.nr_pending)
				1078	return IO_WQ_CANCEL_OK;
				1079	return IO_WQ_CANCEL_NOTFOUND;
				1080	}
				1081
				1082	struct io_wq io_wq_create(unsigned bounded, struct io_wq_data data)
				1083	{
				1084	int ret = -ENOMEM, node;
				1085	struct io_wq *wq;
				1086
				1087	if (WARN_ON_ONCE(!data->free_work \|\| !data->do_work))
				1088	return ERR_PTR(-EINVAL);
				1089	if (WARN_ON_ONCE(!bounded))
				1090	return ERR_PTR(-EINVAL);
				1091
				1092	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
				1093	if (!wq)
				1094	return ERR_PTR(-ENOMEM);
				1095
				1096	wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
				1097	if (!wq->wqes)
				1098	goto err_wq;
				1099
				1100	ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
				1101	if (ret)
				1102	goto err_wqes;
				1103
				1104	wq->free_work = data->free_work;
				1105	wq->do_work = data->do_work;
				1106
				1107	/* caller must already hold a reference to this */
				1108	wq->user = data->user;
				1109
				1110	ret = -ENOMEM;
				1111	for_each_node(node) {
				1112	struct io_wqe *wqe;
				1113	int alloc_node = node;
				1114
				1115	if (!node_online(alloc_node))
				1116	alloc_node = NUMA_NO_NODE;
				1117	wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
				1118	if (!wqe)
				1119	goto err;
				1120	wq->wqes[node] = wqe;
				1121	wqe->node = alloc_node;
				1122	wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
				1123	atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
				1124	if (wq->user) {
				1125	wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
				1126	task_rlimit(current, RLIMIT_NPROC);
				1127	}
				1128	atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
				1129	wqe->wq = wq;
				1130	raw_spin_lock_init(&wqe->lock);
				1131	INIT_WQ_LIST(&wqe->work_list);
				1132	INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
				1133	INIT_LIST_HEAD(&wqe->all_list);
				1134	}
				1135
				1136	init_completion(&wq->done);
				1137
				1138	wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
				1139	if (!IS_ERR(wq->manager)) {
				1140	wake_up_process(wq->manager);
				1141	wait_for_completion(&wq->done);
				1142	if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
				1143	ret = -ENOMEM;
				1144	goto err;
				1145	}
				1146	refcount_set(&wq->use_refs, 1);
				1147	reinit_completion(&wq->done);
				1148	return wq;
				1149	}
				1150
				1151	ret = PTR_ERR(wq->manager);
				1152	complete(&wq->done);
				1153	err:
				1154	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
				1155	for_each_node(node)
				1156	kfree(wq->wqes[node]);
				1157	err_wqes:
				1158	kfree(wq->wqes);
				1159	err_wq:
				1160	kfree(wq);
				1161	return ERR_PTR(ret);
				1162	}
				1163
				1164	bool io_wq_get(struct io_wq wq, struct io_wq_data data)
				1165	{
				1166	if (data->free_work != wq->free_work \|\| data->do_work != wq->do_work)
				1167	return false;
				1168
				1169	return refcount_inc_not_zero(&wq->use_refs);
				1170	}
				1171
				1172	static void __io_wq_destroy(struct io_wq *wq)
				1173	{
				1174	int node;
				1175
				1176	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
				1177
				1178	set_bit(IO_WQ_BIT_EXIT, &wq->state);
				1179	if (wq->manager)
				1180	kthread_stop(wq->manager);
				1181
				1182	rcu_read_lock();
				1183	for_each_node(node)
				1184	io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
				1185	rcu_read_unlock();
				1186
				1187	wait_for_completion(&wq->done);
				1188
				1189	for_each_node(node)
				1190	kfree(wq->wqes[node]);
				1191	kfree(wq->wqes);
				1192	kfree(wq);
				1193	}
				1194
				1195	void io_wq_destroy(struct io_wq *wq)
				1196	{
				1197	if (refcount_dec_and_test(&wq->use_refs))
				1198	__io_wq_destroy(wq);
				1199	}
				1200
				1201	struct task_struct io_wq_get_task(struct io_wq wq)
				1202	{
				1203	return wq->manager;
				1204	}
				1205
				1206	static bool io_wq_worker_affinity(struct io_worker worker, void data)
				1207	{
				1208	struct task_struct *task = worker->task;
				1209	struct rq_flags rf;
				1210	struct rq *rq;
				1211
				1212	rq = task_rq_lock(task, &rf);
				1213	do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node));
				1214	task->flags \|= PF_NO_SETAFFINITY;
				1215	task_rq_unlock(rq, task, &rf);
				1216	return false;
				1217	}
				1218
				1219	static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
				1220	{
				1221	struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
				1222	int i;
				1223
				1224	rcu_read_lock();
				1225	for_each_node(i)
				1226	io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL);
				1227	rcu_read_unlock();
				1228	return 0;
				1229	}
				1230
				1231	static __init int io_wq_init(void)
				1232	{
				1233	int ret;
				1234
				1235	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
				1236	io_wq_cpu_online, NULL);
				1237	if (ret < 0)
				1238	return ret;
				1239	io_wq_online = ret;
				1240	return 0;
				1241	}
				1242	subsys_initcall(io_wq_init);