Blame - fs/inode.c - hafnium/third_party/linux

blob: 638d5d5bf42df341b0b021ea6abfb47f45992a84 [file] [log] [blame]

David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2	/*
				3	* (C) 1997 Linus Torvalds
				4	* (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
				5	*/
				6	#include <linux/export.h>
				7	#include <linux/fs.h>
				8	#include <linux/mm.h>
				9	#include <linux/backing-dev.h>
				10	#include <linux/hash.h>
				11	#include <linux/swap.h>
				12	#include <linux/security.h>
				13	#include <linux/cdev.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	14	#include <linux/memblock.h>
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	15	#include <linux/fscrypt.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	16	#include <linux/fsnotify.h>
				17	#include <linux/mount.h>
				18	#include <linux/posix_acl.h>
				19	#include <linux/prefetch.h>
				20	#include <linux/buffer_head.h> /* for inode_has_buffers */
				21	#include <linux/ratelimit.h>
				22	#include <linux/list_lru.h>
				23	#include <linux/iversion.h>
				24	#include <trace/events/writeback.h>
				25	#include "internal.h"
				26
				27	/*
				28	* Inode locking rules:
				29	*
				30	* inode->i_lock protects:
				31	* inode->i_state, inode->i_hash, __iget()
				32	* Inode LRU list locks protect:
				33	* inode->i_sb->s_inode_lru, inode->i_lru
				34	* inode->i_sb->s_inode_list_lock protects:
				35	* inode->i_sb->s_inodes, inode->i_sb_list
				36	* bdi->wb.list_lock protects:
				37	* bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
				38	* inode_hash_lock protects:
				39	* inode_hashtable, inode->i_hash
				40	*
				41	* Lock ordering:
				42	*
				43	* inode->i_sb->s_inode_list_lock
				44	* inode->i_lock
				45	* Inode LRU list locks
				46	*
				47	* bdi->wb.list_lock
				48	* inode->i_lock
				49	*
				50	* inode_hash_lock
				51	* inode->i_sb->s_inode_list_lock
				52	* inode->i_lock
				53	*
				54	* iunique_lock
				55	* inode_hash_lock
				56	*/
				57
				58	static unsigned int i_hash_mask __read_mostly;
				59	static unsigned int i_hash_shift __read_mostly;
				60	static struct hlist_head *inode_hashtable __read_mostly;
				61	static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
				62
				63	/*
				64	* Empty aops. Can be used for the cases where the user does not
				65	* define any of the address_space operations.
				66	*/
				67	const struct address_space_operations empty_aops = {
				68	};
				69	EXPORT_SYMBOL(empty_aops);
				70
				71	/*
				72	* Statistics gathering..
				73	*/
				74	struct inodes_stat_t inodes_stat;
				75
				76	static DEFINE_PER_CPU(unsigned long, nr_inodes);
				77	static DEFINE_PER_CPU(unsigned long, nr_unused);
				78
				79	static struct kmem_cache *inode_cachep __read_mostly;
				80
				81	static long get_nr_inodes(void)
				82	{
				83	int i;
				84	long sum = 0;
				85	for_each_possible_cpu(i)
				86	sum += per_cpu(nr_inodes, i);
				87	return sum < 0 ? 0 : sum;
				88	}
				89
				90	static inline long get_nr_inodes_unused(void)
				91	{
				92	int i;
				93	long sum = 0;
				94	for_each_possible_cpu(i)
				95	sum += per_cpu(nr_unused, i);
				96	return sum < 0 ? 0 : sum;
				97	}
				98
				99	long get_nr_dirty_inodes(void)
				100	{
				101	/* not actually dirty inodes, but a wild approximation */
				102	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
				103	return nr_dirty > 0 ? nr_dirty : 0;
				104	}
				105
				106	/*
				107	* Handle nr_inode sysctl
				108	*/
				109	#ifdef CONFIG_SYSCTL
				110	int proc_nr_inodes(struct ctl_table *table, int write,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	111	void buffer, size_t lenp, loff_t *ppos)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	112	{
				113	inodes_stat.nr_inodes = get_nr_inodes();
				114	inodes_stat.nr_unused = get_nr_inodes_unused();
				115	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
				116	}
				117	#endif
				118
				119	static int no_open(struct inode inode, struct file file)
				120	{
				121	return -ENXIO;
				122	}
				123
				124	/**
				125	* inode_init_always - perform inode structure initialisation
				126	* @sb: superblock inode belongs to
				127	* @inode: inode to initialise
				128	*
				129	* These are initializations that need to be done on every inode
				130	* allocation as the fields are not initialised by slab allocation.
				131	*/
				132	int inode_init_always(struct super_block sb, struct inode inode)
				133	{
				134	static const struct inode_operations empty_iops;
				135	static const struct file_operations no_open_fops = {.open = no_open};
				136	struct address_space *const mapping = &inode->i_data;
				137
				138	inode->i_sb = sb;
				139	inode->i_blkbits = sb->s_blocksize_bits;
				140	inode->i_flags = 0;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	141	atomic64_set(&inode->i_sequence, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	142	atomic_set(&inode->i_count, 1);
				143	inode->i_op = &empty_iops;
				144	inode->i_fop = &no_open_fops;
				145	inode->__i_nlink = 1;
				146	inode->i_opflags = 0;
				147	if (sb->s_xattr)
				148	inode->i_opflags \|= IOP_XATTR;
				149	i_uid_write(inode, 0);
				150	i_gid_write(inode, 0);
				151	atomic_set(&inode->i_writecount, 0);
				152	inode->i_size = 0;
				153	inode->i_write_hint = WRITE_LIFE_NOT_SET;
				154	inode->i_blocks = 0;
				155	inode->i_bytes = 0;
				156	inode->i_generation = 0;
				157	inode->i_pipe = NULL;
				158	inode->i_bdev = NULL;
				159	inode->i_cdev = NULL;
				160	inode->i_link = NULL;
				161	inode->i_dir_seq = 0;
				162	inode->i_rdev = 0;
				163	inode->dirtied_when = 0;
				164
				165	#ifdef CONFIG_CGROUP_WRITEBACK
				166	inode->i_wb_frn_winner = 0;
				167	inode->i_wb_frn_avg_time = 0;
				168	inode->i_wb_frn_history = 0;
				169	#endif
				170
				171	if (security_inode_alloc(inode))
				172	goto out;
				173	spin_lock_init(&inode->i_lock);
				174	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
				175
				176	init_rwsem(&inode->i_rwsem);
				177	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
				178
				179	atomic_set(&inode->i_dio_count, 0);
				180
				181	mapping->a_ops = &empty_aops;
				182	mapping->host = inode;
				183	mapping->flags = 0;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	184	if (sb->s_type->fs_flags & FS_THP_SUPPORT)
				185	__set_bit(AS_THP_SUPPORT, &mapping->flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	186	mapping->wb_err = 0;
				187	atomic_set(&mapping->i_mmap_writable, 0);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	188	#ifdef CONFIG_READ_ONLY_THP_FOR_FS
				189	atomic_set(&mapping->nr_thps, 0);
				190	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	191	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
				192	mapping->private_data = NULL;
				193	mapping->writeback_index = 0;
				194	inode->i_private = NULL;
				195	inode->i_mapping = mapping;
				196	INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
				197	#ifdef CONFIG_FS_POSIX_ACL
				198	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
				199	#endif
				200
				201	#ifdef CONFIG_FSNOTIFY
				202	inode->i_fsnotify_mask = 0;
				203	#endif
				204	inode->i_flctx = NULL;
				205	this_cpu_inc(nr_inodes);
				206
				207	return 0;
				208	out:
				209	return -ENOMEM;
				210	}
				211	EXPORT_SYMBOL(inode_init_always);
				212
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	213	void free_inode_nonrcu(struct inode *inode)
				214	{
				215	kmem_cache_free(inode_cachep, inode);
				216	}
				217	EXPORT_SYMBOL(free_inode_nonrcu);
				218
				219	static void i_callback(struct rcu_head *head)
				220	{
				221	struct inode *inode = container_of(head, struct inode, i_rcu);
				222	if (inode->free_inode)
				223	inode->free_inode(inode);
				224	else
				225	free_inode_nonrcu(inode);
				226	}
				227
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	228	static struct inode alloc_inode(struct super_block sb)
				229	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	230	const struct super_operations *ops = sb->s_op;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	231	struct inode *inode;
				232
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	233	if (ops->alloc_inode)
				234	inode = ops->alloc_inode(sb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	235	else
				236	inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
				237
				238	if (!inode)
				239	return NULL;
				240
				241	if (unlikely(inode_init_always(sb, inode))) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	242	if (ops->destroy_inode) {
				243	ops->destroy_inode(inode);
				244	if (!ops->free_inode)
				245	return NULL;
				246	}
				247	inode->free_inode = ops->free_inode;
				248	i_callback(&inode->i_rcu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	249	return NULL;
				250	}
				251
				252	return inode;
				253	}
				254
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	255	void __destroy_inode(struct inode *inode)
				256	{
				257	BUG_ON(inode_has_buffers(inode));
				258	inode_detach_wb(inode);
				259	security_inode_free(inode);
				260	fsnotify_inode_delete(inode);
				261	locks_free_lock_context(inode);
				262	if (!inode->i_nlink) {
				263	WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
				264	atomic_long_dec(&inode->i_sb->s_remove_count);
				265	}
				266
				267	#ifdef CONFIG_FS_POSIX_ACL
				268	if (inode->i_acl && !is_uncached_acl(inode->i_acl))
				269	posix_acl_release(inode->i_acl);
				270	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
				271	posix_acl_release(inode->i_default_acl);
				272	#endif
				273	this_cpu_dec(nr_inodes);
				274	}
				275	EXPORT_SYMBOL(__destroy_inode);
				276
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	277	static void destroy_inode(struct inode *inode)
				278	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	279	const struct super_operations *ops = inode->i_sb->s_op;
				280
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	281	BUG_ON(!list_empty(&inode->i_lru));
				282	__destroy_inode(inode);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	283	if (ops->destroy_inode) {
				284	ops->destroy_inode(inode);
				285	if (!ops->free_inode)
				286	return;
				287	}
				288	inode->free_inode = ops->free_inode;
				289	call_rcu(&inode->i_rcu, i_callback);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	290	}
				291
				292	/**
				293	* drop_nlink - directly drop an inode's link count
				294	* @inode: inode
				295	*
				296	* This is a low-level filesystem helper to replace any
				297	* direct filesystem manipulation of i_nlink. In cases
				298	* where we are attempting to track writes to the
				299	* filesystem, a decrement to zero means an imminent
				300	* write when the file is truncated and actually unlinked
				301	* on the filesystem.
				302	*/
				303	void drop_nlink(struct inode *inode)
				304	{
				305	WARN_ON(inode->i_nlink == 0);
				306	inode->__i_nlink--;
				307	if (!inode->i_nlink)
				308	atomic_long_inc(&inode->i_sb->s_remove_count);
				309	}
				310	EXPORT_SYMBOL(drop_nlink);
				311
				312	/**
				313	* clear_nlink - directly zero an inode's link count
				314	* @inode: inode
				315	*
				316	* This is a low-level filesystem helper to replace any
				317	* direct filesystem manipulation of i_nlink. See
				318	* drop_nlink() for why we care about i_nlink hitting zero.
				319	*/
				320	void clear_nlink(struct inode *inode)
				321	{
				322	if (inode->i_nlink) {
				323	inode->__i_nlink = 0;
				324	atomic_long_inc(&inode->i_sb->s_remove_count);
				325	}
				326	}
				327	EXPORT_SYMBOL(clear_nlink);
				328
				329	/**
				330	* set_nlink - directly set an inode's link count
				331	* @inode: inode
				332	* @nlink: new nlink (should be non-zero)
				333	*
				334	* This is a low-level filesystem helper to replace any
				335	* direct filesystem manipulation of i_nlink.
				336	*/
				337	void set_nlink(struct inode *inode, unsigned int nlink)
				338	{
				339	if (!nlink) {
				340	clear_nlink(inode);
				341	} else {
				342	/* Yes, some filesystems do change nlink from zero to one */
				343	if (inode->i_nlink == 0)
				344	atomic_long_dec(&inode->i_sb->s_remove_count);
				345
				346	inode->__i_nlink = nlink;
				347	}
				348	}
				349	EXPORT_SYMBOL(set_nlink);
				350
				351	/**
				352	* inc_nlink - directly increment an inode's link count
				353	* @inode: inode
				354	*
				355	* This is a low-level filesystem helper to replace any
				356	* direct filesystem manipulation of i_nlink. Currently,
				357	* it is only here for parity with dec_nlink().
				358	*/
				359	void inc_nlink(struct inode *inode)
				360	{
				361	if (unlikely(inode->i_nlink == 0)) {
				362	WARN_ON(!(inode->i_state & I_LINKABLE));
				363	atomic_long_dec(&inode->i_sb->s_remove_count);
				364	}
				365
				366	inode->__i_nlink++;
				367	}
				368	EXPORT_SYMBOL(inc_nlink);
				369
				370	static void __address_space_init_once(struct address_space *mapping)
				371	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	372	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ \| XA_FLAGS_ACCOUNT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	373	init_rwsem(&mapping->i_mmap_rwsem);
				374	INIT_LIST_HEAD(&mapping->private_list);
				375	spin_lock_init(&mapping->private_lock);
				376	mapping->i_mmap = RB_ROOT_CACHED;
				377	}
				378
				379	void address_space_init_once(struct address_space *mapping)
				380	{
				381	memset(mapping, 0, sizeof(*mapping));
				382	__address_space_init_once(mapping);
				383	}
				384	EXPORT_SYMBOL(address_space_init_once);
				385
				386	/*
				387	* These are initializations that only need to be done
				388	* once, because the fields are idempotent across use
				389	* of the inode, so let the slab be aware of that.
				390	*/
				391	void inode_init_once(struct inode *inode)
				392	{
				393	memset(inode, 0, sizeof(*inode));
				394	INIT_HLIST_NODE(&inode->i_hash);
				395	INIT_LIST_HEAD(&inode->i_devices);
				396	INIT_LIST_HEAD(&inode->i_io_list);
				397	INIT_LIST_HEAD(&inode->i_wb_list);
				398	INIT_LIST_HEAD(&inode->i_lru);
				399	__address_space_init_once(&inode->i_data);
				400	i_size_ordered_init(inode);
				401	}
				402	EXPORT_SYMBOL(inode_init_once);
				403
				404	static void init_once(void *foo)
				405	{
				406	struct inode inode = (struct inode ) foo;
				407
				408	inode_init_once(inode);
				409	}
				410
				411	/*
				412	* inode->i_lock must be held
				413	*/
				414	void __iget(struct inode *inode)
				415	{
				416	atomic_inc(&inode->i_count);
				417	}
				418
				419	/*
				420	* get additional reference to inode; caller must already hold one.
				421	*/
				422	void ihold(struct inode *inode)
				423	{
				424	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
				425	}
				426	EXPORT_SYMBOL(ihold);
				427
				428	static void inode_lru_list_add(struct inode *inode)
				429	{
				430	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
				431	this_cpu_inc(nr_unused);
				432	else
				433	inode->i_state \|= I_REFERENCED;
				434	}
				435
				436	/*
				437	* Add inode to LRU if needed (inode is unused and clean).
				438	*
				439	* Needs inode->i_lock held.
				440	*/
				441	void inode_add_lru(struct inode *inode)
				442	{
				443	if (!(inode->i_state & (I_DIRTY_ALL \| I_SYNC \|
				444	I_FREEING \| I_WILL_FREE)) &&
				445	!atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
				446	inode_lru_list_add(inode);
				447	}
				448
				449
				450	static void inode_lru_list_del(struct inode *inode)
				451	{
				452
				453	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
				454	this_cpu_dec(nr_unused);
				455	}
				456
				457	/**
				458	* inode_sb_list_add - add inode to the superblock list of inodes
				459	* @inode: inode to add
				460	*/
				461	void inode_sb_list_add(struct inode *inode)
				462	{
				463	spin_lock(&inode->i_sb->s_inode_list_lock);
				464	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
				465	spin_unlock(&inode->i_sb->s_inode_list_lock);
				466	}
				467	EXPORT_SYMBOL_GPL(inode_sb_list_add);
				468
				469	static inline void inode_sb_list_del(struct inode *inode)
				470	{
				471	if (!list_empty(&inode->i_sb_list)) {
				472	spin_lock(&inode->i_sb->s_inode_list_lock);
				473	list_del_init(&inode->i_sb_list);
				474	spin_unlock(&inode->i_sb->s_inode_list_lock);
				475	}
				476	}
				477
				478	static unsigned long hash(struct super_block *sb, unsigned long hashval)
				479	{
				480	unsigned long tmp;
				481
				482	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
				483	L1_CACHE_BYTES;
				484	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
				485	return tmp & i_hash_mask;
				486	}
				487
				488	/**
				489	* __insert_inode_hash - hash an inode
				490	* @inode: unhashed inode
				491	* @hashval: unsigned long value used to locate this object in the
				492	* inode_hashtable.
				493	*
				494	* Add an inode to the inode hash for this superblock.
				495	*/
				496	void __insert_inode_hash(struct inode *inode, unsigned long hashval)
				497	{
				498	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
				499
				500	spin_lock(&inode_hash_lock);
				501	spin_lock(&inode->i_lock);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	502	hlist_add_head_rcu(&inode->i_hash, b);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	503	spin_unlock(&inode->i_lock);
				504	spin_unlock(&inode_hash_lock);
				505	}
				506	EXPORT_SYMBOL(__insert_inode_hash);
				507
				508	/**
				509	* __remove_inode_hash - remove an inode from the hash
				510	* @inode: inode to unhash
				511	*
				512	* Remove an inode from the superblock.
				513	*/
				514	void __remove_inode_hash(struct inode *inode)
				515	{
				516	spin_lock(&inode_hash_lock);
				517	spin_lock(&inode->i_lock);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	518	hlist_del_init_rcu(&inode->i_hash);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	519	spin_unlock(&inode->i_lock);
				520	spin_unlock(&inode_hash_lock);
				521	}
				522	EXPORT_SYMBOL(__remove_inode_hash);
				523
				524	void clear_inode(struct inode *inode)
				525	{
				526	/*
				527	* We have to cycle the i_pages lock here because reclaim can be in the
				528	* process of removing the last page (in __delete_from_page_cache())
				529	* and we must not free the mapping under it.
				530	*/
				531	xa_lock_irq(&inode->i_data.i_pages);
				532	BUG_ON(inode->i_data.nrpages);
				533	BUG_ON(inode->i_data.nrexceptional);
				534	xa_unlock_irq(&inode->i_data.i_pages);
				535	BUG_ON(!list_empty(&inode->i_data.private_list));
				536	BUG_ON(!(inode->i_state & I_FREEING));
				537	BUG_ON(inode->i_state & I_CLEAR);
				538	BUG_ON(!list_empty(&inode->i_wb_list));
				539	/* don't need i_lock here, no concurrent mods to i_state */
				540	inode->i_state = I_FREEING \| I_CLEAR;
				541	}
				542	EXPORT_SYMBOL(clear_inode);
				543
				544	/*
				545	* Free the inode passed in, removing it from the lists it is still connected
				546	* to. We remove any pages still attached to the inode and wait for any IO that
				547	* is still in progress before finally destroying the inode.
				548	*
				549	* An inode must already be marked I_FREEING so that we avoid the inode being
				550	* moved back onto lists if we race with other code that manipulates the lists
				551	* (e.g. writeback_single_inode). The caller is responsible for setting this.
				552	*
				553	* An inode must already be removed from the LRU list before being evicted from
				554	* the cache. This should occur atomically with setting the I_FREEING state
				555	* flag, so no inodes here should ever be on the LRU when being evicted.
				556	*/
				557	static void evict(struct inode *inode)
				558	{
				559	const struct super_operations *op = inode->i_sb->s_op;
				560
				561	BUG_ON(!(inode->i_state & I_FREEING));
				562	BUG_ON(!list_empty(&inode->i_lru));
				563
				564	if (!list_empty(&inode->i_io_list))
				565	inode_io_list_del(inode);
				566
				567	inode_sb_list_del(inode);
				568
				569	/*
				570	* Wait for flusher thread to be done with the inode so that filesystem
				571	* does not start destroying it while writeback is still running. Since
				572	* the inode has I_FREEING set, flusher thread won't start new work on
				573	* the inode. We just have to wait for running writeback to finish.
				574	*/
				575	inode_wait_for_writeback(inode);
				576
				577	if (op->evict_inode) {
				578	op->evict_inode(inode);
				579	} else {
				580	truncate_inode_pages_final(&inode->i_data);
				581	clear_inode(inode);
				582	}
				583	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
				584	bd_forget(inode);
				585	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
				586	cd_forget(inode);
				587
				588	remove_inode_hash(inode);
				589
				590	spin_lock(&inode->i_lock);
				591	wake_up_bit(&inode->i_state, __I_NEW);
				592	BUG_ON(inode->i_state != (I_FREEING \| I_CLEAR));
				593	spin_unlock(&inode->i_lock);
				594
				595	destroy_inode(inode);
				596	}
				597
				598	/*
				599	* dispose_list - dispose of the contents of a local list
				600	* @head: the head of the list to free
				601	*
				602	* Dispose-list gets a local list with local inodes in it, so it doesn't
				603	* need to worry about list corruption and SMP locks.
				604	*/
				605	static void dispose_list(struct list_head *head)
				606	{
				607	while (!list_empty(head)) {
				608	struct inode *inode;
				609
				610	inode = list_first_entry(head, struct inode, i_lru);
				611	list_del_init(&inode->i_lru);
				612
				613	evict(inode);
				614	cond_resched();
				615	}
				616	}
				617
				618	/**
				619	* evict_inodes - evict all evictable inodes for a superblock
				620	* @sb: superblock to operate on
				621	*
				622	* Make sure that no inodes with zero refcount are retained. This is
				623	* called by superblock shutdown after having SB_ACTIVE flag removed,
				624	* so any inode reaching zero refcount during or after that call will
				625	* be immediately evicted.
				626	*/
				627	void evict_inodes(struct super_block *sb)
				628	{
				629	struct inode inode, next;
				630	LIST_HEAD(dispose);
				631
				632	again:
				633	spin_lock(&sb->s_inode_list_lock);
				634	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
				635	if (atomic_read(&inode->i_count))
				636	continue;
				637
				638	spin_lock(&inode->i_lock);
				639	if (inode->i_state & (I_NEW \| I_FREEING \| I_WILL_FREE)) {
				640	spin_unlock(&inode->i_lock);
				641	continue;
				642	}
				643
				644	inode->i_state \|= I_FREEING;
				645	inode_lru_list_del(inode);
				646	spin_unlock(&inode->i_lock);
				647	list_add(&inode->i_lru, &dispose);
				648
				649	/*
				650	* We can have a ton of inodes to evict at unmount time given
				651	* enough memory, check to see if we need to go to sleep for a
				652	* bit so we don't livelock.
				653	*/
				654	if (need_resched()) {
				655	spin_unlock(&sb->s_inode_list_lock);
				656	cond_resched();
				657	dispose_list(&dispose);
				658	goto again;
				659	}
				660	}
				661	spin_unlock(&sb->s_inode_list_lock);
				662
				663	dispose_list(&dispose);
				664	}
				665	EXPORT_SYMBOL_GPL(evict_inodes);
				666
				667	/**
				668	* invalidate_inodes - attempt to free all inodes on a superblock
				669	* @sb: superblock to operate on
				670	* @kill_dirty: flag to guide handling of dirty inodes
				671	*
				672	* Attempts to free all inodes for a given superblock. If there were any
				673	* busy inodes return a non-zero value, else zero.
				674	* If @kill_dirty is set, discard dirty inodes too, otherwise treat
				675	* them as busy.
				676	*/
				677	int invalidate_inodes(struct super_block *sb, bool kill_dirty)
				678	{
				679	int busy = 0;
				680	struct inode inode, next;
				681	LIST_HEAD(dispose);
				682
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	683	again:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	684	spin_lock(&sb->s_inode_list_lock);
				685	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
				686	spin_lock(&inode->i_lock);
				687	if (inode->i_state & (I_NEW \| I_FREEING \| I_WILL_FREE)) {
				688	spin_unlock(&inode->i_lock);
				689	continue;
				690	}
				691	if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
				692	spin_unlock(&inode->i_lock);
				693	busy = 1;
				694	continue;
				695	}
				696	if (atomic_read(&inode->i_count)) {
				697	spin_unlock(&inode->i_lock);
				698	busy = 1;
				699	continue;
				700	}
				701
				702	inode->i_state \|= I_FREEING;
				703	inode_lru_list_del(inode);
				704	spin_unlock(&inode->i_lock);
				705	list_add(&inode->i_lru, &dispose);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	706	if (need_resched()) {
				707	spin_unlock(&sb->s_inode_list_lock);
				708	cond_resched();
				709	dispose_list(&dispose);
				710	goto again;
				711	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	712	}
				713	spin_unlock(&sb->s_inode_list_lock);
				714
				715	dispose_list(&dispose);
				716
				717	return busy;
				718	}
				719
				720	/*
				721	* Isolate the inode from the LRU in preparation for freeing it.
				722	*
				723	* Any inodes which are pinned purely because of attached pagecache have their
				724	* pagecache removed. If the inode has metadata buffers attached to
				725	* mapping->private_list then try to remove them.
				726	*
				727	* If the inode has the I_REFERENCED flag set, then it means that it has been
				728	* used recently - the flag is set in iput_final(). When we encounter such an
				729	* inode, clear the flag and move it to the back of the LRU so it gets another
				730	* pass through the LRU before it gets reclaimed. This is necessary because of
				731	* the fact we are doing lazy LRU updates to minimise lock contention so the
				732	* LRU does not have strict ordering. Hence we don't want to reclaim inodes
				733	* with this flag set because they are the inodes that are out of order.
				734	*/
				735	static enum lru_status inode_lru_isolate(struct list_head *item,
				736	struct list_lru_one lru, spinlock_t lru_lock, void *arg)
				737	{
				738	struct list_head *freeable = arg;
				739	struct inode *inode = container_of(item, struct inode, i_lru);
				740
				741	/*
				742	* we are inverting the lru lock/inode->i_lock here, so use a trylock.
				743	* If we fail to get the lock, just skip it.
				744	*/
				745	if (!spin_trylock(&inode->i_lock))
				746	return LRU_SKIP;
				747
				748	/*
				749	* Referenced or dirty inodes are still in use. Give them another pass
				750	* through the LRU as we canot reclaim them now.
				751	*/
				752	if (atomic_read(&inode->i_count) \|\|
				753	(inode->i_state & ~I_REFERENCED)) {
				754	list_lru_isolate(lru, &inode->i_lru);
				755	spin_unlock(&inode->i_lock);
				756	this_cpu_dec(nr_unused);
				757	return LRU_REMOVED;
				758	}
				759
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	760	/* recently referenced inodes get one more pass */
				761	if (inode->i_state & I_REFERENCED) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	762	inode->i_state &= ~I_REFERENCED;
				763	spin_unlock(&inode->i_lock);
				764	return LRU_ROTATE;
				765	}
				766
				767	if (inode_has_buffers(inode) \|\| inode->i_data.nrpages) {
				768	__iget(inode);
				769	spin_unlock(&inode->i_lock);
				770	spin_unlock(lru_lock);
				771	if (remove_inode_buffers(inode)) {
				772	unsigned long reap;
				773	reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
				774	if (current_is_kswapd())
				775	__count_vm_events(KSWAPD_INODESTEAL, reap);
				776	else
				777	__count_vm_events(PGINODESTEAL, reap);
				778	if (current->reclaim_state)
				779	current->reclaim_state->reclaimed_slab += reap;
				780	}
				781	iput(inode);
				782	spin_lock(lru_lock);
				783	return LRU_RETRY;
				784	}
				785
				786	WARN_ON(inode->i_state & I_NEW);
				787	inode->i_state \|= I_FREEING;
				788	list_lru_isolate_move(lru, &inode->i_lru, freeable);
				789	spin_unlock(&inode->i_lock);
				790
				791	this_cpu_dec(nr_unused);
				792	return LRU_REMOVED;
				793	}
				794
				795	/*
				796	* Walk the superblock inode LRU for freeable inodes and attempt to free them.
				797	* This is called from the superblock shrinker function with a number of inodes
				798	* to trim from the LRU. Inodes to be freed are moved to a temporary list and
				799	* then are freed outside inode_lock by dispose_list().
				800	*/
				801	long prune_icache_sb(struct super_block sb, struct shrink_control sc)
				802	{
				803	LIST_HEAD(freeable);
				804	long freed;
				805
				806	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
				807	inode_lru_isolate, &freeable);
				808	dispose_list(&freeable);
				809	return freed;
				810	}
				811
				812	static void __wait_on_freeing_inode(struct inode *inode);
				813	/*
				814	* Called with the inode lock held.
				815	*/
				816	static struct inode find_inode(struct super_block sb,
				817	struct hlist_head *head,
				818	int (test)(struct inode , void *),
				819	void *data)
				820	{
				821	struct inode *inode = NULL;
				822
				823	repeat:
				824	hlist_for_each_entry(inode, head, i_hash) {
				825	if (inode->i_sb != sb)
				826	continue;
				827	if (!test(inode, data))
				828	continue;
				829	spin_lock(&inode->i_lock);
				830	if (inode->i_state & (I_FREEING\|I_WILL_FREE)) {
				831	__wait_on_freeing_inode(inode);
				832	goto repeat;
				833	}
				834	if (unlikely(inode->i_state & I_CREATING)) {
				835	spin_unlock(&inode->i_lock);
				836	return ERR_PTR(-ESTALE);
				837	}
				838	__iget(inode);
				839	spin_unlock(&inode->i_lock);
				840	return inode;
				841	}
				842	return NULL;
				843	}
				844
				845	/*
				846	* find_inode_fast is the fast path version of find_inode, see the comment at
				847	* iget_locked for details.
				848	*/
				849	static struct inode find_inode_fast(struct super_block sb,
				850	struct hlist_head *head, unsigned long ino)
				851	{
				852	struct inode *inode = NULL;
				853
				854	repeat:
				855	hlist_for_each_entry(inode, head, i_hash) {
				856	if (inode->i_ino != ino)
				857	continue;
				858	if (inode->i_sb != sb)
				859	continue;
				860	spin_lock(&inode->i_lock);
				861	if (inode->i_state & (I_FREEING\|I_WILL_FREE)) {
				862	__wait_on_freeing_inode(inode);
				863	goto repeat;
				864	}
				865	if (unlikely(inode->i_state & I_CREATING)) {
				866	spin_unlock(&inode->i_lock);
				867	return ERR_PTR(-ESTALE);
				868	}
				869	__iget(inode);
				870	spin_unlock(&inode->i_lock);
				871	return inode;
				872	}
				873	return NULL;
				874	}
				875
				876	/*
				877	* Each cpu owns a range of LAST_INO_BATCH numbers.
				878	* 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
				879	* to renew the exhausted range.
				880	*
				881	* This does not significantly increase overflow rate because every CPU can
				882	* consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
				883	* NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
				884	* 2^32 range, and is a worst-case. Even a 50% wastage would only increase
				885	* overflow rate by 2x, which does not seem too significant.
				886	*
				887	* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
				888	* error if st_ino won't fit in target struct field. Use 32bit counter
				889	* here to attempt to avoid that.
				890	*/
				891	#define LAST_INO_BATCH 1024
				892	static DEFINE_PER_CPU(unsigned int, last_ino);
				893
				894	unsigned int get_next_ino(void)
				895	{
				896	unsigned int *p = &get_cpu_var(last_ino);
				897	unsigned int res = *p;
				898
				899	#ifdef CONFIG_SMP
				900	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
				901	static atomic_t shared_last_ino;
				902	int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
				903
				904	res = next - LAST_INO_BATCH;
				905	}
				906	#endif
				907
				908	res++;
				909	/* get_next_ino should not provide a 0 inode number */
				910	if (unlikely(!res))
				911	res++;
				912	*p = res;
				913	put_cpu_var(last_ino);
				914	return res;
				915	}
				916	EXPORT_SYMBOL(get_next_ino);
				917
				918	/**
				919	* new_inode_pseudo - obtain an inode
				920	* @sb: superblock
				921	*
				922	* Allocates a new inode for given superblock.
				923	* Inode wont be chained in superblock s_inodes list
				924	* This means :
				925	* - fs can't be unmount
				926	* - quotas, fsnotify, writeback can't work
				927	*/
				928	struct inode new_inode_pseudo(struct super_block sb)
				929	{
				930	struct inode *inode = alloc_inode(sb);
				931
				932	if (inode) {
				933	spin_lock(&inode->i_lock);
				934	inode->i_state = 0;
				935	spin_unlock(&inode->i_lock);
				936	INIT_LIST_HEAD(&inode->i_sb_list);
				937	}
				938	return inode;
				939	}
				940
				941	/**
				942	* new_inode - obtain an inode
				943	* @sb: superblock
				944	*
				945	* Allocates a new inode for given superblock. The default gfp_mask
				946	* for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
				947	* If HIGHMEM pages are unsuitable or it is known that pages allocated
				948	* for the page cache are not reclaimable or migratable,
				949	* mapping_set_gfp_mask() must be called with suitable flags on the
				950	* newly created inode's mapping
				951	*
				952	*/
				953	struct inode new_inode(struct super_block sb)
				954	{
				955	struct inode *inode;
				956
				957	spin_lock_prefetch(&sb->s_inode_list_lock);
				958
				959	inode = new_inode_pseudo(sb);
				960	if (inode)
				961	inode_sb_list_add(inode);
				962	return inode;
				963	}
				964	EXPORT_SYMBOL(new_inode);
				965
				966	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				967	void lockdep_annotate_inode_mutex_key(struct inode *inode)
				968	{
				969	if (S_ISDIR(inode->i_mode)) {
				970	struct file_system_type *type = inode->i_sb->s_type;
				971
				972	/* Set new key only if filesystem hasn't already changed it */
				973	if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
				974	/*
				975	* ensure nobody is actually holding i_mutex
				976	*/
				977	// mutex_destroy(&inode->i_mutex);
				978	init_rwsem(&inode->i_rwsem);
				979	lockdep_set_class(&inode->i_rwsem,
				980	&type->i_mutex_dir_key);
				981	}
				982	}
				983	}
				984	EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
				985	#endif
				986
				987	/**
				988	* unlock_new_inode - clear the I_NEW state and wake up any waiters
				989	* @inode: new inode to unlock
				990	*
				991	* Called when the inode is fully initialised to clear the new state of the
				992	* inode and wake up anyone waiting for the inode to finish initialisation.
				993	*/
				994	void unlock_new_inode(struct inode *inode)
				995	{
				996	lockdep_annotate_inode_mutex_key(inode);
				997	spin_lock(&inode->i_lock);
				998	WARN_ON(!(inode->i_state & I_NEW));
				999	inode->i_state &= ~I_NEW & ~I_CREATING;
				1000	smp_mb();
				1001	wake_up_bit(&inode->i_state, __I_NEW);
				1002	spin_unlock(&inode->i_lock);
				1003	}
				1004	EXPORT_SYMBOL(unlock_new_inode);
				1005
				1006	void discard_new_inode(struct inode *inode)
				1007	{
				1008	lockdep_annotate_inode_mutex_key(inode);
				1009	spin_lock(&inode->i_lock);
				1010	WARN_ON(!(inode->i_state & I_NEW));
				1011	inode->i_state &= ~I_NEW;
				1012	smp_mb();
				1013	wake_up_bit(&inode->i_state, __I_NEW);
				1014	spin_unlock(&inode->i_lock);
				1015	iput(inode);
				1016	}
				1017	EXPORT_SYMBOL(discard_new_inode);
				1018
				1019	/**
				1020	* lock_two_nondirectories - take two i_mutexes on non-directory objects
				1021	*
				1022	* Lock any non-NULL argument that is not a directory.
				1023	* Zero, one or two objects may be locked by this function.
				1024	*
				1025	* @inode1: first inode to lock
				1026	* @inode2: second inode to lock
				1027	*/
				1028	void lock_two_nondirectories(struct inode inode1, struct inode inode2)
				1029	{
				1030	if (inode1 > inode2)
				1031	swap(inode1, inode2);
				1032
				1033	if (inode1 && !S_ISDIR(inode1->i_mode))
				1034	inode_lock(inode1);
				1035	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
				1036	inode_lock_nested(inode2, I_MUTEX_NONDIR2);
				1037	}
				1038	EXPORT_SYMBOL(lock_two_nondirectories);
				1039
				1040	/**
				1041	* unlock_two_nondirectories - release locks from lock_two_nondirectories()
				1042	* @inode1: first inode to unlock
				1043	* @inode2: second inode to unlock
				1044	*/
				1045	void unlock_two_nondirectories(struct inode inode1, struct inode inode2)
				1046	{
				1047	if (inode1 && !S_ISDIR(inode1->i_mode))
				1048	inode_unlock(inode1);
				1049	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
				1050	inode_unlock(inode2);
				1051	}
				1052	EXPORT_SYMBOL(unlock_two_nondirectories);
				1053
				1054	/**
				1055	* inode_insert5 - obtain an inode from a mounted file system
				1056	* @inode: pre-allocated inode to use for insert to cache
				1057	* @hashval: hash value (usually inode number) to get
				1058	* @test: callback used for comparisons between inodes
				1059	* @set: callback used to initialize a new struct inode
				1060	* @data: opaque data pointer to pass to @test and @set
				1061	*
				1062	* Search for the inode specified by @hashval and @data in the inode cache,
				1063	* and if present it is return it with an increased reference count. This is
				1064	* a variant of iget5_locked() for callers that don't want to fail on memory
				1065	* allocation of inode.
				1066	*
				1067	* If the inode is not in cache, insert the pre-allocated inode to cache and
				1068	* return it locked, hashed, and with the I_NEW flag set. The file system gets
				1069	* to fill it in before unlocking it via unlock_new_inode().
				1070	*
				1071	* Note both @test and @set are called with the inode_hash_lock held, so can't
				1072	* sleep.
				1073	*/
				1074	struct inode inode_insert5(struct inode inode, unsigned long hashval,
				1075	int (test)(struct inode , void *),
				1076	int (set)(struct inode , void ), void data)
				1077	{
				1078	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
				1079	struct inode *old;
				1080	bool creating = inode->i_state & I_CREATING;
				1081
				1082	again:
				1083	spin_lock(&inode_hash_lock);
				1084	old = find_inode(inode->i_sb, head, test, data);
				1085	if (unlikely(old)) {
				1086	/*
				1087	* Uhhuh, somebody else created the same inode under us.
				1088	* Use the old inode instead of the preallocated one.
				1089	*/
				1090	spin_unlock(&inode_hash_lock);
				1091	if (IS_ERR(old))
				1092	return NULL;
				1093	wait_on_inode(old);
				1094	if (unlikely(inode_unhashed(old))) {
				1095	iput(old);
				1096	goto again;
				1097	}
				1098	return old;
				1099	}
				1100
				1101	if (set && unlikely(set(inode, data))) {
				1102	inode = NULL;
				1103	goto unlock;
				1104	}
				1105
				1106	/*
				1107	* Return the locked inode with I_NEW set, the
				1108	* caller is responsible for filling in the contents
				1109	*/
				1110	spin_lock(&inode->i_lock);
				1111	inode->i_state \|= I_NEW;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1112	hlist_add_head_rcu(&inode->i_hash, head);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1113	spin_unlock(&inode->i_lock);
				1114	if (!creating)
				1115	inode_sb_list_add(inode);
				1116	unlock:
				1117	spin_unlock(&inode_hash_lock);
				1118
				1119	return inode;
				1120	}
				1121	EXPORT_SYMBOL(inode_insert5);
				1122
				1123	/**
				1124	* iget5_locked - obtain an inode from a mounted file system
				1125	* @sb: super block of file system
				1126	* @hashval: hash value (usually inode number) to get
				1127	* @test: callback used for comparisons between inodes
				1128	* @set: callback used to initialize a new struct inode
				1129	* @data: opaque data pointer to pass to @test and @set
				1130	*
				1131	* Search for the inode specified by @hashval and @data in the inode cache,
				1132	* and if present it is return it with an increased reference count. This is
				1133	* a generalized version of iget_locked() for file systems where the inode
				1134	* number is not sufficient for unique identification of an inode.
				1135	*
				1136	* If the inode is not in cache, allocate a new inode and return it locked,
				1137	* hashed, and with the I_NEW flag set. The file system gets to fill it in
				1138	* before unlocking it via unlock_new_inode().
				1139	*
				1140	* Note both @test and @set are called with the inode_hash_lock held, so can't
				1141	* sleep.
				1142	*/
				1143	struct inode iget5_locked(struct super_block sb, unsigned long hashval,
				1144	int (test)(struct inode , void *),
				1145	int (set)(struct inode , void ), void data)
				1146	{
				1147	struct inode *inode = ilookup5(sb, hashval, test, data);
				1148
				1149	if (!inode) {
				1150	struct inode *new = alloc_inode(sb);
				1151
				1152	if (new) {
				1153	new->i_state = 0;
				1154	inode = inode_insert5(new, hashval, test, set, data);
				1155	if (unlikely(inode != new))
				1156	destroy_inode(new);
				1157	}
				1158	}
				1159	return inode;
				1160	}
				1161	EXPORT_SYMBOL(iget5_locked);
				1162
				1163	/**
				1164	* iget_locked - obtain an inode from a mounted file system
				1165	* @sb: super block of file system
				1166	* @ino: inode number to get
				1167	*
				1168	* Search for the inode specified by @ino in the inode cache and if present
				1169	* return it with an increased reference count. This is for file systems
				1170	* where the inode number is sufficient for unique identification of an inode.
				1171	*
				1172	* If the inode is not in cache, allocate a new inode and return it locked,
				1173	* hashed, and with the I_NEW flag set. The file system gets to fill it in
				1174	* before unlocking it via unlock_new_inode().
				1175	*/
				1176	struct inode iget_locked(struct super_block sb, unsigned long ino)
				1177	{
				1178	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1179	struct inode *inode;
				1180	again:
				1181	spin_lock(&inode_hash_lock);
				1182	inode = find_inode_fast(sb, head, ino);
				1183	spin_unlock(&inode_hash_lock);
				1184	if (inode) {
				1185	if (IS_ERR(inode))
				1186	return NULL;
				1187	wait_on_inode(inode);
				1188	if (unlikely(inode_unhashed(inode))) {
				1189	iput(inode);
				1190	goto again;
				1191	}
				1192	return inode;
				1193	}
				1194
				1195	inode = alloc_inode(sb);
				1196	if (inode) {
				1197	struct inode *old;
				1198
				1199	spin_lock(&inode_hash_lock);
				1200	/* We released the lock, so.. */
				1201	old = find_inode_fast(sb, head, ino);
				1202	if (!old) {
				1203	inode->i_ino = ino;
				1204	spin_lock(&inode->i_lock);
				1205	inode->i_state = I_NEW;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1206	hlist_add_head_rcu(&inode->i_hash, head);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1207	spin_unlock(&inode->i_lock);
				1208	inode_sb_list_add(inode);
				1209	spin_unlock(&inode_hash_lock);
				1210
				1211	/* Return the locked inode with I_NEW set, the
				1212	* caller is responsible for filling in the contents
				1213	*/
				1214	return inode;
				1215	}
				1216
				1217	/*
				1218	* Uhhuh, somebody else created the same inode under
				1219	* us. Use the old inode instead of the one we just
				1220	* allocated.
				1221	*/
				1222	spin_unlock(&inode_hash_lock);
				1223	destroy_inode(inode);
				1224	if (IS_ERR(old))
				1225	return NULL;
				1226	inode = old;
				1227	wait_on_inode(inode);
				1228	if (unlikely(inode_unhashed(inode))) {
				1229	iput(inode);
				1230	goto again;
				1231	}
				1232	}
				1233	return inode;
				1234	}
				1235	EXPORT_SYMBOL(iget_locked);
				1236
				1237	/*
				1238	* search the inode cache for a matching inode number.
				1239	* If we find one, then the inode number we are trying to
				1240	* allocate is not unique and so we should not use it.
				1241	*
				1242	* Returns 1 if the inode number is unique, 0 if it is not.
				1243	*/
				1244	static int test_inode_iunique(struct super_block *sb, unsigned long ino)
				1245	{
				1246	struct hlist_head *b = inode_hashtable + hash(sb, ino);
				1247	struct inode *inode;
				1248
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1249	hlist_for_each_entry_rcu(inode, b, i_hash) {
				1250	if (inode->i_ino == ino && inode->i_sb == sb)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1251	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1252	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1253	return 1;
				1254	}
				1255
				1256	/**
				1257	* iunique - get a unique inode number
				1258	* @sb: superblock
				1259	* @max_reserved: highest reserved inode number
				1260	*
				1261	* Obtain an inode number that is unique on the system for a given
				1262	* superblock. This is used by file systems that have no natural
				1263	* permanent inode numbering system. An inode number is returned that
				1264	* is higher than the reserved limit but unique.
				1265	*
				1266	* BUGS:
				1267	* With a large number of inodes live on the file system this function
				1268	* currently becomes quite slow.
				1269	*/
				1270	ino_t iunique(struct super_block *sb, ino_t max_reserved)
				1271	{
				1272	/*
				1273	* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
				1274	* error if st_ino won't fit in target struct field. Use 32bit counter
				1275	* here to attempt to avoid that.
				1276	*/
				1277	static DEFINE_SPINLOCK(iunique_lock);
				1278	static unsigned int counter;
				1279	ino_t res;
				1280
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1281	rcu_read_lock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1282	spin_lock(&iunique_lock);
				1283	do {
				1284	if (counter <= max_reserved)
				1285	counter = max_reserved + 1;
				1286	res = counter++;
				1287	} while (!test_inode_iunique(sb, res));
				1288	spin_unlock(&iunique_lock);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1289	rcu_read_unlock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1290
				1291	return res;
				1292	}
				1293	EXPORT_SYMBOL(iunique);
				1294
				1295	struct inode igrab(struct inode inode)
				1296	{
				1297	spin_lock(&inode->i_lock);
				1298	if (!(inode->i_state & (I_FREEING\|I_WILL_FREE))) {
				1299	__iget(inode);
				1300	spin_unlock(&inode->i_lock);
				1301	} else {
				1302	spin_unlock(&inode->i_lock);
				1303	/*
				1304	* Handle the case where s_op->clear_inode is not been
				1305	* called yet, and somebody is calling igrab
				1306	* while the inode is getting freed.
				1307	*/
				1308	inode = NULL;
				1309	}
				1310	return inode;
				1311	}
				1312	EXPORT_SYMBOL(igrab);
				1313
				1314	/**
				1315	* ilookup5_nowait - search for an inode in the inode cache
				1316	* @sb: super block of file system to search
				1317	* @hashval: hash value (usually inode number) to search for
				1318	* @test: callback used for comparisons between inodes
				1319	* @data: opaque data pointer to pass to @test
				1320	*
				1321	* Search for the inode specified by @hashval and @data in the inode cache.
				1322	* If the inode is in the cache, the inode is returned with an incremented
				1323	* reference count.
				1324	*
				1325	* Note: I_NEW is not waited upon so you have to be very careful what you do
				1326	* with the returned inode. You probably should be using ilookup5() instead.
				1327	*
				1328	* Note2: @test is called with the inode_hash_lock held, so can't sleep.
				1329	*/
				1330	struct inode ilookup5_nowait(struct super_block sb, unsigned long hashval,
				1331	int (test)(struct inode , void ), void data)
				1332	{
				1333	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1334	struct inode *inode;
				1335
				1336	spin_lock(&inode_hash_lock);
				1337	inode = find_inode(sb, head, test, data);
				1338	spin_unlock(&inode_hash_lock);
				1339
				1340	return IS_ERR(inode) ? NULL : inode;
				1341	}
				1342	EXPORT_SYMBOL(ilookup5_nowait);
				1343
				1344	/**
				1345	* ilookup5 - search for an inode in the inode cache
				1346	* @sb: super block of file system to search
				1347	* @hashval: hash value (usually inode number) to search for
				1348	* @test: callback used for comparisons between inodes
				1349	* @data: opaque data pointer to pass to @test
				1350	*
				1351	* Search for the inode specified by @hashval and @data in the inode cache,
				1352	* and if the inode is in the cache, return the inode with an incremented
				1353	* reference count. Waits on I_NEW before returning the inode.
				1354	* returned with an incremented reference count.
				1355	*
				1356	* This is a generalized version of ilookup() for file systems where the
				1357	* inode number is not sufficient for unique identification of an inode.
				1358	*
				1359	* Note: @test is called with the inode_hash_lock held, so can't sleep.
				1360	*/
				1361	struct inode ilookup5(struct super_block sb, unsigned long hashval,
				1362	int (test)(struct inode , void ), void data)
				1363	{
				1364	struct inode *inode;
				1365	again:
				1366	inode = ilookup5_nowait(sb, hashval, test, data);
				1367	if (inode) {
				1368	wait_on_inode(inode);
				1369	if (unlikely(inode_unhashed(inode))) {
				1370	iput(inode);
				1371	goto again;
				1372	}
				1373	}
				1374	return inode;
				1375	}
				1376	EXPORT_SYMBOL(ilookup5);
				1377
				1378	/**
				1379	* ilookup - search for an inode in the inode cache
				1380	* @sb: super block of file system to search
				1381	* @ino: inode number to search for
				1382	*
				1383	* Search for the inode @ino in the inode cache, and if the inode is in the
				1384	* cache, the inode is returned with an incremented reference count.
				1385	*/
				1386	struct inode ilookup(struct super_block sb, unsigned long ino)
				1387	{
				1388	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1389	struct inode *inode;
				1390	again:
				1391	spin_lock(&inode_hash_lock);
				1392	inode = find_inode_fast(sb, head, ino);
				1393	spin_unlock(&inode_hash_lock);
				1394
				1395	if (inode) {
				1396	if (IS_ERR(inode))
				1397	return NULL;
				1398	wait_on_inode(inode);
				1399	if (unlikely(inode_unhashed(inode))) {
				1400	iput(inode);
				1401	goto again;
				1402	}
				1403	}
				1404	return inode;
				1405	}
				1406	EXPORT_SYMBOL(ilookup);
				1407
				1408	/**
				1409	* find_inode_nowait - find an inode in the inode cache
				1410	* @sb: super block of file system to search
				1411	* @hashval: hash value (usually inode number) to search for
				1412	* @match: callback used for comparisons between inodes
				1413	* @data: opaque data pointer to pass to @match
				1414	*
				1415	* Search for the inode specified by @hashval and @data in the inode
				1416	* cache, where the helper function @match will return 0 if the inode
				1417	* does not match, 1 if the inode does match, and -1 if the search
				1418	* should be stopped. The @match function must be responsible for
				1419	* taking the i_lock spin_lock and checking i_state for an inode being
				1420	* freed or being initialized, and incrementing the reference count
				1421	* before returning 1. It also must not sleep, since it is called with
				1422	* the inode_hash_lock spinlock held.
				1423	*
				1424	* This is a even more generalized version of ilookup5() when the
				1425	* function must never block --- find_inode() can block in
				1426	* __wait_on_freeing_inode() --- or when the caller can not increment
				1427	* the reference count because the resulting iput() might cause an
				1428	* inode eviction. The tradeoff is that the @match funtion must be
				1429	* very carefully implemented.
				1430	*/
				1431	struct inode find_inode_nowait(struct super_block sb,
				1432	unsigned long hashval,
				1433	int (match)(struct inode , unsigned long,
				1434	void *),
				1435	void *data)
				1436	{
				1437	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1438	struct inode inode, ret_inode = NULL;
				1439	int mval;
				1440
				1441	spin_lock(&inode_hash_lock);
				1442	hlist_for_each_entry(inode, head, i_hash) {
				1443	if (inode->i_sb != sb)
				1444	continue;
				1445	mval = match(inode, hashval, data);
				1446	if (mval == 0)
				1447	continue;
				1448	if (mval == 1)
				1449	ret_inode = inode;
				1450	goto out;
				1451	}
				1452	out:
				1453	spin_unlock(&inode_hash_lock);
				1454	return ret_inode;
				1455	}
				1456	EXPORT_SYMBOL(find_inode_nowait);
				1457
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1458	/**
				1459	* find_inode_rcu - find an inode in the inode cache
				1460	* @sb: Super block of file system to search
				1461	* @hashval: Key to hash
				1462	* @test: Function to test match on an inode
				1463	* @data: Data for test function
				1464	*
				1465	* Search for the inode specified by @hashval and @data in the inode cache,
				1466	* where the helper function @test will return 0 if the inode does not match
				1467	* and 1 if it does. The @test function must be responsible for taking the
				1468	* i_lock spin_lock and checking i_state for an inode being freed or being
				1469	* initialized.
				1470	*
				1471	* If successful, this will return the inode for which the @test function
				1472	* returned 1 and NULL otherwise.
				1473	*
				1474	* The @test function is not permitted to take a ref on any inode presented.
				1475	* It is also not permitted to sleep.
				1476	*
				1477	* The caller must hold the RCU read lock.
				1478	*/
				1479	struct inode find_inode_rcu(struct super_block sb, unsigned long hashval,
				1480	int (test)(struct inode , void ), void data)
				1481	{
				1482	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1483	struct inode *inode;
				1484
				1485	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
				1486	"suspicious find_inode_rcu() usage");
				1487
				1488	hlist_for_each_entry_rcu(inode, head, i_hash) {
				1489	if (inode->i_sb == sb &&
				1490	!(READ_ONCE(inode->i_state) & (I_FREEING \| I_WILL_FREE)) &&
				1491	test(inode, data))
				1492	return inode;
				1493	}
				1494	return NULL;
				1495	}
				1496	EXPORT_SYMBOL(find_inode_rcu);
				1497
				1498	/**
				1499	* find_inode_by_rcu - Find an inode in the inode cache
				1500	* @sb: Super block of file system to search
				1501	* @ino: The inode number to match
				1502	*
				1503	* Search for the inode specified by @hashval and @data in the inode cache,
				1504	* where the helper function @test will return 0 if the inode does not match
				1505	* and 1 if it does. The @test function must be responsible for taking the
				1506	* i_lock spin_lock and checking i_state for an inode being freed or being
				1507	* initialized.
				1508	*
				1509	* If successful, this will return the inode for which the @test function
				1510	* returned 1 and NULL otherwise.
				1511	*
				1512	* The @test function is not permitted to take a ref on any inode presented.
				1513	* It is also not permitted to sleep.
				1514	*
				1515	* The caller must hold the RCU read lock.
				1516	*/
				1517	struct inode find_inode_by_ino_rcu(struct super_block sb,
				1518	unsigned long ino)
				1519	{
				1520	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1521	struct inode *inode;
				1522
				1523	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
				1524	"suspicious find_inode_by_ino_rcu() usage");
				1525
				1526	hlist_for_each_entry_rcu(inode, head, i_hash) {
				1527	if (inode->i_ino == ino &&
				1528	inode->i_sb == sb &&
				1529	!(READ_ONCE(inode->i_state) & (I_FREEING \| I_WILL_FREE)))
				1530	return inode;
				1531	}
				1532	return NULL;
				1533	}
				1534	EXPORT_SYMBOL(find_inode_by_ino_rcu);
				1535
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1536	int insert_inode_locked(struct inode *inode)
				1537	{
				1538	struct super_block *sb = inode->i_sb;
				1539	ino_t ino = inode->i_ino;
				1540	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1541
				1542	while (1) {
				1543	struct inode *old = NULL;
				1544	spin_lock(&inode_hash_lock);
				1545	hlist_for_each_entry(old, head, i_hash) {
				1546	if (old->i_ino != ino)
				1547	continue;
				1548	if (old->i_sb != sb)
				1549	continue;
				1550	spin_lock(&old->i_lock);
				1551	if (old->i_state & (I_FREEING\|I_WILL_FREE)) {
				1552	spin_unlock(&old->i_lock);
				1553	continue;
				1554	}
				1555	break;
				1556	}
				1557	if (likely(!old)) {
				1558	spin_lock(&inode->i_lock);
				1559	inode->i_state \|= I_NEW \| I_CREATING;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1560	hlist_add_head_rcu(&inode->i_hash, head);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1561	spin_unlock(&inode->i_lock);
				1562	spin_unlock(&inode_hash_lock);
				1563	return 0;
				1564	}
				1565	if (unlikely(old->i_state & I_CREATING)) {
				1566	spin_unlock(&old->i_lock);
				1567	spin_unlock(&inode_hash_lock);
				1568	return -EBUSY;
				1569	}
				1570	__iget(old);
				1571	spin_unlock(&old->i_lock);
				1572	spin_unlock(&inode_hash_lock);
				1573	wait_on_inode(old);
				1574	if (unlikely(!inode_unhashed(old))) {
				1575	iput(old);
				1576	return -EBUSY;
				1577	}
				1578	iput(old);
				1579	}
				1580	}
				1581	EXPORT_SYMBOL(insert_inode_locked);
				1582
				1583	int insert_inode_locked4(struct inode *inode, unsigned long hashval,
				1584	int (test)(struct inode , void ), void data)
				1585	{
				1586	struct inode *old;
				1587
				1588	inode->i_state \|= I_CREATING;
				1589	old = inode_insert5(inode, hashval, test, NULL, data);
				1590
				1591	if (old != inode) {
				1592	iput(old);
				1593	return -EBUSY;
				1594	}
				1595	return 0;
				1596	}
				1597	EXPORT_SYMBOL(insert_inode_locked4);
				1598
				1599
				1600	int generic_delete_inode(struct inode *inode)
				1601	{
				1602	return 1;
				1603	}
				1604	EXPORT_SYMBOL(generic_delete_inode);
				1605
				1606	/*
				1607	* Called when we're dropping the last reference
				1608	* to an inode.
				1609	*
				1610	* Call the FS "drop_inode()" function, defaulting to
				1611	* the legacy UNIX filesystem behaviour. If it tells
				1612	* us to evict inode, do so. Otherwise, retain inode
				1613	* in cache if fs is alive, sync and evict if fs is
				1614	* shutting down.
				1615	*/
				1616	static void iput_final(struct inode *inode)
				1617	{
				1618	struct super_block *sb = inode->i_sb;
				1619	const struct super_operations *op = inode->i_sb->s_op;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1620	unsigned long state;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1621	int drop;
				1622
				1623	WARN_ON(inode->i_state & I_NEW);
				1624
				1625	if (op->drop_inode)
				1626	drop = op->drop_inode(inode);
				1627	else
				1628	drop = generic_drop_inode(inode);
				1629
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1630	if (!drop &&
				1631	!(inode->i_state & I_DONTCACHE) &&
				1632	(sb->s_flags & SB_ACTIVE)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1633	inode_add_lru(inode);
				1634	spin_unlock(&inode->i_lock);
				1635	return;
				1636	}
				1637
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1638	state = inode->i_state;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1639	if (!drop) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1640	WRITE_ONCE(inode->i_state, state \| I_WILL_FREE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1641	spin_unlock(&inode->i_lock);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1642
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1643	write_inode_now(inode, 1);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1644
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1645	spin_lock(&inode->i_lock);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1646	state = inode->i_state;
				1647	WARN_ON(state & I_NEW);
				1648	state &= ~I_WILL_FREE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1649	}
				1650
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1651	WRITE_ONCE(inode->i_state, state \| I_FREEING);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1652	if (!list_empty(&inode->i_lru))
				1653	inode_lru_list_del(inode);
				1654	spin_unlock(&inode->i_lock);
				1655
				1656	evict(inode);
				1657	}
				1658
				1659	/**
				1660	* iput - put an inode
				1661	* @inode: inode to put
				1662	*
				1663	* Puts an inode, dropping its usage count. If the inode use count hits
				1664	* zero, the inode is then freed and may also be destroyed.
				1665	*
				1666	* Consequently, iput() can sleep.
				1667	*/
				1668	void iput(struct inode *inode)
				1669	{
				1670	if (!inode)
				1671	return;
				1672	BUG_ON(inode->i_state & I_CLEAR);
				1673	retry:
				1674	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
				1675	if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
				1676	atomic_inc(&inode->i_count);
				1677	spin_unlock(&inode->i_lock);
				1678	trace_writeback_lazytime_iput(inode);
				1679	mark_inode_dirty_sync(inode);
				1680	goto retry;
				1681	}
				1682	iput_final(inode);
				1683	}
				1684	}
				1685	EXPORT_SYMBOL(iput);
				1686
#ifdef CONFIG_BLOCK
/**
 *	bmap	- find a block number in a file
 *	@inode:  inode owning the block number being requested
 *	@block: pointer containing the block to find
 *
 *	Replaces the value in ``*block`` with the block number on the device
 *	corresponding to the requested block number in the file.
 *	That is, asked for block 4 of inode 1 the function will replace the
 *	4 in ``*block``, with disk block relative to the disk start that holds that
 *	block of the file.
 *
 *	Returns -EINVAL if the inode's address_space operations provide no
 *	bmap method, 0 otherwise. If mapping falls into a
 *	hole, returns 0 and ``*block`` is also set to 0.
 */
int bmap(struct inode *inode, sector_t *block)
{
	if (!inode->i_mapping->a_ops->bmap)
		return -EINVAL;

	/* in/out: read the file block from *block, store the device block */
	*block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block);
	return 0;
}
EXPORT_SYMBOL(bmap);
#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1712
				1713	/*
				1714	* With relative atime, only update atime if the previous atime is
				1715	* earlier than either the ctime or mtime or if at least a day has
				1716	* passed since the last atime update.
				1717	*/
				1718	static int relatime_need_update(struct vfsmount mnt, struct inode inode,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1719	struct timespec64 now)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1720	{
				1721
				1722	if (!(mnt->mnt_flags & MNT_RELATIME))
				1723	return 1;
				1724	/*
				1725	* Is mtime younger than atime? If yes, update atime:
				1726	*/
				1727	if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
				1728	return 1;
				1729	/*
				1730	* Is ctime younger than atime? If yes, update atime:
				1731	*/
				1732	if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
				1733	return 1;
				1734
				1735	/*
				1736	* Is the previous atime value older than a day? If yes,
				1737	* update atime:
				1738	*/
				1739	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 246060)
				1740	return 1;
				1741	/*
				1742	* Good, we can skip the atime update:
				1743	*/
				1744	return 0;
				1745	}
				1746
				1747	int generic_update_time(struct inode inode, struct timespec64 time, int flags)
				1748	{
				1749	int iflags = I_DIRTY_TIME;
				1750	bool dirty = false;
				1751
				1752	if (flags & S_ATIME)
				1753	inode->i_atime = *time;
				1754	if (flags & S_VERSION)
				1755	dirty = inode_maybe_inc_iversion(inode, false);
				1756	if (flags & S_CTIME)
				1757	inode->i_ctime = *time;
				1758	if (flags & S_MTIME)
				1759	inode->i_mtime = *time;
				1760	if ((flags & (S_ATIME \| S_CTIME \| S_MTIME)) &&
				1761	!(inode->i_sb->s_flags & SB_LAZYTIME))
				1762	dirty = true;
				1763
				1764	if (dirty)
				1765	iflags \|= I_DIRTY_SYNC;
				1766	__mark_inode_dirty(inode, iflags);
				1767	return 0;
				1768	}
				1769	EXPORT_SYMBOL(generic_update_time);
				1770
				1771	/*
				1772	* This does the actual work of updating an inodes time or version. Must have
				1773	* had called mnt_want_write() before calling this.
				1774	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1775	int inode_update_time(struct inode inode, struct timespec64 time, int flags)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1776	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1777	if (inode->i_op->update_time)
				1778	return inode->i_op->update_time(inode, time, flags);
				1779	return generic_update_time(inode, time, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1780	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1781	EXPORT_SYMBOL(inode_update_time);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1782
				1783	/**
				1784	* touch_atime - update the access time
				1785	* @path: the &struct path to update
				1786	* @inode: inode to update
				1787	*
				1788	* Update the accessed time on an inode and mark it for writeback.
				1789	* This function automatically handles read only file systems and media,
				1790	* as well as the "noatime" flag and inode specific "noatime" markers.
				1791	*/
				1792	bool atime_needs_update(const struct path path, struct inode inode)
				1793	{
				1794	struct vfsmount *mnt = path->mnt;
				1795	struct timespec64 now;
				1796
				1797	if (inode->i_flags & S_NOATIME)
				1798	return false;
				1799
				1800	/* Atime updates will likely cause i_uid and i_gid to be written
				1801	* back improprely if their true value is unknown to the vfs.
				1802	*/
				1803	if (HAS_UNMAPPED_ID(inode))
				1804	return false;
				1805
				1806	if (IS_NOATIME(inode))
				1807	return false;
				1808	if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
				1809	return false;
				1810
				1811	if (mnt->mnt_flags & MNT_NOATIME)
				1812	return false;
				1813	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
				1814	return false;
				1815
				1816	now = current_time(inode);
				1817
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1818	if (!relatime_need_update(mnt, inode, now))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1819	return false;
				1820
				1821	if (timespec64_equal(&inode->i_atime, &now))
				1822	return false;
				1823
				1824	return true;
				1825	}
				1826
				1827	void touch_atime(const struct path *path)
				1828	{
				1829	struct vfsmount *mnt = path->mnt;
				1830	struct inode *inode = d_inode(path->dentry);
				1831	struct timespec64 now;
				1832
				1833	if (!atime_needs_update(path, inode))
				1834	return;
				1835
				1836	if (!sb_start_write_trylock(inode->i_sb))
				1837	return;
				1838
				1839	if (__mnt_want_write(mnt) != 0)
				1840	goto skip_update;
				1841	/*
				1842	* File systems can error out when updating inodes if they need to
				1843	* allocate new space to modify an inode (such is the case for
				1844	* Btrfs), but since we touch atime while walking down the path we
				1845	* really don't care if we failed to update the atime of the file,
				1846	* so just ignore the return value.
				1847	* We may also fail on filesystems that have the ability to make parts
				1848	* of the fs read only, e.g. subvolumes in Btrfs.
				1849	*/
				1850	now = current_time(inode);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1851	inode_update_time(inode, &now, S_ATIME);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1852	__mnt_drop_write(mnt);
				1853	skip_update:
				1854	sb_end_write(inode->i_sb);
				1855	}
				1856	EXPORT_SYMBOL(touch_atime);
				1857
				1858	/*
				1859	* The logic we want is
				1860	*
				1861	* if suid or (sgid and xgrp)
				1862	* remove privs
				1863	*/
				1864	int should_remove_suid(struct dentry *dentry)
				1865	{
				1866	umode_t mode = d_inode(dentry)->i_mode;
				1867	int kill = 0;
				1868
				1869	/* suid always must be killed */
				1870	if (unlikely(mode & S_ISUID))
				1871	kill = ATTR_KILL_SUID;
				1872
				1873	/*
				1874	* sgid without any exec bits is just a mandatory locking mark; leave
				1875	* it alone. If some exec bits are set, it's a real sgid; kill it.
				1876	*/
				1877	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
				1878	kill \|= ATTR_KILL_SGID;
				1879
				1880	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
				1881	return kill;
				1882
				1883	return 0;
				1884	}
				1885	EXPORT_SYMBOL(should_remove_suid);
				1886
				1887	/*
				1888	* Return mask of changes for notify_change() that need to be done as a
				1889	* response to write or truncate. Return 0 if nothing has to be changed.
				1890	* Negative value on error (change should be denied).
				1891	*/
				1892	int dentry_needs_remove_privs(struct dentry *dentry)
				1893	{
				1894	struct inode *inode = d_inode(dentry);
				1895	int mask = 0;
				1896	int ret;
				1897
				1898	if (IS_NOSEC(inode))
				1899	return 0;
				1900
				1901	mask = should_remove_suid(dentry);
				1902	ret = security_inode_need_killpriv(dentry);
				1903	if (ret < 0)
				1904	return ret;
				1905	if (ret)
				1906	mask \|= ATTR_KILL_PRIV;
				1907	return mask;
				1908	}
				1909
				1910	static int __remove_privs(struct dentry *dentry, int kill)
				1911	{
				1912	struct iattr newattrs;
				1913
				1914	newattrs.ia_valid = ATTR_FORCE \| kill;
				1915	/*
				1916	* Note we call this on write, so notify_change will not
				1917	* encounter any conflicting delegations:
				1918	*/
				1919	return notify_change(dentry, &newattrs, NULL);
				1920	}
				1921
				1922	/*
				1923	* Remove special file priviledges (suid, capabilities) when file is written
				1924	* to or truncated.
				1925	*/
				1926	int file_remove_privs(struct file *file)
				1927	{
				1928	struct dentry *dentry = file_dentry(file);
				1929	struct inode *inode = file_inode(file);
				1930	int kill;
				1931	int error = 0;
				1932
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1933	/*
				1934	* Fast path for nothing security related.
				1935	* As well for non-regular files, e.g. blkdev inodes.
				1936	* For example, blkdev_write_iter() might get here
				1937	* trying to remove privs which it is not allowed to.
				1938	*/
				1939	if (IS_NOSEC(inode) \|\| !S_ISREG(inode->i_mode))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1940	return 0;
				1941
				1942	kill = dentry_needs_remove_privs(dentry);
				1943	if (kill < 0)
				1944	return kill;
				1945	if (kill)
				1946	error = __remove_privs(dentry, kill);
				1947	if (!error)
				1948	inode_has_no_xattr(inode);
				1949
				1950	return error;
				1951	}
				1952	EXPORT_SYMBOL(file_remove_privs);
				1953
				1954	/**
				1955	* file_update_time - update mtime and ctime time
				1956	* @file: file accessed
				1957	*
				1958	* Update the mtime and ctime members of an inode and mark the inode
				1959	* for writeback. Note that this function is meant exclusively for
				1960	* usage in the file write path of filesystems, and filesystems may
				1961	* choose to explicitly ignore update via this function with the
				1962	* S_NOCMTIME inode flag, e.g. for network filesystem where these
				1963	* timestamps are handled by the server. This can return an error for
				1964	* file systems who need to allocate space in order to update an inode.
				1965	*/
				1966
				1967	int file_update_time(struct file *file)
				1968	{
				1969	struct inode *inode = file_inode(file);
				1970	struct timespec64 now;
				1971	int sync_it = 0;
				1972	int ret;
				1973
				1974	/* First try to exhaust all avenues to not sync */
				1975	if (IS_NOCMTIME(inode))
				1976	return 0;
				1977
				1978	now = current_time(inode);
				1979	if (!timespec64_equal(&inode->i_mtime, &now))
				1980	sync_it = S_MTIME;
				1981
				1982	if (!timespec64_equal(&inode->i_ctime, &now))
				1983	sync_it \|= S_CTIME;
				1984
				1985	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
				1986	sync_it \|= S_VERSION;
				1987
				1988	if (!sync_it)
				1989	return 0;
				1990
				1991	/* Finally allowed to write? Takes lock. */
				1992	if (__mnt_want_write_file(file))
				1993	return 0;
				1994
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1995	ret = inode_update_time(inode, &now, sync_it);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1996	__mnt_drop_write_file(file);
				1997
				1998	return ret;
				1999	}
				2000	EXPORT_SYMBOL(file_update_time);
				2001
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2002	/* Caller must hold the file's inode lock */
				2003	int file_modified(struct file *file)
				2004	{
				2005	int err;
				2006
				2007	/*
				2008	* Clear the security bits if the process is not being run by root.
				2009	* This keeps people from modifying setuid and setgid binaries.
				2010	*/
				2011	err = file_remove_privs(file);
				2012	if (err)
				2013	return err;
				2014
				2015	if (unlikely(file->f_mode & FMODE_NOCMTIME))
				2016	return 0;
				2017
				2018	return file_update_time(file);
				2019	}
				2020	EXPORT_SYMBOL(file_modified);
				2021
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2022	int inode_needs_sync(struct inode *inode)
				2023	{
				2024	if (IS_SYNC(inode))
				2025	return 1;
				2026	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
				2027	return 1;
				2028	return 0;
				2029	}
				2030	EXPORT_SYMBOL(inode_needs_sync);
				2031
				2032	/*
				2033	* If we try to find an inode in the inode hash while it is being
				2034	* deleted, we have to wait until the filesystem completes its
				2035	* deletion before reporting that it isn't found. This function waits
				2036	* until the deletion _might_ have completed. Callers are responsible
				2037	* to recheck inode state.
				2038	*
				2039	* It doesn't matter if I_NEW is not set initially, a call to
				2040	* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
				2041	* will DTRT.
				2042	*/
				2043	static void __wait_on_freeing_inode(struct inode *inode)
				2044	{
				2045	wait_queue_head_t *wq;
				2046	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
				2047	wq = bit_waitqueue(&inode->i_state, __I_NEW);
				2048	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				2049	spin_unlock(&inode->i_lock);
				2050	spin_unlock(&inode_hash_lock);
				2051	schedule();
				2052	finish_wait(wq, &wait.wq_entry);
				2053	spin_lock(&inode_hash_lock);
				2054	}
				2055
				2056	static __initdata unsigned long ihash_entries;
				2057	static int __init set_ihash_entries(char *str)
				2058	{
				2059	if (!str)
				2060	return 0;
				2061	ihash_entries = simple_strtoul(str, &str, 0);
				2062	return 1;
				2063	}
				2064	__setup("ihash_entries=", set_ihash_entries);
				2065
				2066	/*
				2067	* Initialize the waitqueues and inode hash table.
				2068	*/
				2069	void __init inode_init_early(void)
				2070	{
				2071	/* If hashes are distributed across NUMA nodes, defer
				2072	* hash allocation until vmalloc space is available.
				2073	*/
				2074	if (hashdist)
				2075	return;
				2076
				2077	inode_hashtable =
				2078	alloc_large_system_hash("Inode-cache",
				2079	sizeof(struct hlist_head),
				2080	ihash_entries,
				2081	14,
				2082	HASH_EARLY \| HASH_ZERO,
				2083	&i_hash_shift,
				2084	&i_hash_mask,
				2085	0,
				2086	0);
				2087	}
				2088
				2089	void __init inode_init(void)
				2090	{
				2091	/* inode slab cache */
				2092	inode_cachep = kmem_cache_create("inode_cache",
				2093	sizeof(struct inode),
				2094	0,
				2095	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				2096	SLAB_MEM_SPREAD\|SLAB_ACCOUNT),
				2097	init_once);
				2098
				2099	/* Hash may have been set up in inode_init_early */
				2100	if (!hashdist)
				2101	return;
				2102
				2103	inode_hashtable =
				2104	alloc_large_system_hash("Inode-cache",
				2105	sizeof(struct hlist_head),
				2106	ihash_entries,
				2107	14,
				2108	HASH_ZERO,
				2109	&i_hash_shift,
				2110	&i_hash_mask,
				2111	0,
				2112	0);
				2113	}
				2114
				2115	void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
				2116	{
				2117	inode->i_mode = mode;
				2118	if (S_ISCHR(mode)) {
				2119	inode->i_fop = &def_chr_fops;
				2120	inode->i_rdev = rdev;
				2121	} else if (S_ISBLK(mode)) {
				2122	inode->i_fop = &def_blk_fops;
				2123	inode->i_rdev = rdev;
				2124	} else if (S_ISFIFO(mode))
				2125	inode->i_fop = &pipefifo_fops;
				2126	else if (S_ISSOCK(mode))
				2127	; /* leave it no_open_fops */
				2128	else
				2129	printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
				2130	" inode %s:%lu\n", mode, inode->i_sb->s_id,
				2131	inode->i_ino);
				2132	}
				2133	EXPORT_SYMBOL(init_special_inode);
				2134
				2135	/**
				2136	* inode_init_owner - Init uid,gid,mode for new inode according to posix standards
				2137	* @inode: New inode
				2138	* @dir: Directory inode
				2139	* @mode: mode of the new inode
				2140	*/
				2141	void inode_init_owner(struct inode inode, const struct inode dir,
				2142	umode_t mode)
				2143	{
				2144	inode->i_uid = current_fsuid();
				2145	if (dir && dir->i_mode & S_ISGID) {
				2146	inode->i_gid = dir->i_gid;
				2147
				2148	/* Directories are special, and always inherit S_ISGID */
				2149	if (S_ISDIR(mode))
				2150	mode \|= S_ISGID;
				2151	else if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP) &&
				2152	!in_group_p(inode->i_gid) &&
				2153	!capable_wrt_inode_uidgid(dir, CAP_FSETID))
				2154	mode &= ~S_ISGID;
				2155	} else
				2156	inode->i_gid = current_fsgid();
				2157	inode->i_mode = mode;
				2158	}
				2159	EXPORT_SYMBOL(inode_init_owner);
				2160
				2161	/**
				2162	* inode_owner_or_capable - check current task permissions to inode
				2163	* @inode: inode being checked
				2164	*
				2165	* Return true if current either has CAP_FOWNER in a namespace with the
				2166	* inode owner uid mapped, or owns the file.
				2167	*/
				2168	bool inode_owner_or_capable(const struct inode *inode)
				2169	{
				2170	struct user_namespace *ns;
				2171
				2172	if (uid_eq(current_fsuid(), inode->i_uid))
				2173	return true;
				2174
				2175	ns = current_user_ns();
				2176	if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
				2177	return true;
				2178	return false;
				2179	}
				2180	EXPORT_SYMBOL(inode_owner_or_capable);
				2181
				2182	/*
				2183	* Direct i/o helper functions
				2184	*/
				2185	static void __inode_dio_wait(struct inode *inode)
				2186	{
				2187	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
				2188	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
				2189
				2190	do {
				2191	prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
				2192	if (atomic_read(&inode->i_dio_count))
				2193	schedule();
				2194	} while (atomic_read(&inode->i_dio_count));
				2195	finish_wait(wq, &q.wq_entry);
				2196	}
				2197
				2198	/**
				2199	* inode_dio_wait - wait for outstanding DIO requests to finish
				2200	* @inode: inode to wait for
				2201	*
				2202	* Waits for all pending direct I/O requests to finish so that we can
				2203	* proceed with a truncate or equivalent operation.
				2204	*
				2205	* Must be called under a lock that serializes taking new references
				2206	* to i_dio_count, usually by inode->i_mutex.
				2207	*/
				2208	void inode_dio_wait(struct inode *inode)
				2209	{
				2210	if (atomic_read(&inode->i_dio_count))
				2211	__inode_dio_wait(inode);
				2212	}
				2213	EXPORT_SYMBOL(inode_dio_wait);
				2214
				2215	/*
				2216	* inode_set_flags - atomically set some inode flags
				2217	*
				2218	* Note: the caller should be holding i_mutex, or else be sure that
				2219	* they have exclusive access to the inode structure (i.e., while the
				2220	* inode is being instantiated). The reason for the cmpxchg() loop
				2221	* --- which wouldn't be necessary if all code paths which modify
				2222	* i_flags actually followed this rule, is that there is at least one
				2223	* code path which doesn't today so we use cmpxchg() out of an abundance
				2224	* of caution.
				2225	*
				2226	* In the long run, i_mutex is overkill, and we should probably look
				2227	* at using the i_lock spinlock to protect i_flags, and then make sure
				2228	* it is so documented in include/linux/fs.h and that all code follows
				2229	* the locking convention!!
				2230	*/
				2231	void inode_set_flags(struct inode *inode, unsigned int flags,
				2232	unsigned int mask)
				2233	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2234	WARN_ON_ONCE(flags & ~mask);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2235	set_mask_bits(&inode->i_flags, mask, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2236	}
				2237	EXPORT_SYMBOL(inode_set_flags);
				2238
				2239	void inode_nohighmem(struct inode *inode)
				2240	{
				2241	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
				2242	}
				2243	EXPORT_SYMBOL(inode_nohighmem);
				2244
				2245	/**
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2246	* timestamp_truncate - Truncate timespec to a granularity
				2247	* @t: Timespec
				2248	* @inode: inode being updated
				2249	*
				2250	* Truncate a timespec to the granularity supported by the fs
				2251	* containing the inode. Always rounds down. gran must
				2252	* not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
				2253	*/
				2254	struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
				2255	{
				2256	struct super_block *sb = inode->i_sb;
				2257	unsigned int gran = sb->s_time_gran;
				2258
				2259	t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
				2260	if (unlikely(t.tv_sec == sb->s_time_max \|\| t.tv_sec == sb->s_time_min))
				2261	t.tv_nsec = 0;
				2262
				2263	/* Avoid division in the common cases 1 ns and 1 s. */
				2264	if (gran == 1)
				2265	; /* nothing */
				2266	else if (gran == NSEC_PER_SEC)
				2267	t.tv_nsec = 0;
				2268	else if (gran > 1 && gran < NSEC_PER_SEC)
				2269	t.tv_nsec -= t.tv_nsec % gran;
				2270	else
				2271	WARN(1, "invalid file time granularity: %u", gran);
				2272	return t;
				2273	}
				2274	EXPORT_SYMBOL(timestamp_truncate);
				2275
				2276	/**
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2277	* current_time - Return FS time
				2278	* @inode: inode.
				2279	*
				2280	* Return the current time truncated to the time granularity supported by
				2281	* the fs.
				2282	*
				2283	* Note that inode and inode->sb cannot be NULL.
				2284	* Otherwise, the function warns and returns time without truncation.
				2285	*/
				2286	struct timespec64 current_time(struct inode *inode)
				2287	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2288	struct timespec64 now;
				2289
				2290	ktime_get_coarse_real_ts64(&now);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2291
				2292	if (unlikely(!inode->i_sb)) {
				2293	WARN(1, "current_time() called with uninitialized super_block in the inode");
				2294	return now;
				2295	}
				2296
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2297	return timestamp_truncate(now, inode);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2298	}
				2299	EXPORT_SYMBOL(current_time);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2300
				2301	/*
				2302	* Generic function to check FS_IOC_SETFLAGS values and reject any invalid
				2303	* configurations.
				2304	*
				2305	* Note: the caller should be holding i_mutex, or else be sure that they have
				2306	* exclusive access to the inode structure.
				2307	*/
				2308	int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
				2309	unsigned int flags)
				2310	{
				2311	/*
				2312	* The IMMUTABLE and APPEND_ONLY flags can only be changed by
				2313	* the relevant capability.
				2314	*
				2315	* This test looks nicer. Thanks to Pauline Middelink
				2316	*/
				2317	if ((flags ^ oldflags) & (FS_APPEND_FL \| FS_IMMUTABLE_FL) &&
				2318	!capable(CAP_LINUX_IMMUTABLE))
				2319	return -EPERM;
				2320
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2321	return fscrypt_prepare_setflags(inode, oldflags, flags);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2322	}
				2323	EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
				2324
				2325	/*
				2326	* Generic function to check FS_IOC_FSSETXATTR values and reject any invalid
				2327	* configurations.
				2328	*
				2329	* Note: the caller should be holding i_mutex, or else be sure that they have
				2330	* exclusive access to the inode structure.
				2331	*/
				2332	int vfs_ioc_fssetxattr_check(struct inode inode, const struct fsxattr old_fa,
				2333	struct fsxattr *fa)
				2334	{
				2335	/*
				2336	* Can't modify an immutable/append-only file unless we have
				2337	* appropriate permission.
				2338	*/
				2339	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
				2340	(FS_XFLAG_IMMUTABLE \| FS_XFLAG_APPEND) &&
				2341	!capable(CAP_LINUX_IMMUTABLE))
				2342	return -EPERM;
				2343
				2344	/*
				2345	* Project Quota ID state is only allowed to change from within the init
				2346	* namespace. Enforce that restriction only if we are trying to change
				2347	* the quota ID state. Everything else is allowed in user namespaces.
				2348	*/
				2349	if (current_user_ns() != &init_user_ns) {
				2350	if (old_fa->fsx_projid != fa->fsx_projid)
				2351	return -EINVAL;
				2352	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
				2353	FS_XFLAG_PROJINHERIT)
				2354	return -EINVAL;
				2355	}
				2356
				2357	/* Check extent size hints. */
				2358	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
				2359	return -EINVAL;
				2360
				2361	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
				2362	!S_ISDIR(inode->i_mode))
				2363	return -EINVAL;
				2364
				2365	if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
				2366	!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
				2367	return -EINVAL;
				2368
				2369	/*
				2370	* It is only valid to set the DAX flag on regular files and
				2371	* directories on filesystems.
				2372	*/
				2373	if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
				2374	!(S_ISREG(inode->i_mode) \|\| S_ISDIR(inode->i_mode)))
				2375	return -EINVAL;
				2376
				2377	/* Extent size hints of zero turn off the flags. */
				2378	if (fa->fsx_extsize == 0)
				2379	fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE \| FS_XFLAG_EXTSZINHERIT);
				2380	if (fa->fsx_cowextsize == 0)
				2381	fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
				2382
				2383	return 0;
				2384	}
				2385	EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);