Blame - fs/xfs/xfs_file.c - hafnium/third_party/linux

blob: 5b0f93f738372dfde612437cb9fe38f78aaadbf0 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
				4	* All Rights Reserved.
				5	*/
				6	#include "xfs.h"
				7	#include "xfs_fs.h"
				8	#include "xfs_shared.h"
				9	#include "xfs_format.h"
				10	#include "xfs_log_format.h"
				11	#include "xfs_trans_resv.h"
				12	#include "xfs_mount.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	13	#include "xfs_inode.h"
				14	#include "xfs_trans.h"
				15	#include "xfs_inode_item.h"
				16	#include "xfs_bmap.h"
				17	#include "xfs_bmap_util.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	18	#include "xfs_dir2.h"
				19	#include "xfs_dir2_priv.h"
				20	#include "xfs_ioctl.h"
				21	#include "xfs_trace.h"
				22	#include "xfs_log.h"
				23	#include "xfs_icache.h"
				24	#include "xfs_pnfs.h"
				25	#include "xfs_iomap.h"
				26	#include "xfs_reflink.h"
				27
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	28	#include <linux/falloc.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	29	#include <linux/backing-dev.h>
				30	#include <linux/mman.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	31	#include <linux/fadvise.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	32
				33	static const struct vm_operations_struct xfs_file_vm_ops;
				34
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	35	/*
				36	* Decide if the given file range is aligned to the size of the fundamental
				37	* allocation unit for the file.
				38	*/
				39	static bool
				40	xfs_is_falloc_aligned(
				41	struct xfs_inode *ip,
				42	loff_t pos,
				43	long long int len)
				44	{
				45	struct xfs_mount *mp = ip->i_mount;
				46	uint64_t mask;
				47
				48	if (XFS_IS_REALTIME_INODE(ip)) {
				49	if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
				50	u64 rextbytes;
				51	u32 mod;
				52
				53	rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
				54	div_u64_rem(pos, rextbytes, &mod);
				55	if (mod)
				56	return false;
				57	div_u64_rem(len, rextbytes, &mod);
				58	return mod == 0;
				59	}
				60	mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
				61	} else {
				62	mask = mp->m_sb.sb_blocksize - 1;
				63	}
				64
				65	return !((pos \| len) & mask);
				66	}
				67
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	68	int
				69	xfs_update_prealloc_flags(
				70	struct xfs_inode *ip,
				71	enum xfs_prealloc_flags flags)
				72	{
				73	struct xfs_trans *tp;
				74	int error;
				75
				76	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
				77	0, 0, 0, &tp);
				78	if (error)
				79	return error;
				80
				81	xfs_ilock(ip, XFS_ILOCK_EXCL);
				82	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
				83
				84	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
				85	VFS_I(ip)->i_mode &= ~S_ISUID;
				86	if (VFS_I(ip)->i_mode & S_IXGRP)
				87	VFS_I(ip)->i_mode &= ~S_ISGID;
				88	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				89	}
				90
				91	if (flags & XFS_PREALLOC_SET)
				92	ip->i_d.di_flags \|= XFS_DIFLAG_PREALLOC;
				93	if (flags & XFS_PREALLOC_CLEAR)
				94	ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
				95
				96	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				97	if (flags & XFS_PREALLOC_SYNC)
				98	xfs_trans_set_sync(tp);
				99	return xfs_trans_commit(tp);
				100	}
				101
				102	/*
				103	* Fsync operations on directories are much simpler than on regular files,
				104	* as there is no file data to flush, and thus also no need for explicit
				105	* cache flush operations, and there are no non-transaction metadata updates
				106	* on directories either.
				107	*/
				108	STATIC int
				109	xfs_dir_fsync(
				110	struct file *file,
				111	loff_t start,
				112	loff_t end,
				113	int datasync)
				114	{
				115	struct xfs_inode *ip = XFS_I(file->f_mapping->host);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	116
				117	trace_xfs_dir_fsync(ip);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	118	return xfs_log_force_inode(ip);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	119	}
				120
				121	STATIC int
				122	xfs_file_fsync(
				123	struct file *file,
				124	loff_t start,
				125	loff_t end,
				126	int datasync)
				127	{
				128	struct inode *inode = file->f_mapping->host;
				129	struct xfs_inode *ip = XFS_I(inode);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	130	struct xfs_inode_log_item *iip = ip->i_itemp;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	131	struct xfs_mount *mp = ip->i_mount;
				132	int error = 0;
				133	int log_flushed = 0;
				134	xfs_lsn_t lsn = 0;
				135
				136	trace_xfs_file_fsync(ip);
				137
				138	error = file_write_and_wait_range(file, start, end);
				139	if (error)
				140	return error;
				141
				142	if (XFS_FORCED_SHUTDOWN(mp))
				143	return -EIO;
				144
				145	xfs_iflags_clear(ip, XFS_ITRUNCATED);
				146
				147	/*
				148	* If we have an RT and/or log subvolume we need to make sure to flush
				149	* the write cache the device used for file data first. This is to
				150	* ensure newly written file data make it to disk before logging the new
				151	* inode size in case of an extending write.
				152	*/
				153	if (XFS_IS_REALTIME_INODE(ip))
				154	xfs_blkdev_issue_flush(mp->m_rtdev_targp);
				155	else if (mp->m_logdev_targp != mp->m_ddev_targp)
				156	xfs_blkdev_issue_flush(mp->m_ddev_targp);
				157
				158	/*
				159	* All metadata updates are logged, which means that we just have to
				160	* flush the log up to the latest LSN that touched the inode. If we have
				161	* concurrent fsync/fdatasync() calls, we need them to all block on the
				162	* log force before we clear the ili_fsync_fields field. This ensures
				163	* that we don't get a racing sync operation that does not wait for the
				164	* metadata to hit the journal before returning. If we race with
				165	* clearing the ili_fsync_fields, then all that will happen is the log
				166	* force will do nothing as the lsn will already be on disk. We can't
				167	* race with setting ili_fsync_fields because that is done under
				168	* XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
				169	* until after the ili_fsync_fields is cleared.
				170	*/
				171	xfs_ilock(ip, XFS_ILOCK_SHARED);
				172	if (xfs_ipincount(ip)) {
				173	if (!datasync \|\|
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	174	(iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
				175	lsn = iip->ili_last_lsn;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	176	}
				177
				178	if (lsn) {
				179	error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	180	spin_lock(&iip->ili_lock);
				181	iip->ili_fsync_fields = 0;
				182	spin_unlock(&iip->ili_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	183	}
				184	xfs_iunlock(ip, XFS_ILOCK_SHARED);
				185
				186	/*
				187	* If we only have a single device, and the log force about was
				188	* a no-op we might have to flush the data device cache here.
				189	* This can only happen for fdatasync/O_DSYNC if we were overwriting
				190	* an already allocated file and thus do not have any metadata to
				191	* commit.
				192	*/
				193	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
				194	mp->m_logdev_targp == mp->m_ddev_targp)
				195	xfs_blkdev_issue_flush(mp->m_ddev_targp);
				196
				197	return error;
				198	}
				199
				200	STATIC ssize_t
				201	xfs_file_dio_aio_read(
				202	struct kiocb *iocb,
				203	struct iov_iter *to)
				204	{
				205	struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
				206	size_t count = iov_iter_count(to);
				207	ssize_t ret;
				208
				209	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
				210
				211	if (!count)
				212	return 0; /* skip atime */
				213
				214	file_accessed(iocb->ki_filp);
				215
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	216	if (iocb->ki_flags & IOCB_NOWAIT) {
				217	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
				218	return -EAGAIN;
				219	} else {
				220	xfs_ilock(ip, XFS_IOLOCK_SHARED);
				221	}
				222	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
				223	is_sync_kiocb(iocb));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	224	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				225
				226	return ret;
				227	}
				228
				229	static noinline ssize_t
				230	xfs_file_dax_read(
				231	struct kiocb *iocb,
				232	struct iov_iter *to)
				233	{
				234	struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
				235	size_t count = iov_iter_count(to);
				236	ssize_t ret = 0;
				237
				238	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
				239
				240	if (!count)
				241	return 0; /* skip atime */
				242
				243	if (iocb->ki_flags & IOCB_NOWAIT) {
				244	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
				245	return -EAGAIN;
				246	} else {
				247	xfs_ilock(ip, XFS_IOLOCK_SHARED);
				248	}
				249
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	250	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	251	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				252
				253	file_accessed(iocb->ki_filp);
				254	return ret;
				255	}
				256
				257	STATIC ssize_t
				258	xfs_file_buffered_aio_read(
				259	struct kiocb *iocb,
				260	struct iov_iter *to)
				261	{
				262	struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
				263	ssize_t ret;
				264
				265	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
				266
				267	if (iocb->ki_flags & IOCB_NOWAIT) {
				268	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
				269	return -EAGAIN;
				270	} else {
				271	xfs_ilock(ip, XFS_IOLOCK_SHARED);
				272	}
				273	ret = generic_file_read_iter(iocb, to);
				274	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				275
				276	return ret;
				277	}
				278
				279	STATIC ssize_t
				280	xfs_file_read_iter(
				281	struct kiocb *iocb,
				282	struct iov_iter *to)
				283	{
				284	struct inode *inode = file_inode(iocb->ki_filp);
				285	struct xfs_mount *mp = XFS_I(inode)->i_mount;
				286	ssize_t ret = 0;
				287
				288	XFS_STATS_INC(mp, xs_read_calls);
				289
				290	if (XFS_FORCED_SHUTDOWN(mp))
				291	return -EIO;
				292
				293	if (IS_DAX(inode))
				294	ret = xfs_file_dax_read(iocb, to);
				295	else if (iocb->ki_flags & IOCB_DIRECT)
				296	ret = xfs_file_dio_aio_read(iocb, to);
				297	else
				298	ret = xfs_file_buffered_aio_read(iocb, to);
				299
				300	if (ret > 0)
				301	XFS_STATS_ADD(mp, xs_read_bytes, ret);
				302	return ret;
				303	}
				304
				305	/*
				306	* Common pre-write limit and setup checks.
				307	*
				308	* Called with the iolocked held either shared and exclusive according to
				309	* @iolock, and returns with it held. Might upgrade the iolock to exclusive
				310	* if called for a direct write beyond i_size.
				311	*/
				312	STATIC ssize_t
				313	xfs_file_aio_write_checks(
				314	struct kiocb *iocb,
				315	struct iov_iter *from,
				316	int *iolock)
				317	{
				318	struct file *file = iocb->ki_filp;
				319	struct inode *inode = file->f_mapping->host;
				320	struct xfs_inode *ip = XFS_I(inode);
				321	ssize_t error = 0;
				322	size_t count = iov_iter_count(from);
				323	bool drained_dio = false;
				324	loff_t isize;
				325
				326	restart:
				327	error = generic_write_checks(iocb, from);
				328	if (error <= 0)
				329	return error;
				330
				331	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
				332	if (error)
				333	return error;
				334
				335	/*
				336	* For changing security info in file_remove_privs() we need i_rwsem
				337	* exclusively.
				338	*/
				339	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
				340	xfs_iunlock(ip, *iolock);
				341	*iolock = XFS_IOLOCK_EXCL;
				342	xfs_ilock(ip, *iolock);
				343	goto restart;
				344	}
				345	/*
				346	* If the offset is beyond the size of the file, we need to zero any
				347	* blocks that fall between the existing EOF and the start of this
				348	* write. If zeroing is needed and we are currently holding the
				349	* iolock shared, we need to update it to exclusive which implies
				350	* having to redo all checks before.
				351	*
				352	* We need to serialise against EOF updates that occur in IO
				353	* completions here. We want to make sure that nobody is changing the
				354	* size while we do this check until we have placed an IO barrier (i.e.
				355	* hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
				356	* The spinlock effectively forms a memory barrier once we have the
				357	* XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
				358	* and hence be able to correctly determine if we need to run zeroing.
				359	*/
				360	spin_lock(&ip->i_flags_lock);
				361	isize = i_size_read(inode);
				362	if (iocb->ki_pos > isize) {
				363	spin_unlock(&ip->i_flags_lock);
				364	if (!drained_dio) {
				365	if (*iolock == XFS_IOLOCK_SHARED) {
				366	xfs_iunlock(ip, *iolock);
				367	*iolock = XFS_IOLOCK_EXCL;
				368	xfs_ilock(ip, *iolock);
				369	iov_iter_reexpand(from, count);
				370	}
				371	/*
				372	* We now have an IO submission barrier in place, but
				373	* AIO can do EOF updates during IO completion and hence
				374	* we now need to wait for all of them to drain. Non-AIO
				375	* DIO will have drained before we are given the
				376	* XFS_IOLOCK_EXCL, and so for most cases this wait is a
				377	* no-op.
				378	*/
				379	inode_dio_wait(inode);
				380	drained_dio = true;
				381	goto restart;
				382	}
				383
				384	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
				385	error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	386	NULL, &xfs_buffered_write_iomap_ops);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	387	if (error)
				388	return error;
				389	} else
				390	spin_unlock(&ip->i_flags_lock);
				391
				392	/*
				393	* Updating the timestamps will grab the ilock again from
				394	* xfs_fs_dirty_inode, so we have to call it after dropping the
				395	* lock above. Eventually we should look into a way to avoid
				396	* the pointless lock roundtrip.
				397	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	398	return file_modified(file);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	399	}
				400
				401	static int
				402	xfs_dio_write_end_io(
				403	struct kiocb *iocb,
				404	ssize_t size,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	405	int error,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	406	unsigned flags)
				407	{
				408	struct inode *inode = file_inode(iocb->ki_filp);
				409	struct xfs_inode *ip = XFS_I(inode);
				410	loff_t offset = iocb->ki_pos;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	411	unsigned int nofs_flag;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	412
				413	trace_xfs_end_io_direct_write(ip, offset, size);
				414
				415	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
				416	return -EIO;
				417
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	418	if (error)
				419	return error;
				420	if (!size)
				421	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	422
				423	/*
				424	* Capture amount written on completion as we can't reliably account
				425	* for it on submission.
				426	*/
				427	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
				428
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	429	/*
				430	* We can allocate memory here while doing writeback on behalf of
				431	* memory reclaim. To avoid memory allocation deadlocks set the
				432	* task-wide nofs context for the following operations.
				433	*/
				434	nofs_flag = memalloc_nofs_save();
				435
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	436	if (flags & IOMAP_DIO_COW) {
				437	error = xfs_reflink_end_cow(ip, offset, size);
				438	if (error)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	439	goto out;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	440	}
				441
				442	/*
				443	* Unwritten conversion updates the in-core isize after extent
				444	* conversion but before updating the on-disk size. Updating isize any
				445	* earlier allows a racing dio read to find unwritten extents before
				446	* they are converted.
				447	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	448	if (flags & IOMAP_DIO_UNWRITTEN) {
				449	error = xfs_iomap_write_unwritten(ip, offset, size, true);
				450	goto out;
				451	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	452
				453	/*
				454	* We need to update the in-core inode size here so that we don't end up
				455	* with the on-disk inode size being outside the in-core inode size. We
				456	* have no other method of updating EOF for AIO, so always do it here
				457	* if necessary.
				458	*
				459	* We need to lock the test/set EOF update as we can be racing with
				460	* other IO completions here to update the EOF. Failing to serialise
				461	* here can result in EOF moving backwards and Bad Things Happen when
				462	* that occurs.
				463	*/
				464	spin_lock(&ip->i_flags_lock);
				465	if (offset + size > i_size_read(inode)) {
				466	i_size_write(inode, offset + size);
				467	spin_unlock(&ip->i_flags_lock);
				468	error = xfs_setfilesize(ip, offset, size);
				469	} else {
				470	spin_unlock(&ip->i_flags_lock);
				471	}
				472
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	473	out:
				474	memalloc_nofs_restore(nofs_flag);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	475	return error;
				476	}
				477
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	478	static const struct iomap_dio_ops xfs_dio_write_ops = {
				479	.end_io = xfs_dio_write_end_io,
				480	};
				481
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	482	/*
				483	* xfs_file_dio_aio_write - handle direct IO writes
				484	*
				485	* Lock the inode appropriately to prepare for and issue a direct IO write.
				486	* By separating it from the buffered write path we remove all the tricky to
				487	* follow locking changes and looping.
				488	*
				489	* If there are cached pages or we're extending the file, we need IOLOCK_EXCL
				490	* until we're sure the bytes at the new EOF have been zeroed and/or the cached
				491	* pages are flushed out.
				492	*
				493	* In most cases the direct IO writes will be done holding IOLOCK_SHARED
				494	* allowing them to be done in parallel with reads and other direct IO writes.
				495	* However, if the IO is not aligned to filesystem blocks, the direct IO layer
				496	* needs to do sub-block zeroing and that requires serialisation against other
				497	* direct IOs to the same block. In this case we need to serialise the
				498	* submission of the unaligned IOs so that we don't get racing block zeroing in
				499	* the dio layer. To avoid the problem with aio, we also need to wait for
				500	* outstanding IOs to complete so that unwritten extent conversion is completed
				501	* before we try to map the overlapping block. This is currently implemented by
				502	* hitting it with a big hammer (i.e. inode_dio_wait()).
				503	*
				504	* Returns with locks held indicated by @iolock and errors indicated by
				505	* negative return values.
				506	*/
				507	STATIC ssize_t
				508	xfs_file_dio_aio_write(
				509	struct kiocb *iocb,
				510	struct iov_iter *from)
				511	{
				512	struct file *file = iocb->ki_filp;
				513	struct address_space *mapping = file->f_mapping;
				514	struct inode *inode = mapping->host;
				515	struct xfs_inode *ip = XFS_I(inode);
				516	struct xfs_mount *mp = ip->i_mount;
				517	ssize_t ret = 0;
				518	int unaligned_io = 0;
				519	int iolock;
				520	size_t count = iov_iter_count(from);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	521	struct xfs_buftarg *target = xfs_inode_buftarg(ip);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	522
				523	/* DIO must be aligned to device logical sector size */
				524	if ((iocb->ki_pos \| count) & target->bt_logical_sectormask)
				525	return -EINVAL;
				526
				527	/*
				528	* Don't take the exclusive iolock here unless the I/O is unaligned to
				529	* the file system block size. We don't need to consider the EOF
				530	* extension case here because xfs_file_aio_write_checks() will relock
				531	* the inode as necessary for EOF zeroing cases and fill out the new
				532	* inode size as appropriate.
				533	*/
				534	if ((iocb->ki_pos & mp->m_blockmask) \|\|
				535	((iocb->ki_pos + count) & mp->m_blockmask)) {
				536	unaligned_io = 1;
				537
				538	/*
				539	* We can't properly handle unaligned direct I/O to reflink
				540	* files yet, as we can't unshare a partial block.
				541	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	542	if (xfs_is_cow_inode(ip)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	543	trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	544	return -ENOTBLK;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	545	}
				546	iolock = XFS_IOLOCK_EXCL;
				547	} else {
				548	iolock = XFS_IOLOCK_SHARED;
				549	}
				550
				551	if (iocb->ki_flags & IOCB_NOWAIT) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	552	/* unaligned dio always waits, bail */
				553	if (unaligned_io)
				554	return -EAGAIN;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	555	if (!xfs_ilock_nowait(ip, iolock))
				556	return -EAGAIN;
				557	} else {
				558	xfs_ilock(ip, iolock);
				559	}
				560
				561	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
				562	if (ret)
				563	goto out;
				564	count = iov_iter_count(from);
				565
				566	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	567	* If we are doing unaligned IO, we can't allow any other overlapping IO
				568	* in-flight at the same time or we risk data corruption. Wait for all
				569	* other IO to drain before we submit. If the IO is aligned, demote the
				570	* iolock if we had to take the exclusive lock in
				571	* xfs_file_aio_write_checks() for other reasons.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	572	*/
				573	if (unaligned_io) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	574	inode_dio_wait(inode);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	575	} else if (iolock == XFS_IOLOCK_EXCL) {
				576	xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
				577	iolock = XFS_IOLOCK_SHARED;
				578	}
				579
				580	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	581	/*
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	582	* If unaligned, this is the only IO in-flight. Wait on it before we
				583	* release the iolock to prevent subsequent overlapping IO.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	584	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	585	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
				586	&xfs_dio_write_ops,
				587	is_sync_kiocb(iocb) \|\| unaligned_io);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	588	out:
				589	xfs_iunlock(ip, iolock);
				590
				591	/*
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	592	* No fallback to buffered IO after short writes for XFS, direct I/O
				593	* will either complete fully or return an error.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	594	*/
				595	ASSERT(ret < 0 \|\| ret == count);
				596	return ret;
				597	}
				598
				599	static noinline ssize_t
				600	xfs_file_dax_write(
				601	struct kiocb *iocb,
				602	struct iov_iter *from)
				603	{
				604	struct inode *inode = iocb->ki_filp->f_mapping->host;
				605	struct xfs_inode *ip = XFS_I(inode);
				606	int iolock = XFS_IOLOCK_EXCL;
				607	ssize_t ret, error = 0;
				608	size_t count;
				609	loff_t pos;
				610
				611	if (iocb->ki_flags & IOCB_NOWAIT) {
				612	if (!xfs_ilock_nowait(ip, iolock))
				613	return -EAGAIN;
				614	} else {
				615	xfs_ilock(ip, iolock);
				616	}
				617
				618	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
				619	if (ret)
				620	goto out;
				621
				622	pos = iocb->ki_pos;
				623	count = iov_iter_count(from);
				624
				625	trace_xfs_file_dax_write(ip, count, pos);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	626	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	627	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
				628	i_size_write(inode, iocb->ki_pos);
				629	error = xfs_setfilesize(ip, pos, ret);
				630	}
				631	out:
				632	xfs_iunlock(ip, iolock);
				633	if (error)
				634	return error;
				635
				636	if (ret > 0) {
				637	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
				638
				639	/* Handle various SYNC-type writes */
				640	ret = generic_write_sync(iocb, ret);
				641	}
				642	return ret;
				643	}
				644
				645	STATIC ssize_t
				646	xfs_file_buffered_aio_write(
				647	struct kiocb *iocb,
				648	struct iov_iter *from)
				649	{
				650	struct file *file = iocb->ki_filp;
				651	struct address_space *mapping = file->f_mapping;
				652	struct inode *inode = mapping->host;
				653	struct xfs_inode *ip = XFS_I(inode);
				654	ssize_t ret;
				655	int enospc = 0;
				656	int iolock;
				657
				658	if (iocb->ki_flags & IOCB_NOWAIT)
				659	return -EOPNOTSUPP;
				660
				661	write_retry:
				662	iolock = XFS_IOLOCK_EXCL;
				663	xfs_ilock(ip, iolock);
				664
				665	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
				666	if (ret)
				667	goto out;
				668
				669	/* We can write back this queue in page reclaim */
				670	current->backing_dev_info = inode_to_bdi(inode);
				671
				672	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	673	ret = iomap_file_buffered_write(iocb, from,
				674	&xfs_buffered_write_iomap_ops);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	675	if (likely(ret >= 0))
				676	iocb->ki_pos += ret;
				677
				678	/*
				679	* If we hit a space limit, try to free up some lingering preallocated
				680	* space before returning an error. In the case of ENOSPC, first try to
				681	* write back all dirty inodes to free up some of the excess reserved
				682	* metadata space. This reduces the chances that the eofblocks scan
				683	* waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
				684	* also behaves as a filter to prevent too many eofblocks scans from
				685	* running at the same time.
				686	*/
				687	if (ret == -EDQUOT && !enospc) {
				688	xfs_iunlock(ip, iolock);
				689	enospc = xfs_inode_free_quota_eofblocks(ip);
				690	if (enospc)
				691	goto write_retry;
				692	enospc = xfs_inode_free_quota_cowblocks(ip);
				693	if (enospc)
				694	goto write_retry;
				695	iolock = 0;
				696	} else if (ret == -ENOSPC && !enospc) {
				697	struct xfs_eofblocks eofb = {0};
				698
				699	enospc = 1;
				700	xfs_flush_inodes(ip->i_mount);
				701
				702	xfs_iunlock(ip, iolock);
				703	eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
				704	xfs_icache_free_eofblocks(ip->i_mount, &eofb);
				705	xfs_icache_free_cowblocks(ip->i_mount, &eofb);
				706	goto write_retry;
				707	}
				708
				709	current->backing_dev_info = NULL;
				710	out:
				711	if (iolock)
				712	xfs_iunlock(ip, iolock);
				713
				714	if (ret > 0) {
				715	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
				716	/* Handle various SYNC-type writes */
				717	ret = generic_write_sync(iocb, ret);
				718	}
				719	return ret;
				720	}
				721
				722	STATIC ssize_t
				723	xfs_file_write_iter(
				724	struct kiocb *iocb,
				725	struct iov_iter *from)
				726	{
				727	struct file *file = iocb->ki_filp;
				728	struct address_space *mapping = file->f_mapping;
				729	struct inode *inode = mapping->host;
				730	struct xfs_inode *ip = XFS_I(inode);
				731	ssize_t ret;
				732	size_t ocount = iov_iter_count(from);
				733
				734	XFS_STATS_INC(ip->i_mount, xs_write_calls);
				735
				736	if (ocount == 0)
				737	return 0;
				738
				739	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
				740	return -EIO;
				741
				742	if (IS_DAX(inode))
				743	return xfs_file_dax_write(iocb, from);
				744
				745	if (iocb->ki_flags & IOCB_DIRECT) {
				746	/*
				747	* Allow a directio write to fall back to a buffered
				748	* write only in the case that we're doing a reflink
				749	* CoW. In all other directio scenarios we do not
				750	* allow an operation to fall back to buffered mode.
				751	*/
				752	ret = xfs_file_dio_aio_write(iocb, from);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	753	if (ret != -ENOTBLK)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	754	return ret;
				755	}
				756
				757	return xfs_file_buffered_aio_write(iocb, from);
				758	}
				759
				760	static void
				761	xfs_wait_dax_page(
				762	struct inode *inode)
				763	{
				764	struct xfs_inode *ip = XFS_I(inode);
				765
				766	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
				767	schedule();
				768	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
				769	}
				770
				771	static int
				772	xfs_break_dax_layouts(
				773	struct inode *inode,
				774	bool *retry)
				775	{
				776	struct page *page;
				777
				778	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
				779
				780	page = dax_layout_busy_page(inode->i_mapping);
				781	if (!page)
				782	return 0;
				783
				784	*retry = true;
				785	return ___wait_var_event(&page->_refcount,
				786	atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
				787	0, 0, xfs_wait_dax_page(inode));
				788	}
				789
				790	int
				791	xfs_break_layouts(
				792	struct inode *inode,
				793	uint *iolock,
				794	enum layout_break_reason reason)
				795	{
				796	bool retry;
				797	int error;
				798
				799	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL));
				800
				801	do {
				802	retry = false;
				803	switch (reason) {
				804	case BREAK_UNMAP:
				805	error = xfs_break_dax_layouts(inode, &retry);
				806	if (error \|\| retry)
				807	break;
				808	/* fall through */
				809	case BREAK_WRITE:
				810	error = xfs_break_leased_layouts(inode, iolock, &retry);
				811	break;
				812	default:
				813	WARN_ON_ONCE(1);
				814	error = -EINVAL;
				815	}
				816	} while (error == 0 && retry);
				817
				818	return error;
				819	}
				820
				821	#define XFS_FALLOC_FL_SUPPORTED \
				822	(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE \| \
				823	FALLOC_FL_COLLAPSE_RANGE \| FALLOC_FL_ZERO_RANGE \| \
				824	FALLOC_FL_INSERT_RANGE \| FALLOC_FL_UNSHARE_RANGE)
				825
				826	STATIC long
				827	xfs_file_fallocate(
				828	struct file *file,
				829	int mode,
				830	loff_t offset,
				831	loff_t len)
				832	{
				833	struct inode *inode = file_inode(file);
				834	struct xfs_inode *ip = XFS_I(inode);
				835	long error;
				836	enum xfs_prealloc_flags flags = 0;
				837	uint iolock = XFS_IOLOCK_EXCL \| XFS_MMAPLOCK_EXCL;
				838	loff_t new_size = 0;
				839	bool do_file_insert = false;
				840
				841	if (!S_ISREG(inode->i_mode))
				842	return -EINVAL;
				843	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
				844	return -EOPNOTSUPP;
				845
				846	xfs_ilock(ip, iolock);
				847	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
				848	if (error)
				849	goto out_unlock;
				850
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	851	/*
				852	* Must wait for all AIO to complete before we continue as AIO can
				853	* change the file size on completion without holding any locks we
				854	* currently hold. We must do this first because AIO can update both
				855	* the on disk and in memory inode sizes, and the operations that follow
				856	* require the in-memory size to be fully up-to-date.
				857	*/
				858	inode_dio_wait(inode);
				859
				860	/*
				861	* Now AIO and DIO has drained we flush and (if necessary) invalidate
				862	* the cached range over the first operation we are about to run.
				863	*
				864	* We care about zero and collapse here because they both run a hole
				865	* punch over the range first. Because that can zero data, and the range
				866	* of invalidation for the shift operations is much larger, we still do
				867	* the required flush for collapse in xfs_prepare_shift().
				868	*
				869	* Insert has the same range requirements as collapse, and we extend the
				870	* file first which can zero data. Hence insert has the same
				871	* flush/invalidate requirements as collapse and so they are both
				872	* handled at the right time by xfs_prepare_shift().
				873	*/
				874	if (mode & (FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_ZERO_RANGE \|
				875	FALLOC_FL_COLLAPSE_RANGE)) {
				876	error = xfs_flush_unmap_range(ip, offset, len);
				877	if (error)
				878	goto out_unlock;
				879	}
				880
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	881	if (mode & FALLOC_FL_PUNCH_HOLE) {
				882	error = xfs_free_file_space(ip, offset, len);
				883	if (error)
				884	goto out_unlock;
				885	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	886	if (!xfs_is_falloc_aligned(ip, offset, len)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	887	error = -EINVAL;
				888	goto out_unlock;
				889	}
				890
				891	/*
				892	* There is no need to overlap collapse range with EOF,
				893	* in which case it is effectively a truncate operation
				894	*/
				895	if (offset + len >= i_size_read(inode)) {
				896	error = -EINVAL;
				897	goto out_unlock;
				898	}
				899
				900	new_size = i_size_read(inode) - len;
				901
				902	error = xfs_collapse_file_space(ip, offset, len);
				903	if (error)
				904	goto out_unlock;
				905	} else if (mode & FALLOC_FL_INSERT_RANGE) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	906	loff_t isize = i_size_read(inode);
				907
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	908	if (!xfs_is_falloc_aligned(ip, offset, len)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	909	error = -EINVAL;
				910	goto out_unlock;
				911	}
				912
				913	/*
				914	* New inode size must not exceed ->s_maxbytes, accounting for
				915	* possible signed overflow.
				916	*/
				917	if (inode->i_sb->s_maxbytes - isize < len) {
				918	error = -EFBIG;
				919	goto out_unlock;
				920	}
				921	new_size = isize + len;
				922
				923	/* Offset should be less than i_size */
				924	if (offset >= isize) {
				925	error = -EINVAL;
				926	goto out_unlock;
				927	}
				928	do_file_insert = true;
				929	} else {
				930	flags \|= XFS_PREALLOC_SET;
				931
				932	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
				933	offset + len > i_size_read(inode)) {
				934	new_size = offset + len;
				935	error = inode_newsize_ok(inode, new_size);
				936	if (error)
				937	goto out_unlock;
				938	}
				939
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	940	if (mode & FALLOC_FL_ZERO_RANGE) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	941	/*
				942	* Punch a hole and prealloc the range. We use a hole
				943	* punch rather than unwritten extent conversion for two
				944	* reasons:
				945	*
				946	* 1.) Hole punch handles partial block zeroing for us.
				947	* 2.) If prealloc returns ENOSPC, the file range is
				948	* still zero-valued by virtue of the hole punch.
				949	*/
				950	unsigned int blksize = i_blocksize(inode);
				951
				952	trace_xfs_zero_file_space(ip);
				953
				954	error = xfs_free_file_space(ip, offset, len);
				955	if (error)
				956	goto out_unlock;
				957
				958	len = round_up(offset + len, blksize) -
				959	round_down(offset, blksize);
				960	offset = round_down(offset, blksize);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	961	} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
				962	error = xfs_reflink_unshare(ip, offset, len);
				963	if (error)
				964	goto out_unlock;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	965	} else {
				966	/*
				967	* If always_cow mode we can't use preallocations and
				968	* thus should not create them.
				969	*/
				970	if (xfs_is_always_cow_inode(ip)) {
				971	error = -EOPNOTSUPP;
				972	goto out_unlock;
				973	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	974	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	975
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	976	if (!xfs_is_always_cow_inode(ip)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	977	error = xfs_alloc_file_space(ip, offset, len,
				978	XFS_BMAPI_PREALLOC);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	979	if (error)
				980	goto out_unlock;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	981	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	982	}
				983
				984	if (file->f_flags & O_DSYNC)
				985	flags \|= XFS_PREALLOC_SYNC;
				986
				987	error = xfs_update_prealloc_flags(ip, flags);
				988	if (error)
				989	goto out_unlock;
				990
				991	/* Change file size if needed */
				992	if (new_size) {
				993	struct iattr iattr;
				994
				995	iattr.ia_valid = ATTR_SIZE;
				996	iattr.ia_size = new_size;
				997	error = xfs_vn_setattr_size(file_dentry(file), &iattr);
				998	if (error)
				999	goto out_unlock;
				1000	}
				1001
				1002	/*
				1003	* Perform hole insertion now that the file size has been
				1004	* updated so that if we crash during the operation we don't
				1005	* leave shifted extents past EOF and hence losing access to
				1006	* the data that is contained within them.
				1007	*/
				1008	if (do_file_insert)
				1009	error = xfs_insert_file_space(ip, offset, len);
				1010
				1011	out_unlock:
				1012	xfs_iunlock(ip, iolock);
				1013	return error;
				1014	}
				1015
				1016	STATIC int
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1017	xfs_file_fadvise(
				1018	struct file *file,
				1019	loff_t start,
				1020	loff_t end,
				1021	int advice)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1022	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1023	struct xfs_inode *ip = XFS_I(file_inode(file));
				1024	int ret;
				1025	int lockflags = 0;
				1026
				1027	/*
				1028	* Operations creating pages in page cache need protection from hole
				1029	* punching and similar ops
				1030	*/
				1031	if (advice == POSIX_FADV_WILLNEED) {
				1032	lockflags = XFS_IOLOCK_SHARED;
				1033	xfs_ilock(ip, lockflags);
				1034	}
				1035	ret = generic_fadvise(file, start, end, advice);
				1036	if (lockflags)
				1037	xfs_iunlock(ip, lockflags);
				1038	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1039	}
				1040
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1041	/* Does this file, inode, or mount want synchronous writes? */
				1042	static inline bool xfs_file_sync_writes(struct file *filp)
				1043	{
				1044	struct xfs_inode *ip = XFS_I(file_inode(filp));
				1045
				1046	if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
				1047	return true;
				1048	if (filp->f_flags & (__O_SYNC \| O_DSYNC))
				1049	return true;
				1050	if (IS_SYNC(file_inode(filp)))
				1051	return true;
				1052
				1053	return false;
				1054	}
				1055
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1056	STATIC loff_t
				1057	xfs_file_remap_range(
				1058	struct file *file_in,
				1059	loff_t pos_in,
				1060	struct file *file_out,
				1061	loff_t pos_out,
				1062	loff_t len,
				1063	unsigned int remap_flags)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1064	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1065	struct inode *inode_in = file_inode(file_in);
				1066	struct xfs_inode *src = XFS_I(inode_in);
				1067	struct inode *inode_out = file_inode(file_out);
				1068	struct xfs_inode *dest = XFS_I(inode_out);
				1069	struct xfs_mount *mp = src->i_mount;
				1070	loff_t remapped = 0;
				1071	xfs_extlen_t cowextsize;
				1072	int ret;
				1073
				1074	if (remap_flags & ~(REMAP_FILE_DEDUP \| REMAP_FILE_ADVISORY))
				1075	return -EINVAL;
				1076
				1077	if (!xfs_sb_version_hasreflink(&mp->m_sb))
				1078	return -EOPNOTSUPP;
				1079
				1080	if (XFS_FORCED_SHUTDOWN(mp))
				1081	return -EIO;
				1082
				1083	/* Prepare and then clone file data. */
				1084	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
				1085	&len, remap_flags);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1086	if (ret \|\| len == 0)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1087	return ret;
				1088
				1089	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
				1090
				1091	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
				1092	&remapped);
				1093	if (ret)
				1094	goto out_unlock;
				1095
				1096	/*
				1097	* Carry the cowextsize hint from src to dest if we're sharing the
				1098	* entire source file to the entire destination file, the source file
				1099	* has a cowextsize hint, and the destination file does not.
				1100	*/
				1101	cowextsize = 0;
				1102	if (pos_in == 0 && len == i_size_read(inode_in) &&
				1103	(src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
				1104	pos_out == 0 && len >= i_size_read(inode_out) &&
				1105	!(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
				1106	cowextsize = src->i_d.di_cowextsize;
				1107
				1108	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
				1109	remap_flags);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1110	if (ret)
				1111	goto out_unlock;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1112
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1113	if (xfs_file_sync_writes(file_in) \|\| xfs_file_sync_writes(file_out))
				1114	xfs_log_force_inode(dest);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1115	out_unlock:
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1116	xfs_iunlock2_io_mmap(src, dest);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1117	if (ret)
				1118	trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
				1119	return remapped > 0 ? remapped : ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1120	}
				1121
				1122	STATIC int
				1123	xfs_file_open(
				1124	struct inode *inode,
				1125	struct file *file)
				1126	{
				1127	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
				1128	return -EFBIG;
				1129	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
				1130	return -EIO;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1131	file->f_mode \|= FMODE_NOWAIT \| FMODE_BUF_RASYNC;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1132	return 0;
				1133	}
				1134
				1135	STATIC int
				1136	xfs_dir_open(
				1137	struct inode *inode,
				1138	struct file *file)
				1139	{
				1140	struct xfs_inode *ip = XFS_I(inode);
				1141	int mode;
				1142	int error;
				1143
				1144	error = xfs_file_open(inode, file);
				1145	if (error)
				1146	return error;
				1147
				1148	/*
				1149	* If there are any blocks, read-ahead block 0 as we're almost
				1150	* certain to have the next operation be a read there.
				1151	*/
				1152	mode = xfs_ilock_data_map_shared(ip);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1153	if (ip->i_df.if_nextents > 0)
				1154	error = xfs_dir3_data_readahead(ip, 0, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1155	xfs_iunlock(ip, mode);
				1156	return error;
				1157	}
				1158
				1159	STATIC int
				1160	xfs_file_release(
				1161	struct inode *inode,
				1162	struct file *filp)
				1163	{
				1164	return xfs_release(XFS_I(inode));
				1165	}
				1166
				1167	STATIC int
				1168	xfs_file_readdir(
				1169	struct file *file,
				1170	struct dir_context *ctx)
				1171	{
				1172	struct inode *inode = file_inode(file);
				1173	xfs_inode_t *ip = XFS_I(inode);
				1174	size_t bufsize;
				1175
				1176	/*
				1177	* The Linux API doesn't pass down the total size of the buffer
				1178	* we read into down to the filesystem. With the filldir concept
				1179	* it's not needed for correct information, but the XFS dir2 leaf
				1180	* code wants an estimate of the buffer size to calculate it's
				1181	* readahead window and size the buffers used for mapping to
				1182	* physical blocks.
				1183	*
				1184	* Try to give it an estimate that's good enough, maybe at some
				1185	* point we can change the ->readdir prototype to include the
				1186	* buffer size. For now we use the current glibc buffer size.
				1187	*/
				1188	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
				1189
				1190	return xfs_readdir(NULL, ip, ctx, bufsize);
				1191	}
				1192
				1193	STATIC loff_t
				1194	xfs_file_llseek(
				1195	struct file *file,
				1196	loff_t offset,
				1197	int whence)
				1198	{
				1199	struct inode *inode = file->f_mapping->host;
				1200
				1201	if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
				1202	return -EIO;
				1203
				1204	switch (whence) {
				1205	default:
				1206	return generic_file_llseek(file, offset, whence);
				1207	case SEEK_HOLE:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1208	offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1209	break;
				1210	case SEEK_DATA:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1211	offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1212	break;
				1213	}
				1214
				1215	if (offset < 0)
				1216	return offset;
				1217	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
				1218	}
				1219
				1220	/*
				1221	* Locking for serialisation of IO during page faults. This results in a lock
				1222	* ordering of:
				1223	*
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1224	* mmap_lock (MM)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1225	* sb_start_pagefault(vfs, freeze)
				1226	* i_mmaplock (XFS - truncate serialisation)
				1227	* page_lock (MM)
				1228	* i_lock (XFS - extent map serialisation)
				1229	*/
				1230	static vm_fault_t
				1231	__xfs_filemap_fault(
				1232	struct vm_fault *vmf,
				1233	enum page_entry_size pe_size,
				1234	bool write_fault)
				1235	{
				1236	struct inode *inode = file_inode(vmf->vma->vm_file);
				1237	struct xfs_inode *ip = XFS_I(inode);
				1238	vm_fault_t ret;
				1239
				1240	trace_xfs_filemap_fault(ip, pe_size, write_fault);
				1241
				1242	if (write_fault) {
				1243	sb_start_pagefault(inode->i_sb);
				1244	file_update_time(vmf->vma->vm_file);
				1245	}
				1246
				1247	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
				1248	if (IS_DAX(inode)) {
				1249	pfn_t pfn;
				1250
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1251	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
				1252	(write_fault && !vmf->cow_page) ?
				1253	&xfs_direct_write_iomap_ops :
				1254	&xfs_read_iomap_ops);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1255	if (ret & VM_FAULT_NEEDDSYNC)
				1256	ret = dax_finish_sync_fault(vmf, pe_size, pfn);
				1257	} else {
				1258	if (write_fault)
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1259	ret = iomap_page_mkwrite(vmf,
				1260	&xfs_buffered_write_iomap_ops);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1261	else
				1262	ret = filemap_fault(vmf);
				1263	}
				1264	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
				1265
				1266	if (write_fault)
				1267	sb_end_pagefault(inode->i_sb);
				1268	return ret;
				1269	}
				1270
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	1271	static inline bool
				1272	xfs_is_write_fault(
				1273	struct vm_fault *vmf)
				1274	{
				1275	return (vmf->flags & FAULT_FLAG_WRITE) &&
				1276	(vmf->vma->vm_flags & VM_SHARED);
				1277	}
				1278
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1279	static vm_fault_t
				1280	xfs_filemap_fault(
				1281	struct vm_fault *vmf)
				1282	{
				1283	/* DAX can shortcut the normal fault path on write faults! */
				1284	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
				1285	IS_DAX(file_inode(vmf->vma->vm_file)) &&
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	1286	xfs_is_write_fault(vmf));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1287	}
				1288
				1289	static vm_fault_t
				1290	xfs_filemap_huge_fault(
				1291	struct vm_fault *vmf,
				1292	enum page_entry_size pe_size)
				1293	{
				1294	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
				1295	return VM_FAULT_FALLBACK;
				1296
				1297	/* DAX can shortcut the normal fault path on write faults! */
				1298	return __xfs_filemap_fault(vmf, pe_size,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	1299	xfs_is_write_fault(vmf));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1300	}
				1301
				1302	static vm_fault_t
				1303	xfs_filemap_page_mkwrite(
				1304	struct vm_fault *vmf)
				1305	{
				1306	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
				1307	}
				1308
				1309	/*
				1310	* pfn_mkwrite was originally intended to ensure we capture time stamp updates
				1311	* on write faults. In reality, it needs to serialise against truncate and
				1312	* prepare memory for writing so handle is as standard write fault.
				1313	*/
				1314	static vm_fault_t
				1315	xfs_filemap_pfn_mkwrite(
				1316	struct vm_fault *vmf)
				1317	{
				1318
				1319	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
				1320	}
				1321
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1322	static void
				1323	xfs_filemap_map_pages(
				1324	struct vm_fault *vmf,
				1325	pgoff_t start_pgoff,
				1326	pgoff_t end_pgoff)
				1327	{
				1328	struct inode *inode = file_inode(vmf->vma->vm_file);
				1329
				1330	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
				1331	filemap_map_pages(vmf, start_pgoff, end_pgoff);
				1332	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
				1333	}
				1334
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1335	static const struct vm_operations_struct xfs_file_vm_ops = {
				1336	.fault = xfs_filemap_fault,
				1337	.huge_fault = xfs_filemap_huge_fault,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1338	.map_pages = xfs_filemap_map_pages,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1339	.page_mkwrite = xfs_filemap_page_mkwrite,
				1340	.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
				1341	};
				1342
				1343	STATIC int
				1344	xfs_file_mmap(
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1345	struct file *file,
				1346	struct vm_area_struct *vma)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1347	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1348	struct inode *inode = file_inode(file);
				1349	struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1350
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1351	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1352	* We don't support synchronous mappings for non-DAX files and
				1353	* for DAX files if underneath dax_device is not synchronous.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1354	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1355	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1356	return -EOPNOTSUPP;
				1357
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1358	file_accessed(file);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1359	vma->vm_ops = &xfs_file_vm_ops;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1360	if (IS_DAX(inode))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1361	vma->vm_flags \|= VM_HUGEPAGE;
				1362	return 0;
				1363	}
				1364
				1365	const struct file_operations xfs_file_operations = {
				1366	.llseek = xfs_file_llseek,
				1367	.read_iter = xfs_file_read_iter,
				1368	.write_iter = xfs_file_write_iter,
				1369	.splice_read = generic_file_splice_read,
				1370	.splice_write = iter_file_splice_write,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1371	.iopoll = iomap_dio_iopoll,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1372	.unlocked_ioctl = xfs_file_ioctl,
				1373	#ifdef CONFIG_COMPAT
				1374	.compat_ioctl = xfs_file_compat_ioctl,
				1375	#endif
				1376	.mmap = xfs_file_mmap,
				1377	.mmap_supported_flags = MAP_SYNC,
				1378	.open = xfs_file_open,
				1379	.release = xfs_file_release,
				1380	.fsync = xfs_file_fsync,
				1381	.get_unmapped_area = thp_get_unmapped_area,
				1382	.fallocate = xfs_file_fallocate,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1383	.fadvise = xfs_file_fadvise,
				1384	.remap_file_range = xfs_file_remap_range,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1385	};
				1386
				1387	const struct file_operations xfs_dir_file_operations = {
				1388	.open = xfs_dir_open,
				1389	.read = generic_read_dir,
				1390	.iterate_shared = xfs_file_readdir,
				1391	.llseek = generic_file_llseek,
				1392	.unlocked_ioctl = xfs_file_ioctl,
				1393	#ifdef CONFIG_COMPAT
				1394	.compat_ioctl = xfs_file_compat_ioctl,
				1395	#endif
				1396	.fsync = xfs_dir_fsync,
				1397	};