Blame - fs/xfs/xfs_reflink.c - hafnium/third_party/linux

blob: 42ea7bab9144cc026f50d802acc31c42ab7f0858 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0+
				2	/*
				3	* Copyright (C) 2016 Oracle. All Rights Reserved.
				4	* Author: Darrick J. Wong <darrick.wong@oracle.com>
				5	*/
				6	#include "xfs.h"
				7	#include "xfs_fs.h"
				8	#include "xfs_shared.h"
				9	#include "xfs_format.h"
				10	#include "xfs_log_format.h"
				11	#include "xfs_trans_resv.h"
				12	#include "xfs_mount.h"
				13	#include "xfs_defer.h"
				14	#include "xfs_da_format.h"
				15	#include "xfs_da_btree.h"
				16	#include "xfs_inode.h"
				17	#include "xfs_trans.h"
				18	#include "xfs_inode_item.h"
				19	#include "xfs_bmap.h"
				20	#include "xfs_bmap_util.h"
				21	#include "xfs_error.h"
				22	#include "xfs_dir2.h"
				23	#include "xfs_dir2_priv.h"
				24	#include "xfs_ioctl.h"
				25	#include "xfs_trace.h"
				26	#include "xfs_log.h"
				27	#include "xfs_icache.h"
				28	#include "xfs_pnfs.h"
				29	#include "xfs_btree.h"
				30	#include "xfs_refcount_btree.h"
				31	#include "xfs_refcount.h"
				32	#include "xfs_bmap_btree.h"
				33	#include "xfs_trans_space.h"
				34	#include "xfs_bit.h"
				35	#include "xfs_alloc.h"
				36	#include "xfs_quota_defs.h"
				37	#include "xfs_quota.h"
				38	#include "xfs_reflink.h"
				39	#include "xfs_iomap.h"
				40	#include "xfs_rmap_btree.h"
				41	#include "xfs_sb.h"
				42	#include "xfs_ag_resv.h"
				43
				44	/*
				45	* Copy on Write of Shared Blocks
				46	*
				47	* XFS must preserve "the usual" file semantics even when two files share
				48	* the same physical blocks. This means that a write to one file must not
				49	* alter the blocks in a different file; the way that we'll do that is
				50	* through the use of a copy-on-write mechanism. At a high level, that
				51	* means that when we want to write to a shared block, we allocate a new
				52	* block, write the data to the new block, and if that succeeds we map the
				53	* new block into the file.
				54	*
				55	* XFS provides a "delayed allocation" mechanism that defers the allocation
				56	* of disk blocks to dirty-but-not-yet-mapped file blocks as long as
				57	* possible. This reduces fragmentation by enabling the filesystem to ask
				58	* for bigger chunks less often, which is exactly what we want for CoW.
				59	*
				60	* The delalloc mechanism begins when the kernel wants to make a block
				61	* writable (write_begin or page_mkwrite). If the offset is not mapped, we
				62	* create a delalloc mapping, which is a regular in-core extent, but without
				63	* a real startblock. (For delalloc mappings, the startblock encodes both
				64	* a flag that this is a delalloc mapping, and a worst-case estimate of how
				65	* many blocks might be required to put the mapping into the BMBT.) delalloc
				66	* mappings are a reservation against the free space in the filesystem;
				67	* adjacent mappings can also be combined into fewer larger mappings.
				68	*
				69	* As an optimization, the CoW extent size hint (cowextsz) creates
				70	* outsized aligned delalloc reservations in the hope of landing out of
				71	* order nearby CoW writes in a single extent on disk, thereby reducing
				72	* fragmentation and improving future performance.
				73	*
				74	* D: --RRRRRRSSSRRRRRRRR--- (data fork)
				75	* C: ------DDDDDDD--------- (CoW fork)
				76	*
				77	* When dirty pages are being written out (typically in writepage), the
				78	* delalloc reservations are converted into unwritten mappings by
				79	* allocating blocks and replacing the delalloc mapping with real ones.
				80	* A delalloc mapping can be replaced by several unwritten ones if the
				81	* free space is fragmented.
				82	*
				83	* D: --RRRRRRSSSRRRRRRRR---
				84	* C: ------UUUUUUU---------
				85	*
				86	* We want to adapt the delalloc mechanism for copy-on-write, since the
				87	* write paths are similar. The first two steps (creating the reservation
				88	* and allocating the blocks) are exactly the same as delalloc except that
				89	* the mappings must be stored in a separate CoW fork because we do not want
				90	* to disturb the mapping in the data fork until we're sure that the write
				91	* succeeded. IO completion in this case is the process of removing the old
				92	* mapping from the data fork and moving the new mapping from the CoW fork to
				93	* the data fork. This will be discussed shortly.
				94	*
				95	* For now, unaligned directio writes will be bounced back to the page cache.
				96	* Block-aligned directio writes will use the same mechanism as buffered
				97	* writes.
				98	*
				99	* Just prior to submitting the actual disk write requests, we convert
				100	* the extents representing the range of the file actually being written
				101	* (as opposed to extra pieces created for the cowextsize hint) to real
				102	* extents. This will become important in the next step:
				103	*
				104	* D: --RRRRRRSSSRRRRRRRR---
				105	* C: ------UUrrUUU---------
				106	*
				107	* CoW remapping must be done after the data block write completes,
				108	* because we don't want to destroy the old data fork map until we're sure
				109	* the new block has been written. Since the new mappings are kept in a
				110	* separate fork, we can simply iterate these mappings to find the ones
				111	* that cover the file blocks that we just CoW'd. For each extent, simply
				112	* unmap the corresponding range in the data fork, map the new range into
				113	* the data fork, and remove the extent from the CoW fork. Because of
				114	* the presence of the cowextsize hint, however, we must be careful
				115	* only to remap the blocks that we've actually written out -- we must
				116	* never remap delalloc reservations nor CoW staging blocks that have
				117	* yet to be written. This corresponds exactly to the real extents in
				118	* the CoW fork:
				119	*
				120	* D: --RRRRRRrrSRRRRRRRR---
				121	* C: ------UU--UUU---------
				122	*
				123	* Since the remapping operation can be applied to an arbitrary file
				124	* range, we record the need for the remap step as a flag in the ioend
				125	* instead of declaring a new IO type. This is required for direct io
				126	* because we only have ioend for the whole dio, and we have to be able to
				127	* remember the presence of unwritten blocks and CoW blocks with a single
				128	* ioend structure. Better yet, the more ground we can cover with one
				129	* ioend, the better.
				130	*/
				131
				132	/*
				133	* Given an AG extent, find the lowest-numbered run of shared blocks
				134	* within that range and return the range in fbno/flen. If
				135	* find_end_of_shared is true, return the longest contiguous extent of
				136	* shared blocks. If there are no shared extents, fbno and flen will
				137	* be set to NULLAGBLOCK and 0, respectively.
				138	*/
				139	int
				140	xfs_reflink_find_shared(
				141	struct xfs_mount *mp,
				142	struct xfs_trans *tp,
				143	xfs_agnumber_t agno,
				144	xfs_agblock_t agbno,
				145	xfs_extlen_t aglen,
				146	xfs_agblock_t *fbno,
				147	xfs_extlen_t *flen,
				148	bool find_end_of_shared)
				149	{
				150	struct xfs_buf *agbp;
				151	struct xfs_btree_cur *cur;
				152	int error;
				153
				154	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
				155	if (error)
				156	return error;
				157	if (!agbp)
				158	return -ENOMEM;
				159
				160	cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
				161
				162	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
				163	find_end_of_shared);
				164
				165	xfs_btree_del_cursor(cur, error);
				166
				167	xfs_trans_brelse(tp, agbp);
				168	return error;
				169	}
				170
				171	/*
				172	* Trim the mapping to the next block where there's a change in the
				173	* shared/unshared status. More specifically, this means that we
				174	* find the lowest-numbered extent of shared blocks that coincides with
				175	* the given block mapping. If the shared extent overlaps the start of
				176	* the mapping, trim the mapping to the end of the shared extent. If
				177	* the shared region intersects the mapping, trim the mapping to the
				178	* start of the shared extent. If there are no shared regions that
				179	* overlap, just return the original extent.
				180	*/
				181	int
				182	xfs_reflink_trim_around_shared(
				183	struct xfs_inode *ip,
				184	struct xfs_bmbt_irec *irec,
				185	bool *shared,
				186	bool *trimmed)
				187	{
				188	xfs_agnumber_t agno;
				189	xfs_agblock_t agbno;
				190	xfs_extlen_t aglen;
				191	xfs_agblock_t fbno;
				192	xfs_extlen_t flen;
				193	int error = 0;
				194
				195	/* Holes, unwritten, and delalloc extents cannot be shared */
				196	if (!xfs_is_reflink_inode(ip) \|\| !xfs_bmap_is_real_extent(irec)) {
				197	*shared = false;
				198	return 0;
				199	}
				200
				201	trace_xfs_reflink_trim_around_shared(ip, irec);
				202
				203	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
				204	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
				205	aglen = irec->br_blockcount;
				206
				207	error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
				208	aglen, &fbno, &flen, true);
				209	if (error)
				210	return error;
				211
				212	shared = trimmed = false;
				213	if (fbno == NULLAGBLOCK) {
				214	/* No shared blocks at all. */
				215	return 0;
				216	} else if (fbno == agbno) {
				217	/*
				218	* The start of this extent is shared. Truncate the
				219	* mapping at the end of the shared region so that a
				220	* subsequent iteration starts at the start of the
				221	* unshared region.
				222	*/
				223	irec->br_blockcount = flen;
				224	*shared = true;
				225	if (flen != aglen)
				226	*trimmed = true;
				227	return 0;
				228	} else {
				229	/*
				230	* There's a shared extent midway through this extent.
				231	* Truncate the mapping at the start of the shared
				232	* extent so that a subsequent iteration starts at the
				233	* start of the shared region.
				234	*/
				235	irec->br_blockcount = fbno - agbno;
				236	*trimmed = true;
				237	return 0;
				238	}
				239	}
				240
				241	/*
				242	* Trim the passed in imap to the next shared/unshared extent boundary, and
				243	* if imap->br_startoff points to a shared extent reserve space for it in the
				244	* COW fork. In this case *shared is set to true, else to false.
				245	*
				246	* Note that imap will always contain the block numbers for the existing blocks
				247	* in the data fork, as the upper layers need them for read-modify-write
				248	* operations.
				249	*/
				250	int
				251	xfs_reflink_reserve_cow(
				252	struct xfs_inode *ip,
				253	struct xfs_bmbt_irec *imap,
				254	bool *shared)
				255	{
				256	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				257	struct xfs_bmbt_irec got;
				258	int error = 0;
				259	bool eof = false, trimmed;
				260	struct xfs_iext_cursor icur;
				261
				262	/*
				263	* Search the COW fork extent list first. This serves two purposes:
				264	* first this implement the speculative preallocation using cowextisze,
				265	* so that we also unshared block adjacent to shared blocks instead
				266	* of just the shared blocks themselves. Second the lookup in the
				267	* extent list is generally faster than going out to the shared extent
				268	* tree.
				269	*/
				270
				271	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
				272	eof = true;
				273	if (!eof && got.br_startoff <= imap->br_startoff) {
				274	trace_xfs_reflink_cow_found(ip, imap);
				275	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
				276
				277	*shared = true;
				278	return 0;
				279	}
				280
				281	/* Trim the mapping to the nearest shared extent boundary. */
				282	error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
				283	if (error)
				284	return error;
				285
				286	/* Not shared? Just report the (potentially capped) extent. */
				287	if (!*shared)
				288	return 0;
				289
				290	/*
				291	* Fork all the shared blocks from our write offset until the end of
				292	* the extent.
				293	*/
				294	error = xfs_qm_dqattach_locked(ip, false);
				295	if (error)
				296	return error;
				297
				298	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
				299	imap->br_blockcount, 0, &got, &icur, eof);
				300	if (error == -ENOSPC \|\| error == -EDQUOT)
				301	trace_xfs_reflink_cow_enospc(ip, imap);
				302	if (error)
				303	return error;
				304
				305	trace_xfs_reflink_cow_alloc(ip, &got);
				306	return 0;
				307	}
				308
				309	/* Convert part of an unwritten CoW extent to a real one. */
				310	STATIC int
				311	xfs_reflink_convert_cow_extent(
				312	struct xfs_inode *ip,
				313	struct xfs_bmbt_irec *imap,
				314	xfs_fileoff_t offset_fsb,
				315	xfs_filblks_t count_fsb)
				316	{
				317	int nimaps = 1;
				318
				319	if (imap->br_state == XFS_EXT_NORM)
				320	return 0;
				321
				322	xfs_trim_extent(imap, offset_fsb, count_fsb);
				323	trace_xfs_reflink_convert_cow(ip, imap);
				324	if (imap->br_blockcount == 0)
				325	return 0;
				326	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
				327	XFS_BMAPI_COWFORK \| XFS_BMAPI_CONVERT, 0, imap,
				328	&nimaps);
				329	}
				330
				331	/* Convert all of the unwritten CoW extents in a file's range to real ones. */
				332	int
				333	xfs_reflink_convert_cow(
				334	struct xfs_inode *ip,
				335	xfs_off_t offset,
				336	xfs_off_t count)
				337	{
				338	struct xfs_mount *mp = ip->i_mount;
				339	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
				340	xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
				341	xfs_filblks_t count_fsb = end_fsb - offset_fsb;
				342	struct xfs_bmbt_irec imap;
				343	int nimaps = 1, error = 0;
				344
				345	ASSERT(count != 0);
				346
				347	xfs_ilock(ip, XFS_ILOCK_EXCL);
				348	error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
				349	XFS_BMAPI_COWFORK \| XFS_BMAPI_CONVERT \|
				350	XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
				351	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				352	return error;
				353	}
				354
				355	/*
				356	* Find the extent that maps the given range in the COW fork. Even if the extent
				357	* is not shared we might have a preallocation for it in the COW fork. If so we
				358	* use it that rather than trigger a new allocation.
				359	*/
				360	static int
				361	xfs_find_trim_cow_extent(
				362	struct xfs_inode *ip,
				363	struct xfs_bmbt_irec *imap,
				364	bool *shared,
				365	bool *found)
				366	{
				367	xfs_fileoff_t offset_fsb = imap->br_startoff;
				368	xfs_filblks_t count_fsb = imap->br_blockcount;
				369	struct xfs_iext_cursor icur;
				370	struct xfs_bmbt_irec got;
				371	bool trimmed;
				372
				373	*found = false;
				374
				375	/*
				376	* If we don't find an overlapping extent, trim the range we need to
				377	* allocate to fit the hole we found.
				378	*/
				379	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) \|\|
				380	got.br_startoff > offset_fsb)
				381	return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
				382
				383	*shared = true;
				384	if (isnullstartblock(got.br_startblock)) {
				385	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
				386	return 0;
				387	}
				388
				389	/* real extent found - no need to allocate */
				390	xfs_trim_extent(&got, offset_fsb, count_fsb);
				391	*imap = got;
				392	*found = true;
				393	return 0;
				394	}
				395
				396	/* Allocate all CoW reservations covering a range of blocks in a file. */
				397	int
				398	xfs_reflink_allocate_cow(
				399	struct xfs_inode *ip,
				400	struct xfs_bmbt_irec *imap,
				401	bool *shared,
				402	uint *lockmode)
				403	{
				404	struct xfs_mount *mp = ip->i_mount;
				405	xfs_fileoff_t offset_fsb = imap->br_startoff;
				406	xfs_filblks_t count_fsb = imap->br_blockcount;
				407	struct xfs_trans *tp;
				408	int nimaps, error = 0;
				409	bool found;
				410	xfs_filblks_t resaligned;
				411	xfs_extlen_t resblks = 0;
				412
				413	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
				414	ASSERT(xfs_is_reflink_inode(ip));
				415
				416	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
				417	if (error \|\| !*shared)
				418	return error;
				419	if (found)
				420	goto convert;
				421
				422	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
				423	imap->br_blockcount, xfs_get_cowextsz_hint(ip));
				424	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
				425
				426	xfs_iunlock(ip, *lockmode);
				427	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
				428	*lockmode = XFS_ILOCK_EXCL;
				429	xfs_ilock(ip, *lockmode);
				430
				431	if (error)
				432	return error;
				433
				434	error = xfs_qm_dqattach_locked(ip, false);
				435	if (error)
				436	goto out_trans_cancel;
				437
				438	/*
				439	* Check for an overlapping extent again now that we dropped the ilock.
				440	*/
				441	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
				442	if (error \|\| !*shared)
				443	goto out_trans_cancel;
				444	if (found) {
				445	xfs_trans_cancel(tp);
				446	goto convert;
				447	}
				448
				449	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
				450	XFS_QMOPT_RES_REGBLKS);
				451	if (error)
				452	goto out_trans_cancel;
				453
				454	xfs_trans_ijoin(tp, ip, 0);
				455
				456	/* Allocate the entire reservation as unwritten blocks. */
				457	nimaps = 1;
				458	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
				459	XFS_BMAPI_COWFORK \| XFS_BMAPI_PREALLOC,
				460	resblks, imap, &nimaps);
				461	if (error)
				462	goto out_unreserve;
				463
				464	xfs_inode_set_cowblocks_tag(ip);
				465	error = xfs_trans_commit(tp);
				466	if (error)
				467	return error;
				468
				469	/*
				470	* Allocation succeeded but the requested range was not even partially
				471	* satisfied? Bail out!
				472	*/
				473	if (nimaps == 0)
				474	return -ENOSPC;
				475	convert:
				476	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
				477
				478	out_unreserve:
				479	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
				480	XFS_QMOPT_RES_REGBLKS);
				481	out_trans_cancel:
				482	xfs_trans_cancel(tp);
				483	return error;
				484	}
				485
				486	/*
				487	* Cancel CoW reservations for some block range of an inode.
				488	*
				489	* If cancel_real is true this function cancels all COW fork extents for the
				490	* inode; if cancel_real is false, real extents are not cleared.
				491	*
				492	* Caller must have already joined the inode to the current transaction. The
				493	* inode will be joined to the transaction returned to the caller.
				494	*/
				495	int
				496	xfs_reflink_cancel_cow_blocks(
				497	struct xfs_inode *ip,
				498	struct xfs_trans **tpp,
				499	xfs_fileoff_t offset_fsb,
				500	xfs_fileoff_t end_fsb,
				501	bool cancel_real)
				502	{
				503	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				504	struct xfs_bmbt_irec got, del;
				505	struct xfs_iext_cursor icur;
				506	int error = 0;
				507
				508	if (!xfs_inode_has_cow_data(ip))
				509	return 0;
				510	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
				511	return 0;
				512
				513	/* Walk backwards until we're out of the I/O range... */
				514	while (got.br_startoff + got.br_blockcount > offset_fsb) {
				515	del = got;
				516	xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
				517
				518	/* Extent delete may have bumped ext forward */
				519	if (!del.br_blockcount) {
				520	xfs_iext_prev(ifp, &icur);
				521	goto next_extent;
				522	}
				523
				524	trace_xfs_reflink_cancel_cow(ip, &del);
				525
				526	if (isnullstartblock(del.br_startblock)) {
				527	error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
				528	&icur, &got, &del);
				529	if (error)
				530	break;
				531	} else if (del.br_state == XFS_EXT_UNWRITTEN \|\| cancel_real) {
				532	ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
				533
				534	/* Free the CoW orphan record. */
				535	error = xfs_refcount_free_cow_extent(*tpp,
				536	del.br_startblock, del.br_blockcount);
				537	if (error)
				538	break;
				539
				540	xfs_bmap_add_free(*tpp, del.br_startblock,
				541	del.br_blockcount, NULL);
				542
				543	/* Roll the transaction */
				544	error = xfs_defer_finish(tpp);
				545	if (error)
				546	break;
				547
				548	/* Remove the mapping from the CoW fork. */
				549	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
				550
				551	/* Remove the quota reservation */
				552	error = xfs_trans_reserve_quota_nblks(NULL, ip,
				553	-(long)del.br_blockcount, 0,
				554	XFS_QMOPT_RES_REGBLKS);
				555	if (error)
				556	break;
				557	} else {
				558	/* Didn't do anything, push cursor back. */
				559	xfs_iext_prev(ifp, &icur);
				560	}
				561	next_extent:
				562	if (!xfs_iext_get_extent(ifp, &icur, &got))
				563	break;
				564	}
				565
				566	/* clear tag if cow fork is emptied */
				567	if (!ifp->if_bytes)
				568	xfs_inode_clear_cowblocks_tag(ip);
				569	return error;
				570	}
				571
				572	/*
				573	* Cancel CoW reservations for some byte range of an inode.
				574	*
				575	* If cancel_real is true this function cancels all COW fork extents for the
				576	* inode; if cancel_real is false, real extents are not cleared.
				577	*/
				578	int
				579	xfs_reflink_cancel_cow_range(
				580	struct xfs_inode *ip,
				581	xfs_off_t offset,
				582	xfs_off_t count,
				583	bool cancel_real)
				584	{
				585	struct xfs_trans *tp;
				586	xfs_fileoff_t offset_fsb;
				587	xfs_fileoff_t end_fsb;
				588	int error;
				589
				590	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
				591	ASSERT(xfs_is_reflink_inode(ip));
				592
				593	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
				594	if (count == NULLFILEOFF)
				595	end_fsb = NULLFILEOFF;
				596	else
				597	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
				598
				599	/* Start a rolling transaction to remove the mappings */
				600	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
				601	0, 0, XFS_TRANS_NOFS, &tp);
				602	if (error)
				603	goto out;
				604
				605	xfs_ilock(ip, XFS_ILOCK_EXCL);
				606	xfs_trans_ijoin(tp, ip, 0);
				607
				608	/* Scrape out the old CoW reservations */
				609	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
				610	cancel_real);
				611	if (error)
				612	goto out_cancel;
				613
				614	error = xfs_trans_commit(tp);
				615
				616	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				617	return error;
				618
				619	out_cancel:
				620	xfs_trans_cancel(tp);
				621	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				622	out:
				623	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
				624	return error;
				625	}
				626
				627	/*
				628	* Remap parts of a file's data fork after a successful CoW.
				629	*/
				630	int
				631	xfs_reflink_end_cow(
				632	struct xfs_inode *ip,
				633	xfs_off_t offset,
				634	xfs_off_t count)
				635	{
				636	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
				637	struct xfs_bmbt_irec got, del;
				638	struct xfs_trans *tp;
				639	xfs_fileoff_t offset_fsb;
				640	xfs_fileoff_t end_fsb;
				641	int error;
				642	unsigned int resblks;
				643	xfs_filblks_t rlen;
				644	struct xfs_iext_cursor icur;
				645
				646	trace_xfs_reflink_end_cow(ip, offset, count);
				647
				648	/* No COW extents? That's easy! */
				649	if (ifp->if_bytes == 0)
				650	return 0;
				651
				652	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
				653	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
				654
				655	/*
				656	* Start a rolling transaction to switch the mappings. We're
				657	* unlikely ever to have to remap 16T worth of single-block
				658	* extents, so just cap the worst case extent count to 2^32-1.
				659	* Stick a warning in just in case, and avoid 64-bit division.
				660	*/
				661	BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
				662	if (end_fsb - offset_fsb > UINT_MAX) {
				663	error = -EFSCORRUPTED;
				664	xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
				665	ASSERT(0);
				666	goto out;
				667	}
				668	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
				669	(unsigned int)(end_fsb - offset_fsb),
				670	XFS_DATA_FORK);
				671	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
				672	resblks, 0, XFS_TRANS_RESERVE \| XFS_TRANS_NOFS, &tp);
				673	if (error)
				674	goto out;
				675
				676	xfs_ilock(ip, XFS_ILOCK_EXCL);
				677	xfs_trans_ijoin(tp, ip, 0);
				678
				679	/*
				680	* In case of racing, overlapping AIO writes no COW extents might be
				681	* left by the time I/O completes for the loser of the race. In that
				682	* case we are done.
				683	*/
				684	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
				685	goto out_cancel;
				686
				687	/* Walk backwards until we're out of the I/O range... */
				688	while (got.br_startoff + got.br_blockcount > offset_fsb) {
				689	del = got;
				690	xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
				691
				692	/* Extent delete may have bumped ext forward */
				693	if (!del.br_blockcount)
				694	goto prev_extent;
				695
				696	/*
				697	* Only remap real extent that contain data. With AIO
				698	* speculatively preallocations can leak into the range we
				699	* are called upon, and we need to skip them.
				700	*/
				701	if (!xfs_bmap_is_real_extent(&got))
				702	goto prev_extent;
				703
				704	/* Unmap the old blocks in the data fork. */
				705	ASSERT(tp->t_firstblock == NULLFSBLOCK);
				706	rlen = del.br_blockcount;
				707	error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
				708	if (error)
				709	goto out_cancel;
				710
				711	/* Trim the extent to whatever got unmapped. */
				712	if (rlen) {
				713	xfs_trim_extent(&del, del.br_startoff + rlen,
				714	del.br_blockcount - rlen);
				715	}
				716	trace_xfs_reflink_cow_remap(ip, &del);
				717
				718	/* Free the CoW orphan record. */
				719	error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
				720	del.br_blockcount);
				721	if (error)
				722	goto out_cancel;
				723
				724	/* Map the new blocks into the data fork. */
				725	error = xfs_bmap_map_extent(tp, ip, &del);
				726	if (error)
				727	goto out_cancel;
				728
				729	/* Charge this new data fork mapping to the on-disk quota. */
				730	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
				731	(long)del.br_blockcount);
				732
				733	/* Remove the mapping from the CoW fork. */
				734	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
				735
				736	error = xfs_defer_finish(&tp);
				737	if (error)
				738	goto out_cancel;
				739	if (!xfs_iext_get_extent(ifp, &icur, &got))
				740	break;
				741	continue;
				742	prev_extent:
				743	if (!xfs_iext_prev_extent(ifp, &icur, &got))
				744	break;
				745	}
				746
				747	error = xfs_trans_commit(tp);
				748	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				749	if (error)
				750	goto out;
				751	return 0;
				752
				753	out_cancel:
				754	xfs_trans_cancel(tp);
				755	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				756	out:
				757	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
				758	return error;
				759	}
				760
				761	/*
				762	* Free leftover CoW reservations that didn't get cleaned out.
				763	*/
				764	int
				765	xfs_reflink_recover_cow(
				766	struct xfs_mount *mp)
				767	{
				768	xfs_agnumber_t agno;
				769	int error = 0;
				770
				771	if (!xfs_sb_version_hasreflink(&mp->m_sb))
				772	return 0;
				773
				774	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
				775	error = xfs_refcount_recover_cow_leftovers(mp, agno);
				776	if (error)
				777	break;
				778	}
				779
				780	return error;
				781	}
				782
				783	/*
				784	* Reflinking (Block) Ranges of Two Files Together
				785	*
				786	* First, ensure that the reflink flag is set on both inodes. The flag is an
				787	* optimization to avoid unnecessary refcount btree lookups in the write path.
				788	*
				789	* Now we can iteratively remap the range of extents (and holes) in src to the
				790	* corresponding ranges in dest. Let drange and srange denote the ranges of
				791	* logical blocks in dest and src touched by the reflink operation.
				792	*
				793	* While the length of drange is greater than zero,
				794	* - Read src's bmbt at the start of srange ("imap")
				795	* - If imap doesn't exist, make imap appear to start at the end of srange
				796	* with zero length.
				797	* - If imap starts before srange, advance imap to start at srange.
				798	* - If imap goes beyond srange, truncate imap to end at the end of srange.
				799	* - Punch (imap start - srange start + imap len) blocks from dest at
				800	* offset (drange start).
				801	* - If imap points to a real range of pblks,
				802	* > Increase the refcount of the imap's pblks
				803	* > Map imap's pblks into dest at the offset
				804	* (drange start + imap start - srange start)
				805	* - Advance drange and srange by (imap start - srange start + imap len)
				806	*
				807	* Finally, if the reflink made dest longer, update both the in-core and
				808	* on-disk file sizes.
				809	*
				810	* ASCII Art Demonstration:
				811	*
				812	* Let's say we want to reflink this source file:
				813	*
				814	* ----SSSSSSS-SSSSS----SSSSSS (src file)
				815	* <-------------------->
				816	*
				817	* into this destination file:
				818	*
				819	* --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
				820	* <-------------------->
				821	* '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
				822	* Observe that the range has different logical offsets in either file.
				823	*
				824	* Consider that the first extent in the source file doesn't line up with our
				825	* reflink range. Unmapping and remapping are separate operations, so we can
				826	* unmap more blocks from the destination file than we remap.
				827	*
				828	* ----SSSSSSS-SSSSS----SSSSSS
				829	* <------->
				830	* --DDDDD---------DDDDD--DDD
				831	* <------->
				832	*
				833	* Now remap the source extent into the destination file:
				834	*
				835	* ----SSSSSSS-SSSSS----SSSSSS
				836	* <------->
				837	* --DDDDD--SSSSSSSDDDDD--DDD
				838	* <------->
				839	*
				840	* Do likewise with the second hole and extent in our range. Holes in the
				841	* unmap range don't affect our operation.
				842	*
				843	* ----SSSSSSS-SSSSS----SSSSSS
				844	* <---->
				845	* --DDDDD--SSSSSSS-SSSSS-DDD
				846	* <---->
				847	*
				848	* Finally, unmap and remap part of the third extent. This will increase the
				849	* size of the destination file.
				850	*
				851	* ----SSSSSSS-SSSSS----SSSSSS
				852	* <----->
				853	* --DDDDD--SSSSSSS-SSSSS----SSS
				854	* <----->
				855	*
				856	* Once we update the destination file's i_size, we're done.
				857	*/
				858
				859	/*
				860	* Ensure the reflink bit is set in both inodes.
				861	*/
				862	STATIC int
				863	xfs_reflink_set_inode_flag(
				864	struct xfs_inode *src,
				865	struct xfs_inode *dest)
				866	{
				867	struct xfs_mount *mp = src->i_mount;
				868	int error;
				869	struct xfs_trans *tp;
				870
				871	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
				872	return 0;
				873
				874	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
				875	if (error)
				876	goto out_error;
				877
				878	/* Lock both files against IO */
				879	if (src->i_ino == dest->i_ino)
				880	xfs_ilock(src, XFS_ILOCK_EXCL);
				881	else
				882	xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);
				883
				884	if (!xfs_is_reflink_inode(src)) {
				885	trace_xfs_reflink_set_inode_flag(src);
				886	xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
				887	src->i_d.di_flags2 \|= XFS_DIFLAG2_REFLINK;
				888	xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
				889	xfs_ifork_init_cow(src);
				890	} else
				891	xfs_iunlock(src, XFS_ILOCK_EXCL);
				892
				893	if (src->i_ino == dest->i_ino)
				894	goto commit_flags;
				895
				896	if (!xfs_is_reflink_inode(dest)) {
				897	trace_xfs_reflink_set_inode_flag(dest);
				898	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
				899	dest->i_d.di_flags2 \|= XFS_DIFLAG2_REFLINK;
				900	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
				901	xfs_ifork_init_cow(dest);
				902	} else
				903	xfs_iunlock(dest, XFS_ILOCK_EXCL);
				904
				905	commit_flags:
				906	error = xfs_trans_commit(tp);
				907	if (error)
				908	goto out_error;
				909	return error;
				910
				911	out_error:
				912	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
				913	return error;
				914	}
				915
				916	/*
				917	* Update destination inode size & cowextsize hint, if necessary.
				918	*/
				919	STATIC int
				920	xfs_reflink_update_dest(
				921	struct xfs_inode *dest,
				922	xfs_off_t newlen,
				923	xfs_extlen_t cowextsize,
				924	bool is_dedupe)
				925	{
				926	struct xfs_mount *mp = dest->i_mount;
				927	struct xfs_trans *tp;
				928	int error;
				929
				930	if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
				931	return 0;
				932
				933	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
				934	if (error)
				935	goto out_error;
				936
				937	xfs_ilock(dest, XFS_ILOCK_EXCL);
				938	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
				939
				940	if (newlen > i_size_read(VFS_I(dest))) {
				941	trace_xfs_reflink_update_inode_size(dest, newlen);
				942	i_size_write(VFS_I(dest), newlen);
				943	dest->i_d.di_size = newlen;
				944	}
				945
				946	if (cowextsize) {
				947	dest->i_d.di_cowextsize = cowextsize;
				948	dest->i_d.di_flags2 \|= XFS_DIFLAG2_COWEXTSIZE;
				949	}
				950
				951	if (!is_dedupe) {
				952	xfs_trans_ichgtime(tp, dest,
				953	XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				954	}
				955	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
				956
				957	error = xfs_trans_commit(tp);
				958	if (error)
				959	goto out_error;
				960	return error;
				961
				962	out_error:
				963	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
				964	return error;
				965	}
				966
				967	/*
				968	* Do we have enough reserve in this AG to handle a reflink? The refcount
				969	* btree already reserved all the space it needs, but the rmap btree can grow
				970	* infinitely, so we won't allow more reflinks when the AG is down to the
				971	* btree reserves.
				972	*/
				973	static int
				974	xfs_reflink_ag_has_free_space(
				975	struct xfs_mount *mp,
				976	xfs_agnumber_t agno)
				977	{
				978	struct xfs_perag *pag;
				979	int error = 0;
				980
				981	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
				982	return 0;
				983
				984	pag = xfs_perag_get(mp, agno);
				985	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) \|\|
				986	xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
				987	error = -ENOSPC;
				988	xfs_perag_put(pag);
				989	return error;
				990	}
				991
				992	/*
				993	* Unmap a range of blocks from a file, then map other blocks into the hole.
				994	* The range to unmap is [destoff, irec->br_startoff + irec->br_blockcount).
				995	* The extent irec is mapped into dest at irec->br_startoff.
					*
					* ip:        file whose data fork is rewritten.
					* irec:      mapping to install, already expressed in ip's file offsets. If
					*            it is not a real extent the range is only unmapped and nothing
					*            is mapped in its place.
					* destoff:   first file block to unmap.
					* new_isize: upper bound, in bytes, for any i_size increase made here.
					*
					* The work is done in one rolling tr_write transaction with ILOCK_EXCL held;
					* the lock is taken and dropped inside this function. Returns 0 on success
					* or an error code, which is also reported through
					* trace_xfs_reflink_remap_extent_error().
				996	*/
				997	STATIC int
				998	xfs_reflink_remap_extent(
				999	struct xfs_inode *ip,
				1000	struct xfs_bmbt_irec *irec,
				1001	xfs_fileoff_t destoff,
				1002	xfs_off_t new_isize)
				1003	{
				1004	struct xfs_mount *mp = ip->i_mount;
				1005	bool real_extent = xfs_bmap_is_real_extent(irec);
				1006	struct xfs_trans *tp;
				1007	unsigned int resblks;
				1008	struct xfs_bmbt_irec uirec;
				1009	xfs_filblks_t rlen;
				1010	xfs_filblks_t unmap_len;
				1011	xfs_off_t newlen;
				1012	int error;
				1013
					/* Unmap from destoff up to the last block that irec will occupy. */
				1014	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
				1015	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
				1016
				1017	/* No reflinking if we're low on space */
				1018	if (real_extent) {
				1019	error = xfs_reflink_ag_has_free_space(mp,
				1020	XFS_FSB_TO_AGNO(mp, irec->br_startblock));
				1021	if (error)
				1022	goto out;
				1023	}
				1024
				1025	/* Start a rolling transaction to switch the mappings */
				1026	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
				1027	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
				1028	if (error)
				1029	goto out;
				1030
				1031	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1032	xfs_trans_ijoin(tp, ip, 0); /* lock flags 0: ip is unlocked explicitly below */
				1033
				1034	/* If we're not just clearing space, then do we have enough quota? */
				1035	if (real_extent) {
				1036	error = xfs_trans_reserve_quota_nblks(tp, ip,
				1037	irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
				1038	if (error)
				1039	goto out_cancel;
				1040	}
				1041
				1042	trace_xfs_reflink_remap(ip, irec->br_startoff,
				1043	irec->br_blockcount, irec->br_startblock);
				1044
				1045	/* Unmap the old blocks in the data fork. */
				1046	rlen = unmap_len;
				1047	while (rlen) {
				1048	ASSERT(tp->t_firstblock == NULLFSBLOCK);
					/* rlen is in/out: on return it holds what is still left to unmap. */
				1049	error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
				1050	if (error)
				1051	goto out_cancel;
				1052
				1053	/*
				1054	* Trim the extent to whatever got unmapped.
				1055	* Remember, bunmapi works backwards.
					*
					* uirec is the slice of irec at block offsets [rlen, unmap_len),
					* i.e. the piece whose destination range was just freed.
					* NOTE(review): this arithmetic assumes unmap_len is measured
					* from irec->br_startoff (destoff == irec->br_startoff) -
					* confirm against the caller.
				1056	*/
				1057	uirec.br_startblock = irec->br_startblock + rlen;
				1058	uirec.br_startoff = irec->br_startoff + rlen;
				1059	uirec.br_blockcount = unmap_len - rlen;
				1060	unmap_len = rlen;
				1061
				1062	/* If this isn't a real mapping, we're done. */
				1063	if (!real_extent \|\| uirec.br_blockcount == 0)
				1064	goto next_extent;
				1065
				1066	trace_xfs_reflink_remap(ip, uirec.br_startoff,
				1067	uirec.br_blockcount, uirec.br_startblock);
				1068
				1069	/* Update the refcount tree */
				1070	error = xfs_refcount_increase_extent(tp, &uirec);
				1071	if (error)
				1072	goto out_cancel;
				1073
				1074	/* Map the new blocks into the data fork. */
				1075	error = xfs_bmap_map_extent(tp, ip, &uirec);
				1076	if (error)
				1077	goto out_cancel;
				1078
				1079	/* Update quota accounting. */
				1080	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
				1081	uirec.br_blockcount);
				1082
				1083	/* Update dest isize if needed. */
				1084	newlen = XFS_FSB_TO_B(mp,
				1085	uirec.br_startoff + uirec.br_blockcount);
				1086	newlen = min_t(xfs_off_t, newlen, new_isize); /* never grow past the caller's limit */
				1087	if (newlen > i_size_read(VFS_I(ip))) {
				1088	trace_xfs_reflink_update_inode_size(ip, newlen);
				1089	i_size_write(VFS_I(ip), newlen);
				1090	ip->i_d.di_size = newlen;
				1091	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
				1092	}
				1093
				1094	next_extent:
				1095	/* Process all the deferred stuff. */
				1096	error = xfs_defer_finish(&tp);
				1097	if (error)
				1098	goto out_cancel;
				1099	}
				1100
				1101	error = xfs_trans_commit(tp);
				1102	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1103	if (error)
				1104	goto out;
				1105	return 0;
				1106
					/* Error after the transaction was allocated: cancel it, then unlock. */
				1107	out_cancel:
				1108	xfs_trans_cancel(tp);
				1109	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1110	out:
				1111	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
				1112	return error;
				1113	}
				1114
				1115	/*
				1116	* Iteratively remap one file's extents (and holes) to another's.
				1117	*/
				1118	STATIC int
				1119	xfs_reflink_remap_blocks(
				1120	struct xfs_inode *src,
				1121	xfs_fileoff_t srcoff,
				1122	struct xfs_inode *dest,
				1123	xfs_fileoff_t destoff,
				1124	xfs_filblks_t len,
				1125	xfs_off_t new_isize)
				1126	{
				1127	struct xfs_bmbt_irec imap;
				1128	int nimaps;
				1129	int error = 0;
				1130	xfs_filblks_t range_len;
				1131
				1132	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
				1133	while (len) {
				1134	uint lock_mode;
				1135
				1136	trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
				1137	dest, destoff);
				1138
				1139	/* Read extent from the source file */
				1140	nimaps = 1;
				1141	lock_mode = xfs_ilock_data_map_shared(src);
				1142	error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
				1143	xfs_iunlock(src, lock_mode);
				1144	if (error)
				1145	goto err;
				1146	ASSERT(nimaps == 1);
				1147
				1148	trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
				1149	&imap);
				1150
				1151	/* Translate imap into the destination file. */
				1152	range_len = imap.br_startoff + imap.br_blockcount - srcoff;
				1153	imap.br_startoff += destoff - srcoff;
				1154
				1155	/* Clear dest from destoff to the end of imap and map it in. */
				1156	error = xfs_reflink_remap_extent(dest, &imap, destoff,
				1157	new_isize);
				1158	if (error)
				1159	goto err;
				1160
				1161	if (fatal_signal_pending(current)) {
				1162	error = -EINTR;
				1163	goto err;
				1164	}
				1165
				1166	/* Advance drange/srange */
				1167	srcoff += range_len;
				1168	destoff += range_len;
				1169	len -= range_len;
				1170	}
				1171
				1172	return 0;
				1173
				1174	err:
				1175	trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
				1176	return error;
				1177	}
				1178
				1179	/*
				1180	* Grab the exclusive iolock for a data copy from src to dest, making
				1181	* sure to abide vfs locking order (lowest pointer value goes first) and
				1182	* breaking the pnfs layout leases on dest before proceeding. The loop
				1183	* is needed because we cannot call the blocking break_layout() with the
				1184	* src iolock held, and therefore have to back out both locks.
					*
					* src is locked with inode_lock_shared*() and dest with inode_lock*().
					* When src == dest only the single lock on dest is taken.
					*
					* Returns 0 with the locks held, or an error from break_layout() with
					* every lock taken here released again.
				1185	*/
				1186	static int
				1187	xfs_iolock_two_inodes_and_break_layout(
				1188	struct inode *src,
				1189	struct inode *dest)
				1190	{
				1191	int error;
				1192
				1193	retry:
				1194	if (src < dest) {
					/* src has the lower address, so it is locked first. */
				1195	inode_lock_shared(src);
				1196	inode_lock_nested(dest, I_MUTEX_NONDIR2);
				1197	} else {
				1198	/* src >= dest */
					/* dest goes first; a distinct src is locked after the layout break. */
				1199	inode_lock(dest);
				1200	}
				1201
					/* First try to break the layout without blocking. */
				1202	error = break_layout(dest, false);
				1203	if (error == -EWOULDBLOCK) {
					/* Drop everything, break with waiting allowed, then start over. */
				1204	inode_unlock(dest);
				1205	if (src < dest)
				1206	inode_unlock_shared(src);
				1207	error = break_layout(dest, true);
				1208	if (error)
				1209	return error;
				1210	goto retry;
				1211	}
				1212	if (error) {
					/* Any other failure: release what we hold and give up. */
				1213	inode_unlock(dest);
				1214	if (src < dest)
				1215	inode_unlock_shared(src);
				1216	return error;
				1217	}
					/* Finish the src >= dest case; nothing more to lock when src == dest. */
				1218	if (src > dest)
				1219	inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
				1220	return 0;
				1221	}
				1222
				1223	/* Unlock both inodes after they've been prepped for a range clone. */
				1224	STATIC void
				1225	xfs_reflink_remap_unlock(
				1226	struct file *file_in,
				1227	struct file *file_out)
				1228	{
				1229	struct inode *inode_in = file_inode(file_in);
				1230	struct xfs_inode *src = XFS_I(inode_in);
				1231	struct inode *inode_out = file_inode(file_out);
				1232	struct xfs_inode *dest = XFS_I(inode_out);
				1233	bool same_inode = (inode_in == inode_out);
				1234
				1235	xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
				1236	if (!same_inode)
				1237	xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
				1238	inode_unlock(inode_out);
				1239	if (!same_inode)
				1240	inode_unlock_shared(inode_in);
				1241	}
				1242
				1243	/*
				1244	* If we're reflinking to a point past the destination file's EOF, we must
				1245	* zero any speculative post-EOF preallocations that sit between the old EOF
				1246	* and the destination file offset.
				1247	*/
				1248	static int
				1249	xfs_reflink_zero_posteof(
				1250	struct xfs_inode *ip,
				1251	loff_t pos)
				1252	{
				1253	loff_t isize = i_size_read(VFS_I(ip));
				1254
				1255	if (pos <= isize)
				1256	return 0;
				1257
				1258	trace_xfs_zero_eof(ip, isize, pos - isize);
				1259	return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
				1260	&xfs_iomap_ops);
				1261	}
				1262
				1263	/*
				1264	* Prepare two files for range cloning. Upon a successful return both inodes
				1265	* will have the iolock and mmaplock held, the page cache of the out file will
				1266	* be truncated, and any leases on the out file will have been broken. This
				1267	* function borrows heavily from xfs_file_aio_write_checks.
				1268	*
				1269	* The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
				1270	* checked that the bytes beyond EOF physically match. Hence we cannot use the
				1271	* EOF block in the source dedupe range because it's not a complete block match,
				1272	* hence can introduce a corruption into the file that has its block replaced.
				1273	*
				1274	* In similar fashion, the VFS file cloning also allows partial EOF blocks to be
				1275	* "block aligned" for the purposes of cloning entire files. However, if the
				1276	* source file range includes the EOF block and it lands within the existing EOF
				1277	* of the destination file, then we can expose stale data from beyond the source
				1278	* file EOF in the destination file.
				1279	*
				1280	* XFS doesn't support partial block sharing, so in both cases we have to check
				1281	* these cases ourselves. For dedupe, we can simply round the length to dedupe
				1282	* down to the previous whole block and ignore the partial EOF block. While this
				1283	* means we can't dedupe the last block of a file, this is an acceptable
				1284	* tradeoff for simplicity on implementation.
				1285	*
				1286	* For cloning, we want to share the partial EOF block if it is also the new EOF
				1287	* block of the destination file. If the partial EOF block lies inside the
				1288	* existing destination EOF, then we have to abort the clone to avoid exposing
				1289	* stale data in the destination file. Hence we reject these clone attempts with
				1290	* -EINVAL in this case.
					*
					* len is in/out: vfs_clone_file_prep_inodes() may adjust it, and for dedupe
					* it is rounded down to a whole number of blocks.
					*
					* Returns 1 when the caller should go ahead with the remap (locks held).
					* Otherwise returns the value of the step that stopped us (0 or a negative
					* errno) after dropping every lock taken here.
				1291	*/
				1292	STATIC int
				1293	xfs_reflink_remap_prep(
				1294	struct file *file_in,
				1295	loff_t pos_in,
				1296	struct file *file_out,
				1297	loff_t pos_out,
				1298	u64 *len,
				1299	bool is_dedupe)
				1300	{
				1301	struct inode *inode_in = file_inode(file_in);
				1302	struct xfs_inode *src = XFS_I(inode_in);
				1303	struct inode *inode_out = file_inode(file_out);
				1304	struct xfs_inode *dest = XFS_I(inode_out);
				1305	bool same_inode = (inode_in == inode_out);
				1306	u64 blkmask = i_blocksize(inode_in) - 1; /* low bits: byte offset within a block */
				1307	ssize_t ret;
				1308
				1309	/* Lock both files against IO */
				1310	ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
				1311	if (ret)
				1312	return ret;
					/* Then the mmap locks: exclusive on dest, shared on a distinct src. */
				1313	if (same_inode)
				1314	xfs_ilock(src, XFS_MMAPLOCK_EXCL);
				1315	else
				1316	xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
				1317	XFS_MMAPLOCK_EXCL);
				1318
				1319	/* Check file eligibility and prepare for block sharing. */
				1320	ret = -EINVAL;
				1321	/* Don't reflink realtime inodes */
				1322	if (XFS_IS_REALTIME_INODE(src) \|\| XFS_IS_REALTIME_INODE(dest))
				1323	goto out_unlock;
				1324
				1325	/* Don't share DAX file data for now. */
				1326	if (IS_DAX(inode_in) \|\| IS_DAX(inode_out))
				1327	goto out_unlock;
				1328
				1329	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
				1330	len, is_dedupe);
					/* Zero and negative results both end the operation here. */
				1331	if (ret <= 0)
				1332	goto out_unlock;
				1333
				1334	/*
				1335	* If the dedupe data matches, chop off the partial EOF block
				1336	* from the source file so we don't try to dedupe the partial
				1337	* EOF block.
				1338	*/
				1339	if (is_dedupe) {
				1340	*len &= ~blkmask;
				1341	} else if (*len & blkmask) {
				1342	/*
				1343	* The user is attempting to share a partial EOF block,
				1344	* if it's inside the destination EOF then reject it.
				1345	*/
				1346	if (pos_out + *len < i_size_read(inode_out)) {
				1347	ret = -EINVAL;
				1348	goto out_unlock;
				1349	}
				1350	}
				1351
				1352	/* Attach dquots to dest inode before changing block map */
				1353	ret = xfs_qm_dqattach(dest);
				1354	if (ret)
				1355	goto out_unlock;
				1356
				1357	/*
				1358	* Zero existing post-eof speculative preallocations in the destination
				1359	* file.
				1360	*/
				1361	ret = xfs_reflink_zero_posteof(dest, pos_out);
				1362	if (ret)
				1363	goto out_unlock;
				1364
				1365	/* Set flags and remap blocks. */
				1366	ret = xfs_reflink_set_inode_flag(src, dest);
				1367	if (ret)
				1368	goto out_unlock;
				1369
				1370	/* Zap any page cache for the destination file's range. */
				1371	truncate_inode_pages_range(&inode_out->i_data, pos_out,
				1372	PAGE_ALIGN(pos_out + *len) - 1);
				1373
				1374	/* If we're altering the file contents... */
				1375	if (!is_dedupe) {
				1376	/*
				1377	* ...update the timestamps (which will grab the ilock again
				1378	* from xfs_fs_dirty_inode, so we have to call it before we
				1379	* take the ilock).
				1380	*/
				1381	if (!(file_out->f_mode & FMODE_NOCMTIME)) {
				1382	ret = file_update_time(file_out);
				1383	if (ret)
				1384	goto out_unlock;
				1385	}
				1386
				1387	/*
				1388	* ...clear the security bits if the process is not being run
				1389	* by root. This keeps people from modifying setuid and setgid
				1390	* binaries.
				1391	*/
				1392	ret = file_remove_privs(file_out);
				1393	if (ret)
				1394	goto out_unlock;
				1395	}
				1396
					/* Success: return with the iolocks and mmaplocks still held. */
				1397	return 1;
				1398	out_unlock:
				1399	xfs_reflink_remap_unlock(file_in, file_out);
				1400	return ret;
				1401	}
				1402
				1403	/*
				1404	* Link a range of blocks from one file to another.
				1405	*/
				1406	int
				1407	xfs_reflink_remap_range(
				1408	struct file *file_in,
				1409	loff_t pos_in,
				1410	struct file *file_out,
				1411	loff_t pos_out,
				1412	u64 len,
				1413	bool is_dedupe)
				1414	{
				1415	struct inode *inode_in = file_inode(file_in);
				1416	struct xfs_inode *src = XFS_I(inode_in);
				1417	struct inode *inode_out = file_inode(file_out);
				1418	struct xfs_inode *dest = XFS_I(inode_out);
				1419	struct xfs_mount *mp = src->i_mount;
				1420	xfs_fileoff_t sfsbno, dfsbno;
				1421	xfs_filblks_t fsblen;
				1422	xfs_extlen_t cowextsize;
				1423	ssize_t ret;
				1424
				1425	if (!xfs_sb_version_hasreflink(&mp->m_sb))
				1426	return -EOPNOTSUPP;
				1427
				1428	if (XFS_FORCED_SHUTDOWN(mp))
				1429	return -EIO;
				1430
				1431	/* Prepare and then clone file data. */
				1432	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
				1433	&len, is_dedupe);
				1434	if (ret <= 0)
				1435	return ret;
				1436
				1437	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
				1438
				1439	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
				1440	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
				1441	fsblen = XFS_B_TO_FSB(mp, len);
				1442	ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
				1443	pos_out + len);
				1444	if (ret)
				1445	goto out_unlock;
				1446
				1447	/*
				1448	* Carry the cowextsize hint from src to dest if we're sharing the
				1449	* entire source file to the entire destination file, the source file
				1450	* has a cowextsize hint, and the destination file does not.
				1451	*/
				1452	cowextsize = 0;
				1453	if (pos_in == 0 && len == i_size_read(inode_in) &&
				1454	(src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
				1455	pos_out == 0 && len >= i_size_read(inode_out) &&
				1456	!(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
				1457	cowextsize = src->i_d.di_cowextsize;
				1458
				1459	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
				1460	is_dedupe);
				1461
				1462	out_unlock:
				1463	xfs_reflink_remap_unlock(file_in, file_out);
				1464	if (ret)
				1465	trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
				1466	return ret;
				1467	}
				1468
				1469	/*
				1470	* The user wants to preemptively CoW all shared blocks in this file,
				1471	* which enables us to turn off the reflink flag. Iterate all
				1472	* extents which are not prealloc/delalloc to see which ranges are
				1473	* mentioned in the refcount tree, then read those blocks into the
				1474	* pagecache, dirty them, fsync them back out, and then we can update
				1475	* the inode flag. What happens if we run out of memory? :)
					*
					* This helper does the "find shared ranges and dirty them" part for file
					* blocks [fbno, end). isize is the file size in bytes; a range handed to
					* iomap_file_dirty() is cut short so that it does not extend past it.
					*
					* ILOCK_EXCL is dropped around each iomap_file_dirty() call and retaken
					* afterwards, so the caller is expected to hold it on entry and will hold
					* it again on return. Returns 0 or the first error encountered.
				1476	*/
				1477	STATIC int
				1478	xfs_reflink_dirty_extents(
				1479	struct xfs_inode *ip,
				1480	xfs_fileoff_t fbno,
				1481	xfs_filblks_t end,
				1482	xfs_off_t isize)
				1483	{
				1484	struct xfs_mount *mp = ip->i_mount;
				1485	xfs_agnumber_t agno;
				1486	xfs_agblock_t agbno;
				1487	xfs_extlen_t aglen;
				1488	xfs_agblock_t rbno;
				1489	xfs_extlen_t rlen;
				1490	xfs_off_t fpos;
				1491	xfs_off_t flen;
				1492	struct xfs_bmbt_irec map[2]; /* [0]: mapping as read, [1]: unprocessed remainder */
				1493	int nmaps;
				1494	int error = 0;
				1495
				1496	while (end - fbno > 0) {
				1497	nmaps = 1;
				1498	/*
				1499	* Look for extents in the file. Skip holes, delalloc, or
				1500	* unwritten extents; they can't be reflinked.
				1501	*/
				1502	error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
				1503	if (error)
				1504	goto out;
				1505	if (nmaps == 0)
				1506	break;
				1507	if (!xfs_bmap_is_real_extent(&map[0]))
				1508	goto next;
				1509
					/* Walk a copy so map[0] stays intact for advancing fbno below. */
				1510	map[1] = map[0];
				1511	while (map[1].br_blockcount) {
				1512	agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
				1513	agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
				1514	aglen = map[1].br_blockcount;
				1515
				1516	error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
				1517	aglen, &rbno, &rlen, true);
				1518	if (error)
				1519	goto out;
					/* No shared blocks reported in the rest of this extent. */
				1520	if (rbno == NULLAGBLOCK)
				1521	break;
				1522
				1523	/* Dirty the pages */
				1524	xfs_iunlock(ip, XFS_ILOCK_EXCL);
					/* Byte position and length of the shared run [rbno, rbno + rlen). */
				1525	fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
				1526	(rbno - agbno));
				1527	flen = XFS_FSB_TO_B(mp, rlen);
					/* Do not let the range run past isize. */
				1528	if (fpos + flen > isize)
				1529	flen = isize - fpos;
				1530	error = iomap_file_dirty(VFS_I(ip), fpos, flen,
				1531	&xfs_iomap_ops);
				1532	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1533	if (error)
				1534	goto out;
				1535
					/* Skip everything up to and including the shared run just dirtied. */
				1536	map[1].br_blockcount -= (rbno - agbno + rlen);
				1537	map[1].br_startoff += (rbno - agbno + rlen);
				1538	map[1].br_startblock += (rbno - agbno + rlen);
				1539	}
				1540
				1541	next:
					/* Continue with the first block after the mapping just examined. */
				1542	fbno = map[0].br_startoff + map[0].br_blockcount;
				1543	}
				1544	out:
				1545	return error;
				1546	}
				1547
				1548	/* Does this inode need the reflink flag? */
				1549	int
				1550	xfs_reflink_inode_has_shared_extents(
				1551	struct xfs_trans *tp,
				1552	struct xfs_inode *ip,
				1553	bool *has_shared)
				1554	{
				1555	struct xfs_bmbt_irec got;
				1556	struct xfs_mount *mp = ip->i_mount;
				1557	struct xfs_ifork *ifp;
				1558	xfs_agnumber_t agno;
				1559	xfs_agblock_t agbno;
				1560	xfs_extlen_t aglen;
				1561	xfs_agblock_t rbno;
				1562	xfs_extlen_t rlen;
				1563	struct xfs_iext_cursor icur;
				1564	bool found;
				1565	int error;
				1566
				1567	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
				1568	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
				1569	error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
				1570	if (error)
				1571	return error;
				1572	}
				1573
				1574	*has_shared = false;
				1575	found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
				1576	while (found) {
				1577	if (isnullstartblock(got.br_startblock) \|\|
				1578	got.br_state != XFS_EXT_NORM)
				1579	goto next;
				1580	agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
				1581	agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
				1582	aglen = got.br_blockcount;
				1583
				1584	error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
				1585	&rbno, &rlen, false);
				1586	if (error)
				1587	return error;
				1588	/* Is there still a shared block here? */
				1589	if (rbno != NULLAGBLOCK) {
				1590	*has_shared = true;
				1591	return 0;
				1592	}
				1593	next:
				1594	found = xfs_iext_next_extent(ifp, &icur, &got);
				1595	}
				1596
				1597	return 0;
				1598	}
				1599
				1600	/*
				1601	* Clear the inode reflink flag if there are no shared extents.
				1602	*
				1603	* The caller is responsible for joining the inode to the transaction passed in.
				1604	* The inode will be joined to the transaction that is returned to the caller.
				1605	*/
				1606	int
				1607	xfs_reflink_clear_inode_flag(
				1608	struct xfs_inode *ip,
				1609	struct xfs_trans **tpp)
				1610	{
				1611	bool needs_flag;
				1612	int error = 0;
				1613
				1614	ASSERT(xfs_is_reflink_inode(ip));
				1615
				1616	error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
				1617	if (error \|\| needs_flag)
				1618	return error;
				1619
				1620	/*
				1621	* We didn't find any shared blocks so turn off the reflink flag.
				1622	* First, get rid of any leftover CoW mappings.
				1623	*/
				1624	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
				1625	if (error)
				1626	return error;
				1627
				1628	/* Clear the inode flag. */
				1629	trace_xfs_reflink_unset_inode_flag(ip);
				1630	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
				1631	xfs_inode_clear_cowblocks_tag(ip);
				1632	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
				1633
				1634	return error;
				1635	}
				1636
				1637	/*
				1638	* Clear the inode reflink flag if there are no shared extents and the size
				1639	* hasn't changed.
				1640	*/
				1641	STATIC int
				1642	xfs_reflink_try_clear_inode_flag(
				1643	struct xfs_inode *ip)
				1644	{
				1645	struct xfs_mount *mp = ip->i_mount;
				1646	struct xfs_trans *tp;
				1647	int error = 0;
				1648
				1649	/* Start a rolling transaction to remove the mappings */
				1650	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
				1651	if (error)
				1652	return error;
				1653
				1654	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1655	xfs_trans_ijoin(tp, ip, 0);
				1656
				1657	error = xfs_reflink_clear_inode_flag(ip, &tp);
				1658	if (error)
				1659	goto cancel;
				1660
				1661	error = xfs_trans_commit(tp);
				1662	if (error)
				1663	goto out;
				1664
				1665	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1666	return 0;
				1667	cancel:
				1668	xfs_trans_cancel(tp);
				1669	out:
				1670	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1671	return error;
				1672	}
				1673
				1674	/*
				1675	* Pre-COW all shared blocks within a given byte range of a file and turn off
				1676	* the reflink flag if we unshare all of the file's blocks.
				1677	*/
				1678	int
				1679	xfs_reflink_unshare(
				1680	struct xfs_inode *ip,
				1681	xfs_off_t offset,
				1682	xfs_off_t len)
				1683	{
				1684	struct xfs_mount *mp = ip->i_mount;
				1685	xfs_fileoff_t fbno;
				1686	xfs_filblks_t end;
				1687	xfs_off_t isize;
				1688	int error;
				1689
				1690	if (!xfs_is_reflink_inode(ip))
				1691	return 0;
				1692
				1693	trace_xfs_reflink_unshare(ip, offset, len);
				1694
				1695	inode_dio_wait(VFS_I(ip));
				1696
				1697	/* Try to CoW the selected ranges */
				1698	xfs_ilock(ip, XFS_ILOCK_EXCL);
				1699	fbno = XFS_B_TO_FSBT(mp, offset);
				1700	isize = i_size_read(VFS_I(ip));
				1701	end = XFS_B_TO_FSB(mp, offset + len);
				1702	error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
				1703	if (error)
				1704	goto out_unlock;
				1705	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1706
				1707	/* Wait for the IO to finish */
				1708	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
				1709	if (error)
				1710	goto out;
				1711
				1712	/* Turn off the reflink flag if possible. */
				1713	error = xfs_reflink_try_clear_inode_flag(ip);
				1714	if (error)
				1715	goto out;
				1716
				1717	return 0;
				1718
				1719	out_unlock:
				1720	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				1721	out:
				1722	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
				1723	return error;
				1724	}