Blame - fs/ocfs2/file.c - hafnium/third_party/linux.git

blob: 9fa35cb6f6e0b5b38023f45512fc75200be4c694 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	/* -- mode: c; c-basic-offset: 8; --
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* file.c
				5	*
				6	* File open, close, extend, truncate
				7	*
				8	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				9	*
				10	* This program is free software; you can redistribute it and/or
				11	* modify it under the terms of the GNU General Public
				12	* License as published by the Free Software Foundation; either
				13	* version 2 of the License, or (at your option) any later version.
				14	*
				15	* This program is distributed in the hope that it will be useful,
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* You should have received a copy of the GNU General Public
				21	* License along with this program; if not, write to the
				22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				23	* Boston, MA 021110-1307, USA.
				24	*/
				25
				26	#include <linux/capability.h>
				27	#include <linux/fs.h>
				28	#include <linux/types.h>
				29	#include <linux/slab.h>
				30	#include <linux/highmem.h>
				31	#include <linux/pagemap.h>
				32	#include <linux/uio.h>
				33	#include <linux/sched.h>
				34	#include <linux/splice.h>
				35	#include <linux/mount.h>
				36	#include <linux/writeback.h>
				37	#include <linux/falloc.h>
				38	#include <linux/quotaops.h>
				39	#include <linux/blkdev.h>
				40	#include <linux/backing-dev.h>
				41
				42	#include <cluster/masklog.h>
				43
				44	#include "ocfs2.h"
				45
				46	#include "alloc.h"
				47	#include "aops.h"
				48	#include "dir.h"
				49	#include "dlmglue.h"
				50	#include "extent_map.h"
				51	#include "file.h"
				52	#include "sysfile.h"
				53	#include "inode.h"
				54	#include "ioctl.h"
				55	#include "journal.h"
				56	#include "locks.h"
				57	#include "mmap.h"
				58	#include "suballoc.h"
				59	#include "super.h"
				60	#include "xattr.h"
				61	#include "acl.h"
				62	#include "quota.h"
				63	#include "refcounttree.h"
				64	#include "ocfs2_trace.h"
				65
				66	#include "buffer_head_io.h"
				67
				68	static int ocfs2_init_file_private(struct inode inode, struct file file)
				69	{
				70	struct ocfs2_file_private *fp;
				71
				72	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
				73	if (!fp)
				74	return -ENOMEM;
				75
				76	fp->fp_file = file;
				77	mutex_init(&fp->fp_mutex);
				78	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
				79	file->private_data = fp;
				80
				81	return 0;
				82	}
				83
				84	static void ocfs2_free_file_private(struct inode inode, struct file file)
				85	{
				86	struct ocfs2_file_private *fp = file->private_data;
				87	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				88
				89	if (fp) {
				90	ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
				91	ocfs2_lock_res_free(&fp->fp_flock);
				92	kfree(fp);
				93	file->private_data = NULL;
				94	}
				95	}
				96
				97	static int ocfs2_file_open(struct inode inode, struct file file)
				98	{
				99	int status;
				100	int mode = file->f_flags;
				101	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				102
				103	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
				104	(unsigned long long)oi->ip_blkno,
				105	file->f_path.dentry->d_name.len,
				106	file->f_path.dentry->d_name.name, mode);
				107
				108	if (file->f_mode & FMODE_WRITE) {
				109	status = dquot_initialize(inode);
				110	if (status)
				111	goto leave;
				112	}
				113
				114	spin_lock(&oi->ip_lock);
				115
				116	/* Check that the inode hasn't been wiped from disk by another
				117	* node. If it hasn't then we're safe as long as we hold the
				118	* spin lock until our increment of open count. */
				119	if (oi->ip_flags & OCFS2_INODE_DELETED) {
				120	spin_unlock(&oi->ip_lock);
				121
				122	status = -ENOENT;
				123	goto leave;
				124	}
				125
				126	if (mode & O_DIRECT)
				127	oi->ip_flags \|= OCFS2_INODE_OPEN_DIRECT;
				128
				129	oi->ip_open_count++;
				130	spin_unlock(&oi->ip_lock);
				131
				132	status = ocfs2_init_file_private(inode, file);
				133	if (status) {
				134	/*
				135	* We want to set open count back if we're failing the
				136	* open.
				137	*/
				138	spin_lock(&oi->ip_lock);
				139	oi->ip_open_count--;
				140	spin_unlock(&oi->ip_lock);
				141	}
				142
				143	file->f_mode \|= FMODE_NOWAIT;
				144
				145	leave:
				146	return status;
				147	}
				148
				149	static int ocfs2_file_release(struct inode inode, struct file file)
				150	{
				151	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				152
				153	spin_lock(&oi->ip_lock);
				154	if (!--oi->ip_open_count)
				155	oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
				156
				157	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
				158	oi->ip_blkno,
				159	file->f_path.dentry->d_name.len,
				160	file->f_path.dentry->d_name.name,
				161	oi->ip_open_count);
				162	spin_unlock(&oi->ip_lock);
				163
				164	ocfs2_free_file_private(inode, file);
				165
				166	return 0;
				167	}
				168
				169	static int ocfs2_dir_open(struct inode inode, struct file file)
				170	{
				171	return ocfs2_init_file_private(inode, file);
				172	}
				173
				174	static int ocfs2_dir_release(struct inode inode, struct file file)
				175	{
				176	ocfs2_free_file_private(inode, file);
				177	return 0;
				178	}
				179
				180	static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
				181	int datasync)
				182	{
				183	int err = 0;
				184	struct inode *inode = file->f_mapping->host;
				185	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				186	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				187	journal_t *journal = osb->journal->j_journal;
				188	int ret;
				189	tid_t commit_tid;
				190	bool needs_barrier = false;
				191
				192	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
				193	oi->ip_blkno,
				194	file->f_path.dentry->d_name.len,
				195	file->f_path.dentry->d_name.name,
				196	(unsigned long long)datasync);
				197
				198	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
				199	return -EROFS;
				200
				201	err = file_write_and_wait_range(file, start, end);
				202	if (err)
				203	return err;
				204
				205	commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
				206	if (journal->j_flags & JBD2_BARRIER &&
				207	!jbd2_trans_will_send_data_barrier(journal, commit_tid))
				208	needs_barrier = true;
				209	err = jbd2_complete_transaction(journal, commit_tid);
				210	if (needs_barrier) {
				211	ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
				212	if (!err)
				213	err = ret;
				214	}
				215
				216	if (err)
				217	mlog_errno(err);
				218
				219	return (err < 0) ? -EIO : 0;
				220	}
				221
				222	int ocfs2_should_update_atime(struct inode *inode,
				223	struct vfsmount *vfsmnt)
				224	{
				225	struct timespec64 now;
				226	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				227
				228	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
				229	return 0;
				230
				231	if ((inode->i_flags & S_NOATIME) \|\|
				232	((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
				233	return 0;
				234
				235	/*
				236	* We can be called with no vfsmnt structure - NFSD will
				237	* sometimes do this.
				238	*
				239	* Note that our action here is different than touch_atime() -
				240	* if we can't tell whether this is a noatime mount, then we
				241	* don't know whether to trust the value of s_atime_quantum.
				242	*/
				243	if (vfsmnt == NULL)
				244	return 0;
				245
				246	if ((vfsmnt->mnt_flags & MNT_NOATIME) \|\|
				247	((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
				248	return 0;
				249
				250	if (vfsmnt->mnt_flags & MNT_RELATIME) {
				251	if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) \|\|
				252	(timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0))
				253	return 1;
				254
				255	return 0;
				256	}
				257
				258	now = current_time(inode);
				259	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
				260	return 0;
				261	else
				262	return 1;
				263	}
				264
				265	int ocfs2_update_inode_atime(struct inode *inode,
				266	struct buffer_head *bh)
				267	{
				268	int ret;
				269	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				270	handle_t *handle;
				271	struct ocfs2_dinode di = (struct ocfs2_dinode ) bh->b_data;
				272
				273	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				274	if (IS_ERR(handle)) {
				275	ret = PTR_ERR(handle);
				276	mlog_errno(ret);
				277	goto out;
				278	}
				279
				280	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
				281	OCFS2_JOURNAL_ACCESS_WRITE);
				282	if (ret) {
				283	mlog_errno(ret);
				284	goto out_commit;
				285	}
				286
				287	/*
				288	* Don't use ocfs2_mark_inode_dirty() here as we don't always
				289	* have i_mutex to guard against concurrent changes to other
				290	* inode fields.
				291	*/
				292	inode->i_atime = current_time(inode);
				293	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
				294	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
				295	ocfs2_update_inode_fsync_trans(handle, inode, 0);
				296	ocfs2_journal_dirty(handle, bh);
				297
				298	out_commit:
				299	ocfs2_commit_trans(osb, handle);
				300	out:
				301	return ret;
				302	}
				303
				304	int ocfs2_set_inode_size(handle_t *handle,
				305	struct inode *inode,
				306	struct buffer_head *fe_bh,
				307	u64 new_i_size)
				308	{
				309	int status;
				310
				311	i_size_write(inode, new_i_size);
				312	inode->i_blocks = ocfs2_inode_sector_count(inode);
				313	inode->i_ctime = inode->i_mtime = current_time(inode);
				314
				315	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
				316	if (status < 0) {
				317	mlog_errno(status);
				318	goto bail;
				319	}
				320
				321	bail:
				322	return status;
				323	}
				324
				325	int ocfs2_simple_size_update(struct inode *inode,
				326	struct buffer_head *di_bh,
				327	u64 new_i_size)
				328	{
				329	int ret;
				330	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				331	handle_t *handle = NULL;
				332
				333	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				334	if (IS_ERR(handle)) {
				335	ret = PTR_ERR(handle);
				336	mlog_errno(ret);
				337	goto out;
				338	}
				339
				340	ret = ocfs2_set_inode_size(handle, inode, di_bh,
				341	new_i_size);
				342	if (ret < 0)
				343	mlog_errno(ret);
				344
				345	ocfs2_update_inode_fsync_trans(handle, inode, 0);
				346	ocfs2_commit_trans(osb, handle);
				347	out:
				348	return ret;
				349	}
				350
				351	static int ocfs2_cow_file_pos(struct inode *inode,
				352	struct buffer_head *fe_bh,
				353	u64 offset)
				354	{
				355	int status;
				356	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
				357	unsigned int num_clusters = 0;
				358	unsigned int ext_flags = 0;
				359
				360	/*
				361	* If the new offset is aligned to the range of the cluster, there is
				362	* no space for ocfs2_zero_range_for_truncate to fill, so no need to
				363	* CoW either.
				364	*/
				365	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
				366	return 0;
				367
				368	status = ocfs2_get_clusters(inode, cpos, &phys,
				369	&num_clusters, &ext_flags);
				370	if (status) {
				371	mlog_errno(status);
				372	goto out;
				373	}
				374
				375	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
				376	goto out;
				377
				378	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
				379
				380	out:
				381	return status;
				382	}
				383
				384	static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				385	struct inode *inode,
				386	struct buffer_head *fe_bh,
				387	u64 new_i_size)
				388	{
				389	int status;
				390	handle_t *handle;
				391	struct ocfs2_dinode *di;
				392	u64 cluster_bytes;
				393
				394	/*
				395	* We need to CoW the cluster contains the offset if it is reflinked
				396	* since we will call ocfs2_zero_range_for_truncate later which will
				397	* write "0" from offset to the end of the cluster.
				398	*/
				399	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
				400	if (status) {
				401	mlog_errno(status);
				402	return status;
				403	}
				404
				405	/* TODO: This needs to actually orphan the inode in this
				406	* transaction. */
				407
				408	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				409	if (IS_ERR(handle)) {
				410	status = PTR_ERR(handle);
				411	mlog_errno(status);
				412	goto out;
				413	}
				414
				415	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
				416	OCFS2_JOURNAL_ACCESS_WRITE);
				417	if (status < 0) {
				418	mlog_errno(status);
				419	goto out_commit;
				420	}
				421
				422	/*
				423	* Do this before setting i_size.
				424	*/
				425	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
				426	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
				427	cluster_bytes);
				428	if (status) {
				429	mlog_errno(status);
				430	goto out_commit;
				431	}
				432
				433	i_size_write(inode, new_i_size);
				434	inode->i_ctime = inode->i_mtime = current_time(inode);
				435
				436	di = (struct ocfs2_dinode *) fe_bh->b_data;
				437	di->i_size = cpu_to_le64(new_i_size);
				438	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
				439	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
				440	ocfs2_update_inode_fsync_trans(handle, inode, 0);
				441
				442	ocfs2_journal_dirty(handle, fe_bh);
				443
				444	out_commit:
				445	ocfs2_commit_trans(osb, handle);
				446	out:
				447	return status;
				448	}
				449
				450	int ocfs2_truncate_file(struct inode *inode,
				451	struct buffer_head *di_bh,
				452	u64 new_i_size)
				453	{
				454	int status = 0;
				455	struct ocfs2_dinode *fe = NULL;
				456	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				457
				458	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
				459	* already validated it */
				460	fe = (struct ocfs2_dinode *) di_bh->b_data;
				461
				462	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
				463	(unsigned long long)le64_to_cpu(fe->i_size),
				464	(unsigned long long)new_i_size);
				465
				466	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
				467	"Inode %llu, inode i_size = %lld != di "
				468	"i_size = %llu, i_flags = 0x%x\n",
				469	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				470	i_size_read(inode),
				471	(unsigned long long)le64_to_cpu(fe->i_size),
				472	le32_to_cpu(fe->i_flags));
				473
				474	if (new_i_size > le64_to_cpu(fe->i_size)) {
				475	trace_ocfs2_truncate_file_error(
				476	(unsigned long long)le64_to_cpu(fe->i_size),
				477	(unsigned long long)new_i_size);
				478	status = -EINVAL;
				479	mlog_errno(status);
				480	goto bail;
				481	}
				482
				483	down_write(&OCFS2_I(inode)->ip_alloc_sem);
				484
				485	ocfs2_resv_discard(&osb->osb_la_resmap,
				486	&OCFS2_I(inode)->ip_la_data_resv);
				487
				488	/*
				489	* The inode lock forced other nodes to sync and drop their
				490	* pages, which (correctly) happens even if we have a truncate
				491	* without allocation change - ocfs2 cluster sizes can be much
				492	* greater than page size, so we have to truncate them
				493	* anyway.
				494	*/
				495	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
				496	truncate_inode_pages(inode->i_mapping, new_i_size);
				497
				498	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
				499	status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
				500	i_size_read(inode), 1);
				501	if (status)
				502	mlog_errno(status);
				503
				504	goto bail_unlock_sem;
				505	}
				506
				507	/* alright, we're going to need to do a full blown alloc size
				508	* change. Orphan the inode so that recovery can complete the
				509	* truncate if necessary. This does the task of marking
				510	* i_size. */
				511	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
				512	if (status < 0) {
				513	mlog_errno(status);
				514	goto bail_unlock_sem;
				515	}
				516
				517	status = ocfs2_commit_truncate(osb, inode, di_bh);
				518	if (status < 0) {
				519	mlog_errno(status);
				520	goto bail_unlock_sem;
				521	}
				522
				523	/* TODO: orphan dir cleanup here. */
				524	bail_unlock_sem:
				525	up_write(&OCFS2_I(inode)->ip_alloc_sem);
				526
				527	bail:
				528	if (!status && OCFS2_I(inode)->ip_clusters == 0)
				529	status = ocfs2_try_remove_refcount_tree(inode, di_bh);
				530
				531	return status;
				532	}
				533
				534	/*
				535	* extend file allocation only here.
				536	* we'll update all the disk stuff, and oip->alloc_size
				537	*
				538	* expect stuff to be locked, a transaction started and enough data /
				539	* metadata reservations in the contexts.
				540	*
				541	* Will return -EAGAIN, and a reason if a restart is needed.
				542	* If passed in, *reason will always be set, even in error.
				543	*/
				544	int ocfs2_add_inode_data(struct ocfs2_super *osb,
				545	struct inode *inode,
				546	u32 *logical_offset,
				547	u32 clusters_to_add,
				548	int mark_unwritten,
				549	struct buffer_head *fe_bh,
				550	handle_t *handle,
				551	struct ocfs2_alloc_context *data_ac,
				552	struct ocfs2_alloc_context *meta_ac,
				553	enum ocfs2_alloc_restarted *reason_ret)
				554	{
				555	int ret;
				556	struct ocfs2_extent_tree et;
				557
				558	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
				559	ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
				560	clusters_to_add, mark_unwritten,
				561	data_ac, meta_ac, reason_ret);
				562
				563	return ret;
				564	}
				565
				566	static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				567	u32 clusters_to_add, int mark_unwritten)
				568	{
				569	int status = 0;
				570	int restart_func = 0;
				571	int credits;
				572	u32 prev_clusters;
				573	struct buffer_head *bh = NULL;
				574	struct ocfs2_dinode *fe = NULL;
				575	handle_t *handle = NULL;
				576	struct ocfs2_alloc_context *data_ac = NULL;
				577	struct ocfs2_alloc_context *meta_ac = NULL;
				578	enum ocfs2_alloc_restarted why = RESTART_NONE;
				579	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				580	struct ocfs2_extent_tree et;
				581	int did_quota = 0;
				582
				583	/*
				584	* Unwritten extent only exists for file systems which
				585	* support holes.
				586	*/
				587	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
				588
				589	status = ocfs2_read_inode_block(inode, &bh);
				590	if (status < 0) {
				591	mlog_errno(status);
				592	goto leave;
				593	}
				594	fe = (struct ocfs2_dinode *) bh->b_data;
				595
				596	restart_all:
				597	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
				598
				599	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
				600	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				601	&data_ac, &meta_ac);
				602	if (status) {
				603	mlog_errno(status);
				604	goto leave;
				605	}
				606
				607	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
				608	handle = ocfs2_start_trans(osb, credits);
				609	if (IS_ERR(handle)) {
				610	status = PTR_ERR(handle);
				611	handle = NULL;
				612	mlog_errno(status);
				613	goto leave;
				614	}
				615
				616	restarted_transaction:
				617	trace_ocfs2_extend_allocation(
				618	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				619	(unsigned long long)i_size_read(inode),
				620	le32_to_cpu(fe->i_clusters), clusters_to_add,
				621	why, restart_func);
				622
				623	status = dquot_alloc_space_nodirty(inode,
				624	ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
				625	if (status)
				626	goto leave;
				627	did_quota = 1;
				628
				629	/* reserve a write to the file entry early on - that we if we
				630	* run out of credits in the allocation path, we can still
				631	* update i_size. */
				632	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
				633	OCFS2_JOURNAL_ACCESS_WRITE);
				634	if (status < 0) {
				635	mlog_errno(status);
				636	goto leave;
				637	}
				638
				639	prev_clusters = OCFS2_I(inode)->ip_clusters;
				640
				641	status = ocfs2_add_inode_data(osb,
				642	inode,
				643	&logical_start,
				644	clusters_to_add,
				645	mark_unwritten,
				646	bh,
				647	handle,
				648	data_ac,
				649	meta_ac,
				650	&why);
				651	if ((status < 0) && (status != -EAGAIN)) {
				652	if (status != -ENOSPC)
				653	mlog_errno(status);
				654	goto leave;
				655	}
				656	ocfs2_update_inode_fsync_trans(handle, inode, 1);
				657	ocfs2_journal_dirty(handle, bh);
				658
				659	spin_lock(&OCFS2_I(inode)->ip_lock);
				660	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
				661	spin_unlock(&OCFS2_I(inode)->ip_lock);
				662	/* Release unused quota reservation */
				663	dquot_free_space(inode,
				664	ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
				665	did_quota = 0;
				666
				667	if (why != RESTART_NONE && clusters_to_add) {
				668	if (why == RESTART_META) {
				669	restart_func = 1;
				670	status = 0;
				671	} else {
				672	BUG_ON(why != RESTART_TRANS);
				673
				674	status = ocfs2_allocate_extend_trans(handle, 1);
				675	if (status < 0) {
				676	/* handle still has to be committed at
				677	* this point. */
				678	status = -ENOMEM;
				679	mlog_errno(status);
				680	goto leave;
				681	}
				682	goto restarted_transaction;
				683	}
				684	}
				685
				686	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
				687	le32_to_cpu(fe->i_clusters),
				688	(unsigned long long)le64_to_cpu(fe->i_size),
				689	OCFS2_I(inode)->ip_clusters,
				690	(unsigned long long)i_size_read(inode));
				691
				692	leave:
				693	if (status < 0 && did_quota)
				694	dquot_free_space(inode,
				695	ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
				696	if (handle) {
				697	ocfs2_commit_trans(osb, handle);
				698	handle = NULL;
				699	}
				700	if (data_ac) {
				701	ocfs2_free_alloc_context(data_ac);
				702	data_ac = NULL;
				703	}
				704	if (meta_ac) {
				705	ocfs2_free_alloc_context(meta_ac);
				706	meta_ac = NULL;
				707	}
				708	if ((!status) && restart_func) {
				709	restart_func = 0;
				710	goto restart_all;
				711	}
				712	brelse(bh);
				713	bh = NULL;
				714
				715	return status;
				716	}
				717
				718	/*
				719	* While a write will already be ordering the data, a truncate will not.
				720	* Thus, we need to explicitly order the zeroed pages.
				721	*/
				722	static handle_t ocfs2_zero_start_ordered_transaction(struct inode inode,
				723	struct buffer_head *di_bh)
				724	{
				725	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				726	handle_t *handle = NULL;
				727	int ret = 0;
				728
				729	if (!ocfs2_should_order_data(inode))
				730	goto out;
				731
				732	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				733	if (IS_ERR(handle)) {
				734	ret = -ENOMEM;
				735	mlog_errno(ret);
				736	goto out;
				737	}
				738
				739	ret = ocfs2_jbd2_file_inode(handle, inode);
				740	if (ret < 0) {
				741	mlog_errno(ret);
				742	goto out;
				743	}
				744
				745	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				746	OCFS2_JOURNAL_ACCESS_WRITE);
				747	if (ret)
				748	mlog_errno(ret);
				749	ocfs2_update_inode_fsync_trans(handle, inode, 1);
				750
				751	out:
				752	if (ret) {
				753	if (!IS_ERR(handle))
				754	ocfs2_commit_trans(osb, handle);
				755	handle = ERR_PTR(ret);
				756	}
				757	return handle;
				758	}
				759
				760	/* Some parts of this taken from generic_cont_expand, which turned out
				761	* to be too fragile to do exactly what we need without us having to
				762	* worry about recursive locking in ->write_begin() and ->write_end(). */
				763	static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
				764	u64 abs_to, struct buffer_head *di_bh)
				765	{
				766	struct address_space *mapping = inode->i_mapping;
				767	struct page *page;
				768	unsigned long index = abs_from >> PAGE_SHIFT;
				769	handle_t *handle;
				770	int ret = 0;
				771	unsigned zero_from, zero_to, block_start, block_end;
				772	struct ocfs2_dinode di = (struct ocfs2_dinode )di_bh->b_data;
				773
				774	BUG_ON(abs_from >= abs_to);
				775	BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
				776	BUG_ON(abs_from & (inode->i_blkbits - 1));
				777
				778	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
				779	if (IS_ERR(handle)) {
				780	ret = PTR_ERR(handle);
				781	goto out;
				782	}
				783
				784	page = find_or_create_page(mapping, index, GFP_NOFS);
				785	if (!page) {
				786	ret = -ENOMEM;
				787	mlog_errno(ret);
				788	goto out_commit_trans;
				789	}
				790
				791	/* Get the offsets within the page that we want to zero */
				792	zero_from = abs_from & (PAGE_SIZE - 1);
				793	zero_to = abs_to & (PAGE_SIZE - 1);
				794	if (!zero_to)
				795	zero_to = PAGE_SIZE;
				796
				797	trace_ocfs2_write_zero_page(
				798	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				799	(unsigned long long)abs_from,
				800	(unsigned long long)abs_to,
				801	index, zero_from, zero_to);
				802
				803	/* We know that zero_from is block aligned */
				804	for (block_start = zero_from; block_start < zero_to;
				805	block_start = block_end) {
				806	block_end = block_start + i_blocksize(inode);
				807
				808	/*
				809	* block_start is block-aligned. Bump it by one to force
				810	* __block_write_begin and block_commit_write to zero the
				811	* whole block.
				812	*/
				813	ret = __block_write_begin(page, block_start + 1, 0,
				814	ocfs2_get_block);
				815	if (ret < 0) {
				816	mlog_errno(ret);
				817	goto out_unlock;
				818	}
				819
				820
				821	/* must not update i_size! */
				822	ret = block_commit_write(page, block_start + 1,
				823	block_start + 1);
				824	if (ret < 0)
				825	mlog_errno(ret);
				826	else
				827	ret = 0;
				828	}
				829
				830	/*
				831	* fs-writeback will release the dirty pages without page lock
				832	* whose offset are over inode size, the release happens at
				833	* block_write_full_page().
				834	*/
				835	i_size_write(inode, abs_to);
				836	inode->i_blocks = ocfs2_inode_sector_count(inode);
				837	di->i_size = cpu_to_le64((u64)i_size_read(inode));
				838	inode->i_mtime = inode->i_ctime = current_time(inode);
				839	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
				840	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
				841	di->i_mtime_nsec = di->i_ctime_nsec;
				842	if (handle) {
				843	ocfs2_journal_dirty(handle, di_bh);
				844	ocfs2_update_inode_fsync_trans(handle, inode, 1);
				845	}
				846
				847	out_unlock:
				848	unlock_page(page);
				849	put_page(page);
				850	out_commit_trans:
				851	if (handle)
				852	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
				853	out:
				854	return ret;
				855	}
				856
				857	/*
				858	* Find the next range to zero. We do this in terms of bytes because
				859	* that's what ocfs2_zero_extend() wants, and it is dealing with the
				860	* pagecache. We may return multiple extents.
				861	*
				862	* zero_start and zero_end are ocfs2_zero_extend()s current idea of what
				863	* needs to be zeroed. range_start and range_end return the next zeroing
				864	* range. A subsequent call should pass the previous range_end as its
				865	* zero_start. If range_end is 0, there's nothing to do.
				866	*
				867	* Unwritten extents are skipped over. Refcounted extents are CoWd.
				868	*/
				869	static int ocfs2_zero_extend_get_range(struct inode *inode,
				870	struct buffer_head *di_bh,
				871	u64 zero_start, u64 zero_end,
				872	u64 range_start, u64 range_end)
				873	{
				874	int rc = 0, needs_cow = 0;
				875	u32 p_cpos, zero_clusters = 0;
				876	u32 zero_cpos =
				877	zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
				878	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
				879	unsigned int num_clusters = 0;
				880	unsigned int ext_flags = 0;
				881
				882	while (zero_cpos < last_cpos) {
				883	rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
				884	&num_clusters, &ext_flags);
				885	if (rc) {
				886	mlog_errno(rc);
				887	goto out;
				888	}
				889
				890	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
				891	zero_clusters = num_clusters;
				892	if (ext_flags & OCFS2_EXT_REFCOUNTED)
				893	needs_cow = 1;
				894	break;
				895	}
				896
				897	zero_cpos += num_clusters;
				898	}
				899	if (!zero_clusters) {
				900	*range_end = 0;
				901	goto out;
				902	}
				903
				904	while ((zero_cpos + zero_clusters) < last_cpos) {
				905	rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
				906	&p_cpos, &num_clusters,
				907	&ext_flags);
				908	if (rc) {
				909	mlog_errno(rc);
				910	goto out;
				911	}
				912
				913	if (!p_cpos \|\| (ext_flags & OCFS2_EXT_UNWRITTEN))
				914	break;
				915	if (ext_flags & OCFS2_EXT_REFCOUNTED)
				916	needs_cow = 1;
				917	zero_clusters += num_clusters;
				918	}
				919	if ((zero_cpos + zero_clusters) > last_cpos)
				920	zero_clusters = last_cpos - zero_cpos;
				921
				922	if (needs_cow) {
				923	rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
				924	zero_clusters, UINT_MAX);
				925	if (rc) {
				926	mlog_errno(rc);
				927	goto out;
				928	}
				929	}
				930
				931	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
				932	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
				933	zero_cpos + zero_clusters);
				934
				935	out:
				936	return rc;
				937	}
				938
				939	/*
				940	* Zero one range returned from ocfs2_zero_extend_get_range(). The caller
				941	* has made sure that the entire range needs zeroing.
				942	*/
				943	static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
				944	u64 range_end, struct buffer_head *di_bh)
				945	{
				946	int rc = 0;
				947	u64 next_pos;
				948	u64 zero_pos = range_start;
				949
				950	trace_ocfs2_zero_extend_range(
				951	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				952	(unsigned long long)range_start,
				953	(unsigned long long)range_end);
				954	BUG_ON(range_start >= range_end);
				955
				956	while (zero_pos < range_end) {
				957	next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
				958	if (next_pos > range_end)
				959	next_pos = range_end;
				960	rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
				961	if (rc < 0) {
				962	mlog_errno(rc);
				963	break;
				964	}
				965	zero_pos = next_pos;
				966
				967	/*
				968	* Very large extends have the potential to lock up
				969	* the cpu for extended periods of time.
				970	*/
				971	cond_resched();
				972	}
				973
				974	return rc;
				975	}
				976
				977	int ocfs2_zero_extend(struct inode inode, struct buffer_head di_bh,
				978	loff_t zero_to_size)
				979	{
				980	int ret = 0;
				981	u64 zero_start, range_start = 0, range_end = 0;
				982	struct super_block *sb = inode->i_sb;
				983
				984	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
				985	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
				986	(unsigned long long)zero_start,
				987	(unsigned long long)i_size_read(inode));
				988	while (zero_start < zero_to_size) {
				989	ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
				990	zero_to_size,
				991	&range_start,
				992	&range_end);
				993	if (ret) {
				994	mlog_errno(ret);
				995	break;
				996	}
				997	if (!range_end)
				998	break;
				999	/* Trim the ends */
				1000	if (range_start < zero_start)
				1001	range_start = zero_start;
				1002	if (range_end > zero_to_size)
				1003	range_end = zero_to_size;
				1004
				1005	ret = ocfs2_zero_extend_range(inode, range_start,
				1006	range_end, di_bh);
				1007	if (ret) {
				1008	mlog_errno(ret);
				1009	break;
				1010	}
				1011	zero_start = range_end;
				1012	}
				1013
				1014	return ret;
				1015	}
				1016
				1017	int ocfs2_extend_no_holes(struct inode inode, struct buffer_head di_bh,
				1018	u64 new_i_size, u64 zero_to)
				1019	{
				1020	int ret;
				1021	u32 clusters_to_add;
				1022	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				1023
				1024	/*
				1025	* Only quota files call this without a bh, and they can't be
				1026	* refcounted.
				1027	*/
				1028	BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
				1029	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
				1030
				1031	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
				1032	if (clusters_to_add < oi->ip_clusters)
				1033	clusters_to_add = 0;
				1034	else
				1035	clusters_to_add -= oi->ip_clusters;
				1036
				1037	if (clusters_to_add) {
				1038	ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
				1039	clusters_to_add, 0);
				1040	if (ret) {
				1041	mlog_errno(ret);
				1042	goto out;
				1043	}
				1044	}
				1045
				1046	/*
				1047	* Call this even if we don't add any clusters to the tree. We
				1048	* still need to zero the area between the old i_size and the
				1049	* new i_size.
				1050	*/
				1051	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
				1052	if (ret < 0)
				1053	mlog_errno(ret);
				1054
				1055	out:
				1056	return ret;
				1057	}
				1058
				1059	static int ocfs2_extend_file(struct inode *inode,
				1060	struct buffer_head *di_bh,
				1061	u64 new_i_size)
				1062	{
				1063	int ret = 0;
				1064	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				1065
				1066	BUG_ON(!di_bh);
				1067
				1068	/* setattr sometimes calls us like this. */
				1069	if (new_i_size == 0)
				1070	goto out;
				1071
				1072	if (i_size_read(inode) == new_i_size)
				1073	goto out;
				1074	BUG_ON(new_i_size < i_size_read(inode));
				1075
				1076	/*
				1077	* The alloc sem blocks people in read/write from reading our
				1078	* allocation until we're done changing it. We depend on
				1079	* i_mutex to block other extend/truncate calls while we're
				1080	* here. We even have to hold it for sparse files because there
				1081	* might be some tail zeroing.
				1082	*/
				1083	down_write(&oi->ip_alloc_sem);
				1084
				1085	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
				1086	/*
				1087	* We can optimize small extends by keeping the inodes
				1088	* inline data.
				1089	*/
				1090	if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
				1091	up_write(&oi->ip_alloc_sem);
				1092	goto out_update_size;
				1093	}
				1094
				1095	ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
				1096	if (ret) {
				1097	up_write(&oi->ip_alloc_sem);
				1098	mlog_errno(ret);
				1099	goto out;
				1100	}
				1101	}
				1102
				1103	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
				1104	ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
				1105	else
				1106	ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
				1107	new_i_size);
				1108
				1109	up_write(&oi->ip_alloc_sem);
				1110
				1111	if (ret < 0) {
				1112	mlog_errno(ret);
				1113	goto out;
				1114	}
				1115
				1116	out_update_size:
				1117	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
				1118	if (ret < 0)
				1119	mlog_errno(ret);
				1120
				1121	out:
				1122	return ret;
				1123	}
				1124
				1125	int ocfs2_setattr(struct dentry dentry, struct iattr attr)
				1126	{
				1127	int status = 0, size_change;
				1128	int inode_locked = 0;
				1129	struct inode *inode = d_inode(dentry);
				1130	struct super_block *sb = inode->i_sb;
				1131	struct ocfs2_super *osb = OCFS2_SB(sb);
				1132	struct buffer_head *bh = NULL;
				1133	handle_t *handle = NULL;
				1134	struct dquot *transfer_to[MAXQUOTAS] = { };
				1135	int qtype;
				1136	int had_lock;
				1137	struct ocfs2_lock_holder oh;
				1138
				1139	trace_ocfs2_setattr(inode, dentry,
				1140	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1141	dentry->d_name.len, dentry->d_name.name,
				1142	attr->ia_valid, attr->ia_mode,
				1143	from_kuid(&init_user_ns, attr->ia_uid),
				1144	from_kgid(&init_user_ns, attr->ia_gid));
				1145
				1146	/* ensuring we don't even attempt to truncate a symlink */
				1147	if (S_ISLNK(inode->i_mode))
				1148	attr->ia_valid &= ~ATTR_SIZE;
				1149
				1150	#define OCFS2_VALID_ATTRS (ATTR_ATIME \| ATTR_MTIME \| ATTR_CTIME \| ATTR_SIZE \
				1151	\| ATTR_GID \| ATTR_UID \| ATTR_MODE)
				1152	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
				1153	return 0;
				1154
				1155	status = setattr_prepare(dentry, attr);
				1156	if (status)
				1157	return status;
				1158
				1159	if (is_quota_modification(inode, attr)) {
				1160	status = dquot_initialize(inode);
				1161	if (status)
				1162	return status;
				1163	}
				1164	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
				1165	if (size_change) {
				1166	/*
				1167	* Here we should wait dio to finish before inode lock
				1168	* to avoid a deadlock between ocfs2_setattr() and
				1169	* ocfs2_dio_end_io_write()
				1170	*/
				1171	inode_dio_wait(inode);
				1172
				1173	status = ocfs2_rw_lock(inode, 1);
				1174	if (status < 0) {
				1175	mlog_errno(status);
				1176	goto bail;
				1177	}
				1178	}
				1179
				1180	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
				1181	if (had_lock < 0) {
				1182	status = had_lock;
				1183	goto bail_unlock_rw;
				1184	} else if (had_lock) {
				1185	/*
				1186	* As far as we know, ocfs2_setattr() could only be the first
				1187	* VFS entry point in the call chain of recursive cluster
				1188	* locking issue.
				1189	*
				1190	* For instance:
				1191	* chmod_common()
				1192	* notify_change()
				1193	* ocfs2_setattr()
				1194	* posix_acl_chmod()
				1195	* ocfs2_iop_get_acl()
				1196	*
				1197	* But, we're not 100% sure if it's always true, because the
				1198	* ordering of the VFS entry points in the call chain is out
				1199	* of our control. So, we'd better dump the stack here to
				1200	* catch the other cases of recursive locking.
				1201	*/
				1202	mlog(ML_ERROR, "Another case of recursive locking:\n");
				1203	dump_stack();
				1204	}
				1205	inode_locked = 1;
				1206
				1207	if (size_change) {
				1208	status = inode_newsize_ok(inode, attr->ia_size);
				1209	if (status)
				1210	goto bail_unlock;
				1211
				1212	if (i_size_read(inode) >= attr->ia_size) {
				1213	if (ocfs2_should_order_data(inode)) {
				1214	status = ocfs2_begin_ordered_truncate(inode,
				1215	attr->ia_size);
				1216	if (status)
				1217	goto bail_unlock;
				1218	}
				1219	status = ocfs2_truncate_file(inode, bh, attr->ia_size);
				1220	} else
				1221	status = ocfs2_extend_file(inode, bh, attr->ia_size);
				1222	if (status < 0) {
				1223	if (status != -ENOSPC)
				1224	mlog_errno(status);
				1225	status = -ENOSPC;
				1226	goto bail_unlock;
				1227	}
				1228	}
				1229
				1230	if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) \|\|
				1231	(attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
				1232	/*
				1233	* Gather pointers to quota structures so that allocation /
				1234	* freeing of quota structures happens here and not inside
				1235	* dquot_transfer() where we have problems with lock ordering
				1236	*/
				1237	if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
				1238	&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
				1239	OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
				1240	transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
				1241	if (IS_ERR(transfer_to[USRQUOTA])) {
				1242	status = PTR_ERR(transfer_to[USRQUOTA]);
				1243	goto bail_unlock;
				1244	}
				1245	}
				1246	if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
				1247	&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
				1248	OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
				1249	transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
				1250	if (IS_ERR(transfer_to[GRPQUOTA])) {
				1251	status = PTR_ERR(transfer_to[GRPQUOTA]);
				1252	goto bail_unlock;
				1253	}
				1254	}
				1255	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
				1256	2 * ocfs2_quota_trans_credits(sb));
				1257	if (IS_ERR(handle)) {
				1258	status = PTR_ERR(handle);
				1259	mlog_errno(status);
				1260	goto bail_unlock;
				1261	}
				1262	status = __dquot_transfer(inode, transfer_to);
				1263	if (status < 0)
				1264	goto bail_commit;
				1265	} else {
				1266	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				1267	if (IS_ERR(handle)) {
				1268	status = PTR_ERR(handle);
				1269	mlog_errno(status);
				1270	goto bail_unlock;
				1271	}
				1272	}
				1273
				1274	setattr_copy(inode, attr);
				1275	mark_inode_dirty(inode);
				1276
				1277	status = ocfs2_mark_inode_dirty(handle, inode, bh);
				1278	if (status < 0)
				1279	mlog_errno(status);
				1280
				1281	bail_commit:
				1282	ocfs2_commit_trans(osb, handle);
				1283	bail_unlock:
				1284	if (status && inode_locked) {
				1285	ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
				1286	inode_locked = 0;
				1287	}
				1288	bail_unlock_rw:
				1289	if (size_change)
				1290	ocfs2_rw_unlock(inode, 1);
				1291	bail:
				1292
				1293	/* Release quota pointers in case we acquired them */
				1294	for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
				1295	dqput(transfer_to[qtype]);
				1296
				1297	if (!status && attr->ia_valid & ATTR_MODE) {
				1298	status = ocfs2_acl_chmod(inode, bh);
				1299	if (status < 0)
				1300	mlog_errno(status);
				1301	}
				1302	if (inode_locked)
				1303	ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
				1304
				1305	brelse(bh);
				1306	return status;
				1307	}
				1308
				1309	int ocfs2_getattr(const struct path path, struct kstat stat,
				1310	u32 request_mask, unsigned int flags)
				1311	{
				1312	struct inode *inode = d_inode(path->dentry);
				1313	struct super_block *sb = path->dentry->d_sb;
				1314	struct ocfs2_super *osb = sb->s_fs_info;
				1315	int err;
				1316
				1317	err = ocfs2_inode_revalidate(path->dentry);
				1318	if (err) {
				1319	if (err != -ENOENT)
				1320	mlog_errno(err);
				1321	goto bail;
				1322	}
				1323
				1324	generic_fillattr(inode, stat);
				1325	/*
				1326	* If there is inline data in the inode, the inode will normally not
				1327	* have data blocks allocated (it may have an external xattr block).
				1328	* Report at least one sector for such files, so tools like tar, rsync,
				1329	* others don't incorrectly think the file is completely sparse.
				1330	*/
				1331	if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
				1332	stat->blocks += (stat->size + 511)>>9;
				1333
				1334	/* We set the blksize from the cluster size for performance */
				1335	stat->blksize = osb->s_clustersize;
				1336
				1337	bail:
				1338	return err;
				1339	}
				1340
				1341	int ocfs2_permission(struct inode *inode, int mask)
				1342	{
				1343	int ret, had_lock;
				1344	struct ocfs2_lock_holder oh;
				1345
				1346	if (mask & MAY_NOT_BLOCK)
				1347	return -ECHILD;
				1348
				1349	had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
				1350	if (had_lock < 0) {
				1351	ret = had_lock;
				1352	goto out;
				1353	} else if (had_lock) {
				1354	/* See comments in ocfs2_setattr() for details.
				1355	* The call chain of this case could be:
				1356	* do_sys_open()
				1357	* may_open()
				1358	* inode_permission()
				1359	* ocfs2_permission()
				1360	* ocfs2_iop_get_acl()
				1361	*/
				1362	mlog(ML_ERROR, "Another case of recursive locking:\n");
				1363	dump_stack();
				1364	}
				1365
				1366	ret = generic_permission(inode, mask);
				1367
				1368	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
				1369	out:
				1370	return ret;
				1371	}
				1372
				1373	static int __ocfs2_write_remove_suid(struct inode *inode,
				1374	struct buffer_head *bh)
				1375	{
				1376	int ret;
				1377	handle_t *handle;
				1378	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1379	struct ocfs2_dinode *di;
				1380
				1381	trace_ocfs2_write_remove_suid(
				1382	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1383	inode->i_mode);
				1384
				1385	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				1386	if (IS_ERR(handle)) {
				1387	ret = PTR_ERR(handle);
				1388	mlog_errno(ret);
				1389	goto out;
				1390	}
				1391
				1392	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
				1393	OCFS2_JOURNAL_ACCESS_WRITE);
				1394	if (ret < 0) {
				1395	mlog_errno(ret);
				1396	goto out_trans;
				1397	}
				1398
				1399	inode->i_mode &= ~S_ISUID;
				1400	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
				1401	inode->i_mode &= ~S_ISGID;
				1402
				1403	di = (struct ocfs2_dinode *) bh->b_data;
				1404	di->i_mode = cpu_to_le16(inode->i_mode);
				1405	ocfs2_update_inode_fsync_trans(handle, inode, 0);
				1406
				1407	ocfs2_journal_dirty(handle, bh);
				1408
				1409	out_trans:
				1410	ocfs2_commit_trans(osb, handle);
				1411	out:
				1412	return ret;
				1413	}
				1414
				1415	static int ocfs2_write_remove_suid(struct inode *inode)
				1416	{
				1417	int ret;
				1418	struct buffer_head *bh = NULL;
				1419
				1420	ret = ocfs2_read_inode_block(inode, &bh);
				1421	if (ret < 0) {
				1422	mlog_errno(ret);
				1423	goto out;
				1424	}
				1425
				1426	ret = __ocfs2_write_remove_suid(inode, bh);
				1427	out:
				1428	brelse(bh);
				1429	return ret;
				1430	}
				1431
				1432	/*
				1433	* Allocate enough extents to cover the region starting at byte offset
				1434	* start for len bytes. Existing extents are skipped, any extents
				1435	* added are marked as "unwritten".
				1436	*/
				1437	static int ocfs2_allocate_unwritten_extents(struct inode *inode,
				1438	u64 start, u64 len)
				1439	{
				1440	int ret;
				1441	u32 cpos, phys_cpos, clusters, alloc_size;
				1442	u64 end = start + len;
				1443	struct buffer_head *di_bh = NULL;
				1444
				1445	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
				1446	ret = ocfs2_read_inode_block(inode, &di_bh);
				1447	if (ret) {
				1448	mlog_errno(ret);
				1449	goto out;
				1450	}
				1451
				1452	/*
				1453	* Nothing to do if the requested reservation range
				1454	* fits within the inode.
				1455	*/
				1456	if (ocfs2_size_fits_inline_data(di_bh, end))
				1457	goto out;
				1458
				1459	ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
				1460	if (ret) {
				1461	mlog_errno(ret);
				1462	goto out;
				1463	}
				1464	}
				1465
				1466	/*
				1467	* We consider both start and len to be inclusive.
				1468	*/
				1469	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
				1470	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
				1471	clusters -= cpos;
				1472
				1473	while (clusters) {
				1474	ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
				1475	&alloc_size, NULL);
				1476	if (ret) {
				1477	mlog_errno(ret);
				1478	goto out;
				1479	}
				1480
				1481	/*
				1482	* Hole or existing extent len can be arbitrary, so
				1483	* cap it to our own allocation request.
				1484	*/
				1485	if (alloc_size > clusters)
				1486	alloc_size = clusters;
				1487
				1488	if (phys_cpos) {
				1489	/*
				1490	* We already have an allocation at this
				1491	* region so we can safely skip it.
				1492	*/
				1493	goto next;
				1494	}
				1495
				1496	ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
				1497	if (ret) {
				1498	if (ret != -ENOSPC)
				1499	mlog_errno(ret);
				1500	goto out;
				1501	}
				1502
				1503	next:
				1504	cpos += alloc_size;
				1505	clusters -= alloc_size;
				1506	}
				1507
				1508	ret = 0;
				1509	out:
				1510
				1511	brelse(di_bh);
				1512	return ret;
				1513	}
				1514
				1515	/*
				1516	* Truncate a byte range, avoiding pages within partial clusters. This
				1517	* preserves those pages for the zeroing code to write to.
				1518	*/
				1519	static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
				1520	u64 byte_len)
				1521	{
				1522	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1523	loff_t start, end;
				1524	struct address_space *mapping = inode->i_mapping;
				1525
				1526	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
				1527	end = byte_start + byte_len;
				1528	end = end & ~(osb->s_clustersize - 1);
				1529
				1530	if (start < end) {
				1531	unmap_mapping_range(mapping, start, end - start, 0);
				1532	truncate_inode_pages_range(mapping, start, end - 1);
				1533	}
				1534	}
				1535
				1536	static int ocfs2_zero_partial_clusters(struct inode *inode,
				1537	u64 start, u64 len)
				1538	{
				1539	int ret = 0;
				1540	u64 tmpend = 0;
				1541	u64 end = start + len;
				1542	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1543	unsigned int csize = osb->s_clustersize;
				1544	handle_t *handle;
				1545
				1546	/*
				1547	* The "start" and "end" values are NOT necessarily part of
				1548	* the range whose allocation is being deleted. Rather, this
				1549	* is what the user passed in with the request. We must zero
				1550	* partial clusters here. There's no need to worry about
				1551	* physical allocation - the zeroing code knows to skip holes.
				1552	*/
				1553	trace_ocfs2_zero_partial_clusters(
				1554	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1555	(unsigned long long)start, (unsigned long long)end);
				1556
				1557	/*
				1558	* If both edges are on a cluster boundary then there's no
				1559	* zeroing required as the region is part of the allocation to
				1560	* be truncated.
				1561	*/
				1562	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
				1563	goto out;
				1564
				1565	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				1566	if (IS_ERR(handle)) {
				1567	ret = PTR_ERR(handle);
				1568	mlog_errno(ret);
				1569	goto out;
				1570	}
				1571
				1572	/*
				1573	* If start is on a cluster boundary and end is somewhere in another
				1574	* cluster, we have not COWed the cluster starting at start, unless
				1575	* end is also within the same cluster. So, in this case, we skip this
				1576	* first call to ocfs2_zero_range_for_truncate() truncate and move on
				1577	* to the next one.
				1578	*/
				1579	if ((start & (csize - 1)) != 0) {
				1580	/*
				1581	* We want to get the byte offset of the end of the 1st
				1582	* cluster.
				1583	*/
				1584	tmpend = (u64)osb->s_clustersize +
				1585	(start & ~(osb->s_clustersize - 1));
				1586	if (tmpend > end)
				1587	tmpend = end;
				1588
				1589	trace_ocfs2_zero_partial_clusters_range1(
				1590	(unsigned long long)start,
				1591	(unsigned long long)tmpend);
				1592
				1593	ret = ocfs2_zero_range_for_truncate(inode, handle, start,
				1594	tmpend);
				1595	if (ret)
				1596	mlog_errno(ret);
				1597	}
				1598
				1599	if (tmpend < end) {
				1600	/*
				1601	* This may make start and end equal, but the zeroing
				1602	* code will skip any work in that case so there's no
				1603	* need to catch it up here.
				1604	*/
				1605	start = end & ~(osb->s_clustersize - 1);
				1606
				1607	trace_ocfs2_zero_partial_clusters_range2(
				1608	(unsigned long long)start, (unsigned long long)end);
				1609
				1610	ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
				1611	if (ret)
				1612	mlog_errno(ret);
				1613	}
				1614	ocfs2_update_inode_fsync_trans(handle, inode, 1);
				1615
				1616	ocfs2_commit_trans(osb, handle);
				1617	out:
				1618	return ret;
				1619	}
				1620
				1621	static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
				1622	{
				1623	int i;
				1624	struct ocfs2_extent_rec *rec = NULL;
				1625
				1626	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
				1627
				1628	rec = &el->l_recs[i];
				1629
				1630	if (le32_to_cpu(rec->e_cpos) < pos)
				1631	break;
				1632	}
				1633
				1634	return i;
				1635	}
				1636
				1637	/*
				1638	* Helper to calculate the punching pos and length in one run, we handle the
				1639	* following three cases in order:
				1640	*
				1641	* - remove the entire record
				1642	* - remove a partial record
				1643	* - no record needs to be removed (hole-punching completed)
				1644	*/
				1645	static void ocfs2_calc_trunc_pos(struct inode *inode,
				1646	struct ocfs2_extent_list *el,
				1647	struct ocfs2_extent_rec *rec,
				1648	u32 trunc_start, u32 *trunc_cpos,
				1649	u32 trunc_len, u32 trunc_end,
				1650	u64 blkno, int done)
				1651	{
				1652	int ret = 0;
				1653	u32 coff, range;
				1654
				1655	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
				1656
				1657	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
				1658	/*
				1659	* remove an entire extent record.
				1660	*/
				1661	*trunc_cpos = le32_to_cpu(rec->e_cpos);
				1662	/*
				1663	* Skip holes if any.
				1664	*/
				1665	if (range < *trunc_end)
				1666	*trunc_end = range;
				1667	trunc_len = trunc_end - le32_to_cpu(rec->e_cpos);
				1668	*blkno = le64_to_cpu(rec->e_blkno);
				1669	*trunc_end = le32_to_cpu(rec->e_cpos);
				1670	} else if (range > trunc_start) {
				1671	/*
				1672	* remove a partial extent record, which means we're
				1673	* removing the last extent record.
				1674	*/
				1675	*trunc_cpos = trunc_start;
				1676	/*
				1677	* skip hole if any.
				1678	*/
				1679	if (range < *trunc_end)
				1680	*trunc_end = range;
				1681	trunc_len = trunc_end - trunc_start;
				1682	coff = trunc_start - le32_to_cpu(rec->e_cpos);
				1683	*blkno = le64_to_cpu(rec->e_blkno) +
				1684	ocfs2_clusters_to_blocks(inode->i_sb, coff);
				1685	*trunc_end = trunc_start;
				1686	} else {
				1687	/*
				1688	* It may have two following possibilities:
				1689	*
				1690	* - last record has been removed
				1691	* - trunc_start was within a hole
				1692	*
				1693	* both two cases mean the completion of hole punching.
				1694	*/
				1695	ret = 1;
				1696	}
				1697
				1698	*done = ret;
				1699	}
				1700
				1701	int ocfs2_remove_inode_range(struct inode *inode,
				1702	struct buffer_head *di_bh, u64 byte_start,
				1703	u64 byte_len)
				1704	{
				1705	int ret = 0, flags = 0, done = 0, i;
				1706	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
				1707	u32 cluster_in_el;
				1708	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1709	struct ocfs2_cached_dealloc_ctxt dealloc;
				1710	struct address_space *mapping = inode->i_mapping;
				1711	struct ocfs2_extent_tree et;
				1712	struct ocfs2_path *path = NULL;
				1713	struct ocfs2_extent_list *el = NULL;
				1714	struct ocfs2_extent_rec *rec = NULL;
				1715	struct ocfs2_dinode di = (struct ocfs2_dinode )di_bh->b_data;
				1716	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
				1717
				1718	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
				1719	ocfs2_init_dealloc_ctxt(&dealloc);
				1720
				1721	trace_ocfs2_remove_inode_range(
				1722	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1723	(unsigned long long)byte_start,
				1724	(unsigned long long)byte_len);
				1725
				1726	if (byte_len == 0)
				1727	return 0;
				1728
				1729	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
				1730	ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
				1731	byte_start + byte_len, 0);
				1732	if (ret) {
				1733	mlog_errno(ret);
				1734	goto out;
				1735	}
				1736	/*
				1737	* There's no need to get fancy with the page cache
				1738	* truncate of an inline-data inode. We're talking
				1739	* about less than a page here, which will be cached
				1740	* in the dinode buffer anyway.
				1741	*/
				1742	unmap_mapping_range(mapping, 0, 0, 0);
				1743	truncate_inode_pages(mapping, 0);
				1744	goto out;
				1745	}
				1746
				1747	/*
				1748	* For reflinks, we may need to CoW 2 clusters which might be
				1749	* partially zero'd later, if hole's start and end offset were
				1750	* within one cluster(means is not exactly aligned to clustersize).
				1751	*/
				1752
				1753	if (ocfs2_is_refcount_inode(inode)) {
				1754	ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
				1755	if (ret) {
				1756	mlog_errno(ret);
				1757	goto out;
				1758	}
				1759
				1760	ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
				1761	if (ret) {
				1762	mlog_errno(ret);
				1763	goto out;
				1764	}
				1765	}
				1766
				1767	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
				1768	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
				1769	cluster_in_el = trunc_end;
				1770
				1771	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
				1772	if (ret) {
				1773	mlog_errno(ret);
				1774	goto out;
				1775	}
				1776
				1777	path = ocfs2_new_path_from_et(&et);
				1778	if (!path) {
				1779	ret = -ENOMEM;
				1780	mlog_errno(ret);
				1781	goto out;
				1782	}
				1783
				1784	while (trunc_end > trunc_start) {
				1785
				1786	ret = ocfs2_find_path(INODE_CACHE(inode), path,
				1787	cluster_in_el);
				1788	if (ret) {
				1789	mlog_errno(ret);
				1790	goto out;
				1791	}
				1792
				1793	el = path_leaf_el(path);
				1794
				1795	i = ocfs2_find_rec(el, trunc_end);
				1796	/*
				1797	* Need to go to previous extent block.
				1798	*/
				1799	if (i < 0) {
				1800	if (path->p_tree_depth == 0)
				1801	break;
				1802
				1803	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
				1804	path,
				1805	&cluster_in_el);
				1806	if (ret) {
				1807	mlog_errno(ret);
				1808	goto out;
				1809	}
				1810
				1811	/*
				1812	* We've reached the leftmost extent block,
				1813	* it's safe to leave.
				1814	*/
				1815	if (cluster_in_el == 0)
				1816	break;
				1817
				1818	/*
				1819	* The 'pos' searched for previous extent block is
				1820	* always one cluster less than actual trunc_end.
				1821	*/
				1822	trunc_end = cluster_in_el + 1;
				1823
				1824	ocfs2_reinit_path(path, 1);
				1825
				1826	continue;
				1827
				1828	} else
				1829	rec = &el->l_recs[i];
				1830
				1831	ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
				1832	&trunc_len, &trunc_end, &blkno, &done);
				1833	if (done)
				1834	break;
				1835
				1836	flags = rec->e_flags;
				1837	phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
				1838
				1839	ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
				1840	phys_cpos, trunc_len, flags,
				1841	&dealloc, refcount_loc, false);
				1842	if (ret < 0) {
				1843	mlog_errno(ret);
				1844	goto out;
				1845	}
				1846
				1847	cluster_in_el = trunc_end;
				1848
				1849	ocfs2_reinit_path(path, 1);
				1850	}
				1851
				1852	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
				1853
				1854	out:
				1855	ocfs2_free_path(path);
				1856	ocfs2_schedule_truncate_log_flush(osb, 1);
				1857	ocfs2_run_deallocs(osb, &dealloc);
				1858
				1859	return ret;
				1860	}
				1861
				1862	/*
				1863	* Parts of this function taken from xfs_change_file_space()
				1864	*/
				1865	static int __ocfs2_change_file_space(struct file file, struct inode inode,
				1866	loff_t f_pos, unsigned int cmd,
				1867	struct ocfs2_space_resv *sr,
				1868	int change_size)
				1869	{
				1870	int ret;
				1871	s64 llen;
				1872	loff_t size;
				1873	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1874	struct buffer_head *di_bh = NULL;
				1875	handle_t *handle;
				1876	unsigned long long max_off = inode->i_sb->s_maxbytes;
				1877
				1878	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
				1879	return -EROFS;
				1880
				1881	inode_lock(inode);
				1882
				1883	/*
				1884	* This prevents concurrent writes on other nodes
				1885	*/
				1886	ret = ocfs2_rw_lock(inode, 1);
				1887	if (ret) {
				1888	mlog_errno(ret);
				1889	goto out;
				1890	}
				1891
				1892	ret = ocfs2_inode_lock(inode, &di_bh, 1);
				1893	if (ret) {
				1894	mlog_errno(ret);
				1895	goto out_rw_unlock;
				1896	}
				1897
				1898	if (inode->i_flags & (S_IMMUTABLE\|S_APPEND)) {
				1899	ret = -EPERM;
				1900	goto out_inode_unlock;
				1901	}
				1902
				1903	switch (sr->l_whence) {
				1904	case 0: /SEEK_SET/
				1905	break;
				1906	case 1: /SEEK_CUR/
				1907	sr->l_start += f_pos;
				1908	break;
				1909	case 2: /SEEK_END/
				1910	sr->l_start += i_size_read(inode);
				1911	break;
				1912	default:
				1913	ret = -EINVAL;
				1914	goto out_inode_unlock;
				1915	}
				1916	sr->l_whence = 0;
				1917
				1918	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
				1919
				1920	if (sr->l_start < 0
				1921	\|\| sr->l_start > max_off
				1922	\|\| (sr->l_start + llen) < 0
				1923	\|\| (sr->l_start + llen) > max_off) {
				1924	ret = -EINVAL;
				1925	goto out_inode_unlock;
				1926	}
				1927	size = sr->l_start + sr->l_len;
				1928
				1929	if (cmd == OCFS2_IOC_RESVSP \|\| cmd == OCFS2_IOC_RESVSP64 \|\|
				1930	cmd == OCFS2_IOC_UNRESVSP \|\| cmd == OCFS2_IOC_UNRESVSP64) {
				1931	if (sr->l_len <= 0) {
				1932	ret = -EINVAL;
				1933	goto out_inode_unlock;
				1934	}
				1935	}
				1936
				1937	if (file && should_remove_suid(file->f_path.dentry)) {
				1938	ret = __ocfs2_write_remove_suid(inode, di_bh);
				1939	if (ret) {
				1940	mlog_errno(ret);
				1941	goto out_inode_unlock;
				1942	}
				1943	}
				1944
				1945	down_write(&OCFS2_I(inode)->ip_alloc_sem);
				1946	switch (cmd) {
				1947	case OCFS2_IOC_RESVSP:
				1948	case OCFS2_IOC_RESVSP64:
				1949	/*
				1950	* This takes unsigned offsets, but the signed ones we
				1951	* pass have been checked against overflow above.
				1952	*/
				1953	ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
				1954	sr->l_len);
				1955	break;
				1956	case OCFS2_IOC_UNRESVSP:
				1957	case OCFS2_IOC_UNRESVSP64:
				1958	ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
				1959	sr->l_len);
				1960	break;
				1961	default:
				1962	ret = -EINVAL;
				1963	}
				1964	up_write(&OCFS2_I(inode)->ip_alloc_sem);
				1965	if (ret) {
				1966	mlog_errno(ret);
				1967	goto out_inode_unlock;
				1968	}
				1969
				1970	/*
				1971	* We update c/mtime for these changes
				1972	*/
				1973	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				1974	if (IS_ERR(handle)) {
				1975	ret = PTR_ERR(handle);
				1976	mlog_errno(ret);
				1977	goto out_inode_unlock;
				1978	}
				1979
				1980	if (change_size && i_size_read(inode) < size)
				1981	i_size_write(inode, size);
				1982
				1983	inode->i_ctime = inode->i_mtime = current_time(inode);
				1984	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
				1985	if (ret < 0)
				1986	mlog_errno(ret);
				1987
				1988	if (file && (file->f_flags & O_SYNC))
				1989	handle->h_sync = 1;
				1990
				1991	ocfs2_commit_trans(osb, handle);
				1992
				1993	out_inode_unlock:
				1994	brelse(di_bh);
				1995	ocfs2_inode_unlock(inode, 1);
				1996	out_rw_unlock:
				1997	ocfs2_rw_unlock(inode, 1);
				1998
				1999	out:
				2000	inode_unlock(inode);
				2001	return ret;
				2002	}
				2003
				2004	int ocfs2_change_file_space(struct file *file, unsigned int cmd,
				2005	struct ocfs2_space_resv *sr)
				2006	{
				2007	struct inode *inode = file_inode(file);
				2008	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				2009	int ret;
				2010
				2011	if ((cmd == OCFS2_IOC_RESVSP \|\| cmd == OCFS2_IOC_RESVSP64) &&
				2012	!ocfs2_writes_unwritten_extents(osb))
				2013	return -ENOTTY;
				2014	else if ((cmd == OCFS2_IOC_UNRESVSP \|\| cmd == OCFS2_IOC_UNRESVSP64) &&
				2015	!ocfs2_sparse_alloc(osb))
				2016	return -ENOTTY;
				2017
				2018	if (!S_ISREG(inode->i_mode))
				2019	return -EINVAL;
				2020
				2021	if (!(file->f_mode & FMODE_WRITE))
				2022	return -EBADF;
				2023
				2024	ret = mnt_want_write_file(file);
				2025	if (ret)
				2026	return ret;
				2027	ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
				2028	mnt_drop_write_file(file);
				2029	return ret;
				2030	}
				2031
				2032	static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
				2033	loff_t len)
				2034	{
				2035	struct inode *inode = file_inode(file);
				2036	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				2037	struct ocfs2_space_resv sr;
				2038	int change_size = 1;
				2039	int cmd = OCFS2_IOC_RESVSP64;
				2040
				2041	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
				2042	return -EOPNOTSUPP;
				2043	if (!ocfs2_writes_unwritten_extents(osb))
				2044	return -EOPNOTSUPP;
				2045
				2046	if (mode & FALLOC_FL_KEEP_SIZE)
				2047	change_size = 0;
				2048
				2049	if (mode & FALLOC_FL_PUNCH_HOLE)
				2050	cmd = OCFS2_IOC_UNRESVSP64;
				2051
				2052	sr.l_whence = 0;
				2053	sr.l_start = (s64)offset;
				2054	sr.l_len = (s64)len;
				2055
				2056	return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
				2057	change_size);
				2058	}
				2059
				2060	int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
				2061	size_t count)
				2062	{
				2063	int ret = 0;
				2064	unsigned int extent_flags;
				2065	u32 cpos, clusters, extent_len, phys_cpos;
				2066	struct super_block *sb = inode->i_sb;
				2067
				2068	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) \|\|
				2069	!ocfs2_is_refcount_inode(inode) \|\|
				2070	OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
				2071	return 0;
				2072
				2073	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
				2074	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
				2075
				2076	while (clusters) {
				2077	ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
				2078	&extent_flags);
				2079	if (ret < 0) {
				2080	mlog_errno(ret);
				2081	goto out;
				2082	}
				2083
				2084	if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
				2085	ret = 1;
				2086	break;
				2087	}
				2088
				2089	if (extent_len > clusters)
				2090	extent_len = clusters;
				2091
				2092	clusters -= extent_len;
				2093	cpos += extent_len;
				2094	}
				2095	out:
				2096	return ret;
				2097	}
				2098
				2099	static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
				2100	{
				2101	int blockmask = inode->i_sb->s_blocksize - 1;
				2102	loff_t final_size = pos + count;
				2103
				2104	if ((pos & blockmask) \|\| (final_size & blockmask))
				2105	return 1;
				2106	return 0;
				2107	}
				2108
				2109	static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
				2110	struct file *file,
				2111	loff_t pos, size_t count,
				2112	int *meta_level)
				2113	{
				2114	int ret;
				2115	struct buffer_head *di_bh = NULL;
				2116	u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
				2117	u32 clusters =
				2118	ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
				2119
				2120	ret = ocfs2_inode_lock(inode, &di_bh, 1);
				2121	if (ret) {
				2122	mlog_errno(ret);
				2123	goto out;
				2124	}
				2125
				2126	*meta_level = 1;
				2127
				2128	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
				2129	if (ret)
				2130	mlog_errno(ret);
				2131	out:
				2132	brelse(di_bh);
				2133	return ret;
				2134	}
				2135
				2136	static int ocfs2_prepare_inode_for_write(struct file *file,
				2137	loff_t pos, size_t count, int wait)
				2138	{
				2139	int ret = 0, meta_level = 0, overwrite_io = 0;
				2140	struct dentry *dentry = file->f_path.dentry;
				2141	struct inode *inode = d_inode(dentry);
				2142	struct buffer_head *di_bh = NULL;
				2143	loff_t end;
				2144
				2145	/*
				2146	* We start with a read level meta lock and only jump to an ex
				2147	* if we need to make modifications here.
				2148	*/
				2149	for(;;) {
				2150	if (wait)
				2151	ret = ocfs2_inode_lock(inode, NULL, meta_level);
				2152	else
				2153	ret = ocfs2_try_inode_lock(inode,
				2154	overwrite_io ? NULL : &di_bh, meta_level);
				2155	if (ret < 0) {
				2156	meta_level = -1;
				2157	if (ret != -EAGAIN)
				2158	mlog_errno(ret);
				2159	goto out;
				2160	}
				2161
				2162	/*
				2163	* Check if IO will overwrite allocated blocks in case
				2164	* IOCB_NOWAIT flag is set.
				2165	*/
				2166	if (!wait && !overwrite_io) {
				2167	overwrite_io = 1;
				2168	if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
				2169	ret = -EAGAIN;
				2170	goto out_unlock;
				2171	}
				2172
				2173	ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
				2174	brelse(di_bh);
				2175	di_bh = NULL;
				2176	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				2177	if (ret < 0) {
				2178	if (ret != -EAGAIN)
				2179	mlog_errno(ret);
				2180	goto out_unlock;
				2181	}
				2182	}
				2183
				2184	/* Clear suid / sgid if necessary. We do this here
				2185	* instead of later in the write path because
				2186	* remove_suid() calls ->setattr without any hint that
				2187	* we may have already done our cluster locking. Since
				2188	* ocfs2_setattr() must take cluster locks to
				2189	* proceed, this will lead us to recursively lock the
				2190	* inode. There's also the dinode i_size state which
				2191	* can be lost via setattr during extending writes (we
				2192	* set inode->i_size at the end of a write. */
				2193	if (should_remove_suid(dentry)) {
				2194	if (meta_level == 0) {
				2195	ocfs2_inode_unlock(inode, meta_level);
				2196	meta_level = 1;
				2197	continue;
				2198	}
				2199
				2200	ret = ocfs2_write_remove_suid(inode);
				2201	if (ret < 0) {
				2202	mlog_errno(ret);
				2203	goto out_unlock;
				2204	}
				2205	}
				2206
				2207	end = pos + count;
				2208
				2209	ret = ocfs2_check_range_for_refcount(inode, pos, count);
				2210	if (ret == 1) {
				2211	ocfs2_inode_unlock(inode, meta_level);
				2212	meta_level = -1;
				2213
				2214	ret = ocfs2_prepare_inode_for_refcount(inode,
				2215	file,
				2216	pos,
				2217	count,
				2218	&meta_level);
				2219	}
				2220
				2221	if (ret < 0) {
				2222	mlog_errno(ret);
				2223	goto out_unlock;
				2224	}
				2225
				2226	break;
				2227	}
				2228
				2229	out_unlock:
				2230	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
				2231	pos, count, wait);
				2232
				2233	brelse(di_bh);
				2234
				2235	if (meta_level >= 0)
				2236	ocfs2_inode_unlock(inode, meta_level);
				2237
				2238	out:
				2239	return ret;
				2240	}
				2241
				2242	static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
				2243	struct iov_iter *from)
				2244	{
				2245	int rw_level;
				2246	ssize_t written = 0;
				2247	ssize_t ret;
				2248	size_t count = iov_iter_count(from);
				2249	struct file *file = iocb->ki_filp;
				2250	struct inode *inode = file_inode(file);
				2251	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				2252	int full_coherency = !(osb->s_mount_opt &
				2253	OCFS2_MOUNT_COHERENCY_BUFFERED);
				2254	void *saved_ki_complete = NULL;
				2255	int append_write = ((iocb->ki_pos + count) >=
				2256	i_size_read(inode) ? 1 : 0);
				2257	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
				2258	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
				2259
				2260	trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
				2261	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				2262	file->f_path.dentry->d_name.len,
				2263	file->f_path.dentry->d_name.name,
				2264	(unsigned int)from->nr_segs); /* GRRRRR */
				2265
				2266	if (!direct_io && nowait)
				2267	return -EOPNOTSUPP;
				2268
				2269	if (count == 0)
				2270	return 0;
				2271
				2272	if (nowait) {
				2273	if (!inode_trylock(inode))
				2274	return -EAGAIN;
				2275	} else
				2276	inode_lock(inode);
				2277
				2278	/*
				2279	* Concurrent O_DIRECT writes are allowed with
				2280	* mount_option "coherency=buffered".
				2281	* For append write, we must take rw EX.
				2282	*/
				2283	rw_level = (!direct_io \|\| full_coherency \|\| append_write);
				2284
				2285	if (nowait)
				2286	ret = ocfs2_try_rw_lock(inode, rw_level);
				2287	else
				2288	ret = ocfs2_rw_lock(inode, rw_level);
				2289	if (ret < 0) {
				2290	if (ret != -EAGAIN)
				2291	mlog_errno(ret);
				2292	goto out_mutex;
				2293	}
				2294
				2295	/*
				2296	* O_DIRECT writes with "coherency=full" need to take EX cluster
				2297	* inode_lock to guarantee coherency.
				2298	*/
				2299	if (direct_io && full_coherency) {
				2300	/*
				2301	* We need to take and drop the inode lock to force
				2302	* other nodes to drop their caches. Buffered I/O
				2303	* already does this in write_begin().
				2304	*/
				2305	if (nowait)
				2306	ret = ocfs2_try_inode_lock(inode, NULL, 1);
				2307	else
				2308	ret = ocfs2_inode_lock(inode, NULL, 1);
				2309	if (ret < 0) {
				2310	if (ret != -EAGAIN)
				2311	mlog_errno(ret);
				2312	goto out;
				2313	}
				2314
				2315	ocfs2_inode_unlock(inode, 1);
				2316	}
				2317
				2318	ret = generic_write_checks(iocb, from);
				2319	if (ret <= 0) {
				2320	if (ret)
				2321	mlog_errno(ret);
				2322	goto out;
				2323	}
				2324	count = ret;
				2325
				2326	ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
				2327	if (ret < 0) {
				2328	if (ret != -EAGAIN)
				2329	mlog_errno(ret);
				2330	goto out;
				2331	}
				2332
				2333	if (direct_io && !is_sync_kiocb(iocb) &&
				2334	ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
				2335	/*
				2336	* Make it a sync io if it's an unaligned aio.
				2337	*/
				2338	saved_ki_complete = xchg(&iocb->ki_complete, NULL);
				2339	}
				2340
				2341	/* communicate with ocfs2_dio_end_io */
				2342	ocfs2_iocb_set_rw_locked(iocb, rw_level);
				2343
				2344	written = __generic_file_write_iter(iocb, from);
				2345	/* buffered aio wouldn't have proper lock coverage today */
				2346	BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
				2347
				2348	/*
				2349	* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
				2350	* function pointer which is called when o_direct io completes so that
				2351	* it can unlock our rw lock.
				2352	* Unfortunately there are error cases which call end_io and others
				2353	* that don't. so we don't have to unlock the rw_lock if either an
				2354	* async dio is going to do it in the future or an end_io after an
				2355	* error has already done it.
				2356	*/
				2357	if ((written == -EIOCBQUEUED) \|\| (!ocfs2_iocb_is_rw_locked(iocb))) {
				2358	rw_level = -1;
				2359	}
				2360
				2361	if (unlikely(written <= 0))
				2362	goto out;
				2363
				2364	if (((file->f_flags & O_DSYNC) && !direct_io) \|\|
				2365	IS_SYNC(inode)) {
				2366	ret = filemap_fdatawrite_range(file->f_mapping,
				2367	iocb->ki_pos - written,
				2368	iocb->ki_pos - 1);
				2369	if (ret < 0)
				2370	written = ret;
				2371
				2372	if (!ret) {
				2373	ret = jbd2_journal_force_commit(osb->journal->j_journal);
				2374	if (ret < 0)
				2375	written = ret;
				2376	}
				2377
				2378	if (!ret)
				2379	ret = filemap_fdatawait_range(file->f_mapping,
				2380	iocb->ki_pos - written,
				2381	iocb->ki_pos - 1);
				2382	}
				2383
				2384	out:
				2385	if (saved_ki_complete)
				2386	xchg(&iocb->ki_complete, saved_ki_complete);
				2387
				2388	if (rw_level != -1)
				2389	ocfs2_rw_unlock(inode, rw_level);
				2390
				2391	out_mutex:
				2392	inode_unlock(inode);
				2393
				2394	if (written)
				2395	ret = written;
				2396	return ret;
				2397	}
				2398
				2399	static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
				2400	struct iov_iter *to)
				2401	{
				2402	int ret = 0, rw_level = -1, lock_level = 0;
				2403	struct file *filp = iocb->ki_filp;
				2404	struct inode *inode = file_inode(filp);
				2405	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
				2406	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
				2407
				2408	trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
				2409	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				2410	filp->f_path.dentry->d_name.len,
				2411	filp->f_path.dentry->d_name.name,
				2412	to->nr_segs); /* GRRRRR */
				2413
				2414
				2415	if (!inode) {
				2416	ret = -EINVAL;
				2417	mlog_errno(ret);
				2418	goto bail;
				2419	}
				2420
				2421	if (!direct_io && nowait)
				2422	return -EOPNOTSUPP;
				2423
				2424	/*
				2425	* buffered reads protect themselves in ->readpage(). O_DIRECT reads
				2426	* need locks to protect pending reads from racing with truncate.
				2427	*/
				2428	if (direct_io) {
				2429	if (nowait)
				2430	ret = ocfs2_try_rw_lock(inode, 0);
				2431	else
				2432	ret = ocfs2_rw_lock(inode, 0);
				2433
				2434	if (ret < 0) {
				2435	if (ret != -EAGAIN)
				2436	mlog_errno(ret);
				2437	goto bail;
				2438	}
				2439	rw_level = 0;
				2440	/* communicate with ocfs2_dio_end_io */
				2441	ocfs2_iocb_set_rw_locked(iocb, rw_level);
				2442	}
				2443
				2444	/*
				2445	* We're fine letting folks race truncates and extending
				2446	* writes with read across the cluster, just like they can
				2447	* locally. Hence no rw_lock during read.
				2448	*
				2449	* Take and drop the meta data lock to update inode fields
				2450	* like i_size. This allows the checks down below
				2451	* generic_file_read_iter() a chance of actually working.
				2452	*/
				2453	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
				2454	!nowait);
				2455	if (ret < 0) {
				2456	if (ret != -EAGAIN)
				2457	mlog_errno(ret);
				2458	goto bail;
				2459	}
				2460	ocfs2_inode_unlock(inode, lock_level);
				2461
				2462	ret = generic_file_read_iter(iocb, to);
				2463	trace_generic_file_read_iter_ret(ret);
				2464
				2465	/* buffered aio wouldn't have proper lock coverage today */
				2466	BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
				2467
				2468	/* see ocfs2_file_write_iter */
				2469	if (ret == -EIOCBQUEUED \|\| !ocfs2_iocb_is_rw_locked(iocb)) {
				2470	rw_level = -1;
				2471	}
				2472
				2473	bail:
				2474	if (rw_level != -1)
				2475	ocfs2_rw_unlock(inode, rw_level);
				2476
				2477	return ret;
				2478	}
				2479
				2480	/* Refer generic_file_llseek_unlocked() */
				2481	static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
				2482	{
				2483	struct inode *inode = file->f_mapping->host;
				2484	int ret = 0;
				2485
				2486	inode_lock(inode);
				2487
				2488	switch (whence) {
				2489	case SEEK_SET:
				2490	break;
				2491	case SEEK_END:
				2492	/* SEEK_END requires the OCFS2 inode lock for the file
				2493	* because it references the file's size.
				2494	*/
				2495	ret = ocfs2_inode_lock(inode, NULL, 0);
				2496	if (ret < 0) {
				2497	mlog_errno(ret);
				2498	goto out;
				2499	}
				2500	offset += i_size_read(inode);
				2501	ocfs2_inode_unlock(inode, 0);
				2502	break;
				2503	case SEEK_CUR:
				2504	if (offset == 0) {
				2505	offset = file->f_pos;
				2506	goto out;
				2507	}
				2508	offset += file->f_pos;
				2509	break;
				2510	case SEEK_DATA:
				2511	case SEEK_HOLE:
				2512	ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
				2513	if (ret)
				2514	goto out;
				2515	break;
				2516	default:
				2517	ret = -EINVAL;
				2518	goto out;
				2519	}
				2520
				2521	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
				2522
				2523	out:
				2524	inode_unlock(inode);
				2525	if (ret)
				2526	return ret;
				2527	return offset;
				2528	}
				2529
				2530	static int ocfs2_file_clone_range(struct file *file_in,
				2531	loff_t pos_in,
				2532	struct file *file_out,
				2533	loff_t pos_out,
				2534	u64 len)
				2535	{
				2536	return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
				2537	len, false);
				2538	}
				2539
				2540	static int ocfs2_file_dedupe_range(struct file *file_in,
				2541	loff_t pos_in,
				2542	struct file *file_out,
				2543	loff_t pos_out,
				2544	u64 len)
				2545	{
				2546	return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
				2547	len, true);
				2548	}
				2549
				2550	const struct inode_operations ocfs2_file_iops = {
				2551	.setattr = ocfs2_setattr,
				2552	.getattr = ocfs2_getattr,
				2553	.permission = ocfs2_permission,
				2554	.listxattr = ocfs2_listxattr,
				2555	.fiemap = ocfs2_fiemap,
				2556	.get_acl = ocfs2_iop_get_acl,
				2557	.set_acl = ocfs2_iop_set_acl,
				2558	};
				2559
				2560	const struct inode_operations ocfs2_special_file_iops = {
				2561	.setattr = ocfs2_setattr,
				2562	.getattr = ocfs2_getattr,
				2563	.permission = ocfs2_permission,
				2564	.get_acl = ocfs2_iop_get_acl,
				2565	.set_acl = ocfs2_iop_set_acl,
				2566	};
				2567
				2568	/*
				2569	* Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
				2570	* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
				2571	*/
				2572	const struct file_operations ocfs2_fops = {
				2573	.llseek = ocfs2_file_llseek,
				2574	.mmap = ocfs2_mmap,
				2575	.fsync = ocfs2_sync_file,
				2576	.release = ocfs2_file_release,
				2577	.open = ocfs2_file_open,
				2578	.read_iter = ocfs2_file_read_iter,
				2579	.write_iter = ocfs2_file_write_iter,
				2580	.unlocked_ioctl = ocfs2_ioctl,
				2581	#ifdef CONFIG_COMPAT
				2582	.compat_ioctl = ocfs2_compat_ioctl,
				2583	#endif
				2584	.lock = ocfs2_lock,
				2585	.flock = ocfs2_flock,
				2586	.splice_read = generic_file_splice_read,
				2587	.splice_write = iter_file_splice_write,
				2588	.fallocate = ocfs2_fallocate,
				2589	.clone_file_range = ocfs2_file_clone_range,
				2590	.dedupe_file_range = ocfs2_file_dedupe_range,
				2591	};
				2592
				2593	const struct file_operations ocfs2_dops = {
				2594	.llseek = generic_file_llseek,
				2595	.read = generic_read_dir,
				2596	.iterate = ocfs2_readdir,
				2597	.fsync = ocfs2_sync_file,
				2598	.release = ocfs2_dir_release,
				2599	.open = ocfs2_dir_open,
				2600	.unlocked_ioctl = ocfs2_ioctl,
				2601	#ifdef CONFIG_COMPAT
				2602	.compat_ioctl = ocfs2_compat_ioctl,
				2603	#endif
				2604	.lock = ocfs2_lock,
				2605	.flock = ocfs2_flock,
				2606	};
				2607
				2608	/*
				2609	* POSIX-lockless variants of our file_operations.
				2610	*
				2611	* These will be used if the underlying cluster stack does not support
				2612	* posix file locking, if the user passes the "localflocks" mount
				2613	* option, or if we have a local-only fs.
				2614	*
				2615	* ocfs2_flock is in here because all stacks handle UNIX file locks,
				2616	* so we still want it in the case of no stack support for
				2617	* plocks. Internally, it will do the right thing when asked to ignore
				2618	* the cluster.
				2619	*/
				2620	const struct file_operations ocfs2_fops_no_plocks = {
				2621	.llseek = ocfs2_file_llseek,
				2622	.mmap = ocfs2_mmap,
				2623	.fsync = ocfs2_sync_file,
				2624	.release = ocfs2_file_release,
				2625	.open = ocfs2_file_open,
				2626	.read_iter = ocfs2_file_read_iter,
				2627	.write_iter = ocfs2_file_write_iter,
				2628	.unlocked_ioctl = ocfs2_ioctl,
				2629	#ifdef CONFIG_COMPAT
				2630	.compat_ioctl = ocfs2_compat_ioctl,
				2631	#endif
				2632	.flock = ocfs2_flock,
				2633	.splice_read = generic_file_splice_read,
				2634	.splice_write = iter_file_splice_write,
				2635	.fallocate = ocfs2_fallocate,
				2636	.clone_file_range = ocfs2_file_clone_range,
				2637	.dedupe_file_range = ocfs2_file_dedupe_range,
				2638	};
				2639
				2640	const struct file_operations ocfs2_dops_no_plocks = {
				2641	.llseek = generic_file_llseek,
				2642	.read = generic_read_dir,
				2643	.iterate = ocfs2_readdir,
				2644	.fsync = ocfs2_sync_file,
				2645	.release = ocfs2_dir_release,
				2646	.open = ocfs2_dir_open,
				2647	.unlocked_ioctl = ocfs2_ioctl,
				2648	#ifdef CONFIG_COMPAT
				2649	.compat_ioctl = ocfs2_compat_ioctl,
				2650	#endif
				2651	.flock = ocfs2_flock,
				2652	};