// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed and instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; see the following
 *     section for more details)
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - These make all
 *   the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), plus one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. Making one more
 *   fast commit fall back to a full commit after the stop call guarantees
 *   that the fast commit ineligible operation contained within
 *   ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is followed by
 *   at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---     Fast Commit 2      ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher level
 *    functions. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible().
 *
 * 3) Handle more ineligible cases.
 */
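
/*
 * Illustrative sketch (editor's addition, not kernel code): how a fast
 * commit block decomposes into TLVs during replay, assuming struct
 * ext4_fc_tl is a pair of little-endian 16-bit fields (fc_tag, fc_len)
 * immediately followed by fc_len bytes of value, as described above. A
 * hypothetical handle_tag() callback stands in for the per-tag replay
 * functions defined later in this file:
 *
 *	u8 *cur = block, *end = block + blocksize;
 *	struct ext4_fc_tl tl;
 *
 *	while (cur + sizeof(tl) <= end) {
 *		memcpy(&tl, cur, sizeof(tl));
 *		handle_tag(le16_to_cpu(tl.fc_tag), cur + sizeof(tl),
 *			   le16_to_cpu(tl.fc_len));
 *		cur += sizeof(tl) + le16_to_cpu(tl.fc_len);
 *	}
 *
 * ext4_fc_replay_scan() at the bottom of this file performs essentially
 * this walk, with CRC verification layered on top.
 */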

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}
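
/*
 * Illustrative usage (editor's sketch, not a real call site): a VFS-facing
 * update path brackets its work with the pair above so that a running fast
 * commit and the update never overlap:
 *
 *	ext4_fc_start_update(inode);	// blocks while inode is COMMITTING
 *	// ... modify the inode or its data ...
 *	ext4_fc_stop_update(inode);	// wakes any waiting fast commit
 *
 * In the kernel proper these calls are issued from higher level ext4 entry
 * points (see the TODO above about folding them into ext4_journal_start()).
 */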

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that the next commit
 * operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}
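
/*
 * Illustrative usage (editor's sketch): an operation without replay support
 * is bracketed like this, so every fast commit that overlaps it, plus one
 * commit after it, becomes a full commit. EXT4_FC_REASON_XATTR is an assumed
 * example reason code here; the actual EXT4_FC_REASON_* value depends on the
 * call site:
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	// ... perform the operation that fast commit cannot replay ...
 *	ext4_fc_stop_ineligible(sb);
 */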

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
			      (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
			      &sbi->s_fc_q[FC_Q_STAGING] :
			      &sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			    struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}
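
/*
 * Worked example for __track_range() above (editor's illustration): if the
 * inode already tracks the logical block range [10, 14] (i_fc_lblk_start =
 * 10, i_fc_lblk_len = 5) and an update arrives for [13, 20], then with
 * update == true:
 *
 *	i_fc_lblk_start = min(10, 13) = 10
 *	i_fc_lblk_len   = max(10 + 5 - 1, 20) - 10 + 1 = 11
 *
 * i.e. the tracked range grows to [10, 20], the union of the two ranges.
 */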

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
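
/*
 * Worked example (editor's illustration, assuming a 4096 byte journal
 * block and a 4 byte struct ext4_fc_tl): with the in-block offset at 4080,
 * a request for len = 16 does not fit, since 4096 - 4080 - 1 = 15 is not
 * greater than 16 + 4. The block is therefore closed with a PAD tag of
 *
 *	pad_len = bsize - off - 1 - sizeof(tl) = 4096 - 4080 - 1 - 4 = 11
 *
 * zeroed value bytes, the buffer is submitted, and the 16 bytes are handed
 * out at the start of a fresh jbd2 block.
 */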

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
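
/*
 * Illustrative on-disk layout of the tail written above (editor's sketch,
 * assuming the 4 byte tag/len header):
 *
 *	| fc_tag = TAIL | fc_len | fc_tid (4 bytes) | fc_crc (4 bytes) | pad |
 *
 * fc_len is sized so that the TLV covers the rest of the block, which is
 * what prevents the next fast commit from reusing the leftover space. Note
 * that the CRC is accumulated over everything up to and including fc_tid;
 * the fc_crc field itself is excluded (it is copied with a NULL crc above).
 */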

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
					int parent_ino, int ino, int dlen,
					const unsigned char *dname,
					u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}
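
/*
 * Illustrative layout of a dentry TLV as written above (editor's sketch,
 * assuming the 4 byte tag/len header and an 8 byte struct
 * ext4_fc_dentry_info holding two little-endian 32-bit inode numbers):
 *
 *	| fc_tag | fc_len | fc_parent_ino (4) | fc_ino (4) | dname (dlen) |
 *
 * fc_len counts the dentry info plus the name, but not the header itself.
 */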

/*
 * Writes the inode in the fast commit space under the inode TLV
 * (EXT4_FC_TAG_INODE). Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
					sb, fc_dentry->fcd_op,
					fc_dentry->fcd_parent, fc_dentry->fcd_ino,
					fc_dentry->fcd_name.len,
					fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * Weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time. For example,
	 * with an average of 4000ns and a new commit of 8000ns, the new
	 * average becomes (8000 + 3 * 4000) / 4 = 5000ns.
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
					struct dentry_info_args *darg,
					struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since the data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes = krealloc(
				state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
					       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				/ sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks used by the inode.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the
 * inode for which we are trying to create a dentry here should already
 * have been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when doing new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = krealloc(
					state->fc_regions,
					state->fc_regions_size *
					sizeof(struct ext4_fc_alloc_region),
					GFP_KERNEL);
		if (!state->fc_regions)
			return -ENOMEM;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret)
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %ld to %d %lld",
			map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}

static inline const char *tag2str(u16 tag)
{
	switch (tag) {
	case EXT4_FC_TAG_LINK:
		return "TAG_ADD_ENTRY";
	case EXT4_FC_TAG_UNLINK:
		return "TAG_DEL_ENTRY";
	case EXT4_FC_TAG_ADD_RANGE:
		return "TAG_ADD_RANGE";
	case EXT4_FC_TAG_CREAT:
		return "TAG_CREAT_DENTRY";
	case EXT4_FC_TAG_DEL_RANGE:
		return "TAG_DEL_RANGE";
	case EXT4_FC_TAG_INODE:
		return "TAG_INODE";
	case EXT4_FC_TAG_PAD:
		return "TAG_PAD";
	case EXT4_FC_TAG_TAIL:
		return "TAG_TAIL";
	case EXT4_FC_TAG_HEAD:
		return "TAG_HEAD";
	default:
		return "TAG_ERROR";
	}
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1861
/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to check whether it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible for
 * doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify the CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error code to indicate that an error occurred.
 * At the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to the number of tags that need to be replayed during the replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
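	/*
	 * A fast commit block is a sequence of TLVs:
	 *
	 *   +--------+--------+----------------+--------+--------+- - -
	 *   | fc_tag | fc_len | value (fc_len) | fc_tag | fc_len |
	 *   +--------+--------+----------------+--------+--------+- - -
	 *
	 * so each iteration below advances cur by sizeof(tl) + fc_len bytes.
	 */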
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
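			/*
			 * The running checksum covers the tag header and the
			 * tail body up to, but not including, the tail's own
			 * fc_crc field.
			 */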
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) +
					offsetof(struct ext4_fc_tail, fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
			    le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan()
 * above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
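	/*
	 * If the scan phase did not validate any tags, there is nothing to
	 * replay; finalize the bitmaps and counters and stop here.
	 */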
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

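	/* Parse the block TLV by TLV and dispatch each tag to its replay handler. */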
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
			  tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					     le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * Set the replay callback even if fast commit is disabled, because
	 * there could still be fast commit blocks that need to be replayed
	 * even though fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

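/*
 * Human-readable names for the fast commit ineligibility reasons, indexed by
 * the EXT4_FC_REASON_* codes and reported via ext4_fc_info_show() below.
 */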
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}