// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
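 *
 * For illustration, here is a minimal sketch of how a reader of the log
 * walks one fast commit block TLV by TLV. The fc_tag/fc_len layout mirrors
 * how struct ext4_fc_tl is used throughout this file; apply_tlv() is a
 * hypothetical callback, not a real helper:
 *
 *	u8 *cur = block, *end = block + blocksize;
 *
 *	while (cur + sizeof(struct ext4_fc_tl) <= end) {
 *		struct ext4_fc_tl tl;
 *
 *		memcpy(&tl, cur, sizeof(tl));
 *		apply_tlv(le16_to_cpu(tl.fc_tag),	// EXT4_FC_TAG_*
 *			  le16_to_cpu(tl.fc_len),	// value length
 *			  cur + sizeof(tl));		// value bytes
 *		cur += sizeof(tl) + le16_to_cpu(tl.fc_len);
 *	}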
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; see the following section
 *     for details)
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
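 *
 * As a rough sketch (the caller shown is hypothetical; the real callers are
 * ext4's high level VFS entry points), an update path brackets its work
 * like this:
 *
 *	ext4_fc_start_update(inode);	// blocks while inode is COMMITTING
 *	// ... perform the actual inode update ...
 *	ext4_fc_stop_update(inode);	// wakes up a waiting fast commit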
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes
 *   all the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. Making one more
 *   fast commit fall back to a full commit after the stop call guarantees
 *   that the fast commit ineligible operation contained within
 *   ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is followed by
 *   at least 1 full commit.
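 *
 * For example, an operation without replay support would be bracketed
 * roughly as below (a sketch; the opaque operation and the exact reason
 * code are illustrative placeholders):
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	// ... perform the unsupported update, e.g. an xattr change ...
 *	ext4_fc_stop_ineligible(sb);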
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *  [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *  |<-------- Fast Commit 1 ------->| |<-------- Fast Commit 2 ------->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
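 *
 * The tail value itself is small. As a sketch, its layout (mirroring how
 * the fields of struct ext4_fc_tail are filled in ext4_fc_write_tail()
 * below) is roughly:
 *
 *	struct ext4_fc_tail {
 *		__le32 fc_tid;	// TID this fast commit applies on top of
 *		__le32 fc_crc;	// CRC of everything written so far
 *	};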
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher level
 *    functions. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit code about the start of an inode update
 *
 * This function is called by high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that the next commit
 * operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
			      (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
			      &sbi->s_fc_q[FC_Q_STAGING] :
			      &sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
		       dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
		       dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
			      &sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
				   int parent_ino, int ino, int dlen,
				   const unsigned char *dname,
				   u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes inode in the fast commit space under TLV with tag EXT4_FC_TAG_INODE.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	ret = -ECANCELED;
	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		goto err;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		goto err;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		goto err;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		goto err;
	ret = 0;
err:
	brelse(iloc.bh);
	return ret;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
				       fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
		    commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
			 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
			 &sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is ok because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
					struct dentry_info_args *darg,
					struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
	       inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we calculate
	 * the number of blocks occupied by the inode.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			     darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per fast commit area,
 * and used by inodes during replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when doing new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		fc_regions = krealloc(state->fc_regions,
				      sizeof(struct ext4_fc_alloc_region) *
				      (state->fc_regions_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				      GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = fc_regions;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret)
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %ld to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}

static inline const char *tag2str(u16 tag)
{
	switch (tag) {
	case EXT4_FC_TAG_LINK:
		return "TAG_ADD_ENTRY";
	case EXT4_FC_TAG_UNLINK:
		return "TAG_DEL_ENTRY";
	case EXT4_FC_TAG_ADD_RANGE:
		return "TAG_ADD_RANGE";
	case EXT4_FC_TAG_CREAT:
		return "TAG_CREAT_DENTRY";
	case EXT4_FC_TAG_DEL_RANGE:
		return "TAG_DEL_RANGE";
	case EXT4_FC_TAG_INODE:
		return "TAG_INODE";
	case EXT4_FC_TAG_PAD:
		return "TAG_PAD";
	case EXT4_FC_TAG_TAIL:
		return "TAG_TAIL";
	case EXT4_FC_TAG_HEAD:
		return "TAG_HEAD";
	default:
		return "TAG_ERROR";
	}
}
1821
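/*
 * At the end of replay, walk every inode that was modified during replay
 * and mark both its data blocks and the extent tree index blocks that map
 * them as in-use in the block bitmaps, so the allocator's view matches the
 * replayed state.
 */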
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
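				/*
				 * Mark not just the data blocks, but also
				 * the extent tree index blocks on the path
				 * to this extent, as in-use.
				 */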
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to see if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
			blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
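/*
 * As an illustration (the tag sequence shown is an example, not a fixed
 * layout), the fast commit area for a single file append might be a TLV
 * stream of the form:
 *
 *   HEAD | ADD_RANGE | INODE | TAIL
 *
 * The scan below walks this stream one TLV at a time and accumulates a
 * running checksum that is finally compared against the CRC stored in
 * the tail tag.
 */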
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
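	/*
	 * Walk the block one TLV at a time: each iteration advances past
	 * the tag header plus the fc_len bytes of payload.
	 */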
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
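			/*
			 * The tail's checksum covers everything up to, but
			 * not including, its own fc_crc field.
			 */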
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						sizeof(tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for
 * ext4_fc_replay_scan() above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled,
	 * because we could still have fast commit blocks that need to
	 * be replayed even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

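/*
 * Human-readable names for the fast commit ineligibility reasons,
 * indexed by the EXT4_FC_REASON_* codes; keep this array in sync with
 * that enum.
 */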
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

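/*
 * Example output of this seq_file handler (values illustrative only;
 * it is typically wired up as the per-device fc_info file):
 *
 *   fc stats:
 *   120 commits
 *   3 ineligible
 *   240 numblks
 *   420us avg_commit_time
 *   Ineligible reasons:
 *   "Extended attributes changed":	1
 *   ...
 */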
int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}