// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
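 *
 * For illustration, here is a minimal sketch of how a reader of the log
 * walks one fast commit block TLV by TLV. The fc_tag/fc_len layout mirrors
 * how struct ext4_fc_tl is used throughout this file; apply_tlv() is a
 * hypothetical callback, not a real helper:
 *
 *	u8 *cur = block, *end = block + blocksize;
 *
 *	while (cur + sizeof(struct ext4_fc_tl) <= end) {
 *		struct ext4_fc_tl tl;
 *
 *		memcpy(&tl, cur, sizeof(tl));
 *		apply_tlv(le16_to_cpu(tl.fc_tag),	// EXT4_FC_TAG_*
 *			  le16_to_cpu(tl.fc_len),	// value length
 *			  cur + sizeof(tl));		// value bytes
 *		cur += sizeof(tl) + le16_to_cpu(tl.fc_len);
 *	}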
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; see the following section
 *     for details)
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
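 *
 * As a rough sketch (the caller shown is hypothetical; the real callers are
 * ext4's high level VFS entry points), an update path brackets its work
 * like this:
 *
 *	ext4_fc_start_update(inode);	// blocks while inode is COMMITTING
 *	// ... perform the actual inode update ...
 *	ext4_fc_stop_update(inode);	// wakes up a waiting fast commit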
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes
 *   all the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. Making one more
 *   fast commit fall back to a full commit after the stop call guarantees
 *   that the fast commit ineligible operation contained within
 *   ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is followed by
 *   at least 1 full commit.
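 *
 * For example, an operation without replay support would be bracketed
 * roughly as below (a sketch; the opaque operation and the exact reason
 * code are illustrative placeholders):
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	// ... perform the unsupported update, e.g. an xattr change ...
 *	ext4_fc_stop_ineligible(sb);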
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *  [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *  |<-------- Fast Commit 1 ------->| |<-------- Fast Commit 2 ------->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
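 *
 * The tail value itself is small. As a sketch, its layout (mirroring how
 * the fields of struct ext4_fc_tail are filled in ext4_fc_write_tail()
 * below) is roughly:
 *
 *	struct ext4_fc_tail {
 *		__le32 fc_tid;	// TID this fast commit applies on top of
 *		__le32 fc_crc;	// CRC of everything written so far
 *	};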
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher level
 *    functions. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit code about the start of an inode update
 *
 * This function is called by high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that the next commit
 * operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
			      (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
			      &sbi->s_fc_q[FC_Q_STAGING] :
			      &sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
		       dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
		       dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
			      &sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
				   int parent_ino, int ino, int dlen,
				   const unsigned char *dname,
				   u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes inode in the fast commit space under TLV with tag EXT4_FC_TAG_INODE.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	ret = -ECANCELED;
	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		goto err;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		goto err;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		goto err;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		goto err;
	ret = 0;
err:
	brelse(iloc.bh);
	return ret;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
				       fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If file system device is different from journal device, issue a cache
	 * flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
		    commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
			 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
			 &sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is ok because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
					struct dentry_info_args *darg,
					struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
	       inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we calculate
	 * the number of blocks occupied by the inode.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			     darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per fast commit area,
 * and used by inodes during replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when doing new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		fc_regions = krealloc(state->fc_regions,
				      sizeof(struct ext4_fc_alloc_region) *
				      (state->fc_regions_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				      GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = fc_regions;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret)
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %ld to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}

static inline const char *tag2str(u16 tag)
{
	switch (tag) {
	case EXT4_FC_TAG_LINK:
		return "TAG_ADD_ENTRY";
	case EXT4_FC_TAG_UNLINK:
		return "TAG_DEL_ENTRY";
	case EXT4_FC_TAG_ADD_RANGE:
		return "TAG_ADD_RANGE";
	case EXT4_FC_TAG_CREAT:
		return "TAG_CREAT_DENTRY";
	case EXT4_FC_TAG_DEL_RANGE:
		return "TAG_DEL_RANGE";
	case EXT4_FC_TAG_INODE:
		return "TAG_INODE";
	case EXT4_FC_TAG_PAD:
		return "TAG_PAD";
	case EXT4_FC_TAG_TAIL:
		return "TAG_TAIL";
	case EXT4_FC_TAG_HEAD:
		return "TAG_HEAD";
	default:
		return "TAG_ERROR";
	}
}
1821
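/*
 * At the end of replay, walk every inode that was modified during replay
 * and mark both its data blocks and the extent tree index blocks that map
 * them as in-use in the block bitmaps, so the allocator's view matches the
 * replayed state.
 */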
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
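				/*
				 * Mark not just the data blocks, but also
				 * the extent tree index blocks on the path
				 * to this extent, as in-use.
				 */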
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to see if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
			blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
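/*
 * As an illustration (the tag sequence shown is an example, not a fixed
 * layout), the fast commit area for a single file append might be a TLV
 * stream of the form:
 *
 *   HEAD | ADD_RANGE | INODE | TAIL
 *
 * The scan below walks this stream one TLV at a time and accumulates a
 * running checksum that is finally compared against the CRC stored in
 * the tail tag.
 */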
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
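	/*
	 * Walk the block one TLV at a time: each iteration advances past
	 * the tag header plus the fc_len bytes of payload.
	 */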
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
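			/*
			 * The tail's checksum covers everything up to, but
			 * not including, its own fc_crc field.
			 */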
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						sizeof(tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for
 * ext4_fc_replay_scan() above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled,
	 * because we could still have fast commit blocks that need to
	 * be replayed even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

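/*
 * Human-readable names for the fast commit ineligibility reasons,
 * indexed by the EXT4_FC_REASON_* codes; keep this array in sync with
 * that enum.
 */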
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

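/*
 * Example output of this seq_file handler (values illustrative only;
 * it is typically wired up as the per-device fc_info file):
 *
 *   fc stats:
 *   120 commits
 *   3 ineligible
 *   240 numblks
 *   420us avg_commit_time
 *   Ineligible reasons:
 *   "Extended attributes changed":	1
 *   ...
 */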
int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}