// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed and instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; see the following
 *     section for more details)
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - These make all
 *   the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), plus one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. Making one more
 *   fast commit fall back to a full commit after the stop call guarantees
 *   that the fast commit ineligible operation contained within
 *   ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is followed by
 *   at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---     Fast Commit 2      ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher level
 *    functions. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible().
 *
 * 3) Handle more ineligible cases.
 */
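
/*
 * Illustrative sketch (editor's addition, not kernel code): how a fast
 * commit block decomposes into TLVs during replay, assuming struct
 * ext4_fc_tl is a pair of little-endian 16-bit fields (fc_tag, fc_len)
 * immediately followed by fc_len bytes of value, as described above. A
 * hypothetical handle_tag() callback stands in for the per-tag replay
 * functions defined later in this file:
 *
 *	u8 *cur = block, *end = block + blocksize;
 *	struct ext4_fc_tl tl;
 *
 *	while (cur + sizeof(tl) <= end) {
 *		memcpy(&tl, cur, sizeof(tl));
 *		handle_tag(le16_to_cpu(tl.fc_tag), cur + sizeof(tl),
 *			   le16_to_cpu(tl.fc_len));
 *		cur += sizeof(tl) + le16_to_cpu(tl.fc_len);
 *	}
 *
 * ext4_fc_replay_scan() at the bottom of this file performs essentially
 * this walk, with CRC verification layered on top.
 */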

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}
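
/*
 * Illustrative usage (editor's sketch, not a real call site): a VFS-facing
 * update path brackets its work with the pair above so that a running fast
 * commit and the update never overlap:
 *
 *	ext4_fc_start_update(inode);	// blocks while inode is COMMITTING
 *	// ... modify the inode or its data ...
 *	ext4_fc_stop_update(inode);	// wakes any waiting fast commit
 *
 * In the kernel proper these calls are issued from higher level ext4 entry
 * points (see the TODO above about folding them into ext4_journal_start()).
 */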

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that the next commit
 * operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}
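
/*
 * Illustrative usage (editor's sketch): an operation without replay support
 * is bracketed like this, so every fast commit that overlaps it, plus one
 * commit after it, becomes a full commit. EXT4_FC_REASON_XATTR is an assumed
 * example reason code here; the actual EXT4_FC_REASON_* value depends on the
 * call site:
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	// ... perform the operation that fast commit cannot replay ...
 *	ext4_fc_stop_ineligible(sb);
 */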

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
			      (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
			      &sbi->s_fc_q[FC_Q_STAGING] :
			      &sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			    struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}
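
/*
 * Worked example for __track_range() above (editor's illustration): if the
 * inode already tracks the logical block range [10, 14] (i_fc_lblk_start =
 * 10, i_fc_lblk_len = 5) and an update arrives for [13, 20], then with
 * update == true:
 *
 *	i_fc_lblk_start = min(10, 13) = 10
 *	i_fc_lblk_len   = max(10 + 5 - 1, 20) - 10 + 1 = 11
 *
 * i.e. the tracked range grows to [10, 20], the union of the two ranges.
 */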

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
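
/*
 * Worked example (editor's illustration, assuming a 4096 byte journal
 * block and a 4 byte struct ext4_fc_tl): with the in-block offset at 4080,
 * a request for len = 16 does not fit, since 4096 - 4080 - 1 = 15 is not
 * greater than 16 + 4. The block is therefore closed with a PAD tag of
 *
 *	pad_len = bsize - off - 1 - sizeof(tl) = 4096 - 4080 - 1 - 4 = 11
 *
 * zeroed value bytes, the buffer is submitted, and the 16 bytes are handed
 * out at the start of a fresh jbd2 block.
 */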

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
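
/*
 * Illustrative on-disk layout of the tail written above (editor's sketch,
 * assuming the 4 byte tag/len header):
 *
 *	| fc_tag = TAIL | fc_len | fc_tid (4 bytes) | fc_crc (4 bytes) | pad |
 *
 * fc_len is sized so that the TLV covers the rest of the block, which is
 * what prevents the next fast commit from reusing the leftover space. Note
 * that the CRC is accumulated over everything up to and including fc_tid;
 * the fc_crc field itself is excluded (it is copied with a NULL crc above).
 */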

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
					int parent_ino, int ino, int dlen,
					const unsigned char *dname,
					u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}
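
/*
 * Illustrative layout of a dentry TLV as written above (editor's sketch,
 * assuming the 4 byte tag/len header and an 8 byte struct
 * ext4_fc_dentry_info holding two little-endian 32-bit inode numbers):
 *
 *	| fc_tag | fc_len | fc_parent_ino (4) | fc_ino (4) | dname (dlen) |
 *
 * fc_len counts the dentry info plus the name, but not the header itself.
 */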

/*
 * Writes the inode in the fast commit space under the inode TLV
 * (EXT4_FC_TAG_INODE). Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
					sb, fc_dentry->fcd_op,
					fc_dentry->fcd_parent, fc_dentry->fcd_ino,
					fc_dentry->fcd_name.len,
					fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * Weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time. For example,
	 * with an average of 4000ns and a new commit of 8000ns, the new
	 * average becomes (8000 + 3 * 4000) / 4 = 5000ns.
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = le16_to_cpu(tl->fc_len) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
					struct dentry_info_args *darg,
					struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since the data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
			       u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes = krealloc(
				state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
					       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
				u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				/ sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks used by the inode.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the
 * inode for which we are trying to create a dentry here should already
 * have been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
				 u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when doing new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = krealloc(
					state->fc_regions,
					state->fc_regions_size *
					sizeof(struct ext4_fc_alloc_region),
					GFP_KERNEL);
		if (!state->fc_regions)
			return -ENOMEM;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret)
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %ld to %d %lld",
			map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
			 u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				le32_to_cpu(lrange.fc_lblk) +
				le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}

static inline const char *tag2str(u16 tag)
{
	switch (tag) {
	case EXT4_FC_TAG_LINK:
		return "TAG_ADD_ENTRY";
	case EXT4_FC_TAG_UNLINK:
		return "TAG_DEL_ENTRY";
	case EXT4_FC_TAG_ADD_RANGE:
		return "TAG_ADD_RANGE";
	case EXT4_FC_TAG_CREAT:
		return "TAG_CREAT_DENTRY";
	case EXT4_FC_TAG_DEL_RANGE:
		return "TAG_DEL_RANGE";
	case EXT4_FC_TAG_INODE:
		return "TAG_INODE";
	case EXT4_FC_TAG_PAD:
		return "TAG_PAD";
	case EXT4_FC_TAG_TAIL:
		return "TAG_TAIL";
	case EXT4_FC_TAG_HEAD:
		return "TAG_HEAD";
	default:
		return "TAG_ERROR";
	}
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1861
/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to check whether it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible for
 * doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify the CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error code to indicate that an error occurred.
 * At the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to the number of tags that need to be replayed during the replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
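	/*
	 * A fast commit block is a sequence of TLVs:
	 *
	 *   +--------+--------+----------------+--------+--------+- - -
	 *   | fc_tag | fc_len | value (fc_len) | fc_tag | fc_len |
	 *   +--------+--------+----------------+--------+--------+- - -
	 *
	 * so each iteration below advances cur by sizeof(tl) + fc_len bytes.
	 */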
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
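			/*
			 * The running checksum covers the tag header and the
			 * tail body up to, but not including, the tail's own
			 * fc_crc field.
			 */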
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) +
					offsetof(struct ext4_fc_tail, fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
			    le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
					sizeof(tl) + le16_to_cpu(tl.fc_len));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan()
 * above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
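	/*
	 * If the scan phase did not validate any tags, there is nothing to
	 * replay; finalize the bitmaps and counters and stop here.
	 */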
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

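	/* Parse the block TLV by TLV and dispatch each tag to its replay handler. */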
	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
		memcpy(&tl, cur, sizeof(tl));
		val = cur + sizeof(tl);

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
			  tag2str(le16_to_cpu(tl.fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl.fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     le16_to_cpu(tl.fc_len), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     le16_to_cpu(tl.fc_len), 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
					     le16_to_cpu(tl.fc_len), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * Set the replay callback even if fast commit is disabled, because
	 * there could still be fast commit blocks that need to be replayed
	 * even though fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

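/*
 * Human-readable names for the fast commit ineligibility reasons, indexed by
 * the EXT4_FC_REASON_* codes and reported via ext4_fc_info_show() below.
 */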
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}