Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5ef99b9..472932b 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -112,7 +112,7 @@
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
- nblocks = jbd2_space_needed(journal);
+ nblocks = journal->j_max_transaction_buffers;
while (jbd2_log_space_left(journal) < nblocks) {
write_unlock(&journal->j_state_lock);
mutex_lock_io(&journal->j_checkpoint_mutex);
@@ -416,7 +416,7 @@
* jbd2_cleanup_journal_tail() doesn't get called all that often.
*/
if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
+ blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
return __jbd2_update_log_tail(journal, first_tid, blocknr);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8814600..b121d7d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -187,21 +187,49 @@
* use writepages() because with delayed allocation we may be doing
* block allocation in writepages().
*/
-static int journal_submit_inode_data_buffers(struct address_space *mapping,
- loff_t dirty_start, loff_t dirty_end)
+int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
- int ret;
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = mapping->nrpages * 2,
- .range_start = dirty_start,
- .range_end = dirty_end,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
};
- ret = generic_writepages(mapping, &wbc);
- return ret;
+ /*
+ * submit the inode data buffers. We use writepage
+ * instead of writepages. Because writepages can do
+ * block allocation with delalloc. We need to write
+ * only allocated blocks here.
+ */
+ return generic_writepages(mapping, &wbc);
}
+/* Send all the data buffers related to an inode */
+int jbd2_submit_inode_data(struct jbd2_inode *jinode)
+{
+
+ if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
+ return 0;
+
+ trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
+ return jbd2_journal_submit_inode_data_buffers(jinode);
+
+}
+EXPORT_SYMBOL(jbd2_submit_inode_data);
+
+int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
+{
+ if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
+ !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
+ return 0;
+ return filemap_fdatawait_range_keep_errors(
+ jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
+ jinode->i_dirty_end);
+}
+EXPORT_SYMBOL(jbd2_wait_inode_data);
+
/*
* Submit all the data buffers of inode associated with the transaction to
* disk.
@@ -215,29 +243,20 @@
{
struct jbd2_inode *jinode;
int err, ret = 0;
- struct address_space *mapping;
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
- loff_t dirty_start = jinode->i_dirty_start;
- loff_t dirty_end = jinode->i_dirty_end;
-
if (!(jinode->i_flags & JI_WRITE_DATA))
continue;
- mapping = jinode->i_vfs_inode->i_mapping;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- /*
- * submit the inode data buffers. We use writepage
- * instead of writepages. Because writepages can do
- * block allocation with delalloc. We need to write
- * only allocated blocks here.
- */
+ /* submit the inode data buffers. */
trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
- err = journal_submit_inode_data_buffers(mapping, dirty_start,
- dirty_end);
- if (!ret)
- ret = err;
+ if (journal->j_submit_inode_data_buffers) {
+ err = journal->j_submit_inode_data_buffers(jinode);
+ if (!ret)
+ ret = err;
+ }
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
@@ -248,6 +267,15 @@
return ret;
}
+int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+
+ return filemap_fdatawait_range_keep_errors(mapping,
+ jinode->i_dirty_start,
+ jinode->i_dirty_end);
+}
+
/*
* Wait for data submitted for writeout, refile inodes to proper
* transaction if needed.
@@ -262,18 +290,16 @@
/* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
- loff_t dirty_start = jinode->i_dirty_start;
- loff_t dirty_end = jinode->i_dirty_end;
-
if (!(jinode->i_flags & JI_WAIT_DATA))
continue;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- err = filemap_fdatawait_range_keep_errors(
- jinode->i_vfs_inode->i_mapping, dirty_start,
- dirty_end);
- if (!ret)
- ret = err;
+ /* wait for the inode data buffers writeout. */
+ if (journal->j_finish_inode_data_buffers) {
+ err = journal->j_finish_inode_data_buffers(jinode);
+ if (!ret)
+ ret = err;
+ }
spin_lock(&journal->j_list_lock);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
smp_mb();
@@ -413,6 +439,29 @@
J_ASSERT(journal->j_running_transaction != NULL);
J_ASSERT(journal->j_committing_transaction == NULL);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
+ while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_fc_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ write_unlock(&journal->j_state_lock);
+ schedule();
+ write_lock(&journal->j_state_lock);
+ finish_wait(&journal->j_fc_wait, &wait);
+ /*
+ * TODO: by blocking fast commits here, we are increasing
+ * fsync() latency slightly. Strictly speaking, we don't need
+ * to block fast commits until the transaction enters T_FLUSH
+ * state. So an optimization is possible where we block new fast
+ * commits here and wait for existing ones to complete
+ * just before we enter T_FLUSH. That way, the existing fast
+ * commits and this full commit can proceed parallely.
+ */
+ }
+ write_unlock(&journal->j_state_lock);
+
commit_transaction = journal->j_running_transaction;
trace_jbd2_start_commit(journal, commit_transaction);
@@ -420,6 +469,7 @@
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
+ journal->j_fc_off = 0;
J_ASSERT(commit_transaction->t_state == T_RUNNING);
commit_transaction->t_state = T_LOCKED;
@@ -482,10 +532,10 @@
if (jh->b_committed_data) {
struct buffer_head *bh = jh2bh(jh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
jbd2_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
jbd2_journal_refile_buffer(journal, jh);
}
@@ -560,8 +610,7 @@
stats.run.rs_logging = jiffies;
stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
stats.run.rs_logging);
- stats.run.rs_blocks =
- atomic_read(&commit_transaction->t_outstanding_credits);
+ stats.run.rs_blocks = commit_transaction->t_nr_buffers;
stats.run.rs_blocks_logged = 0;
J_ASSERT(commit_transaction->t_nr_buffers <=
@@ -642,8 +691,7 @@
/*
* start_this_handle() uses t_outstanding_credits to determine
- * the free space in the log, but this counter is changed
- * by jbd2_journal_next_log_block() also.
+ * the free space in the log.
*/
atomic_dec(&commit_transaction->t_outstanding_credits);
@@ -762,7 +810,7 @@
if (first_block < journal->j_tail)
freed += journal->j_last - journal->j_first;
/* Update tail only if we free significant amount of space */
- if (freed < journal->j_maxlen / 4)
+ if (freed < jbd2_journal_get_max_txn_bufs(journal))
update_tail = 0;
}
J_ASSERT(commit_transaction->t_state == T_COMMIT);
@@ -777,7 +825,7 @@
if (commit_transaction->t_need_data_flush &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
+ blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
/* Done it all: now write the commit record asynchronously. */
if (jbd2_has_feature_async_commit(journal)) {
@@ -884,12 +932,15 @@
stats.run.rs_blocks_logged++;
if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
- blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
+ blkdev_issue_flush(journal->j_dev, GFP_NOFS);
}
if (err)
jbd2_journal_abort(journal, err);
+ WARN_ON_ONCE(
+ atomic_read(&commit_transaction->t_outstanding_credits) < 0);
+
/*
* Now disk caches for filesystem device are flushed so we are safe to
* erase checkpointed transactions from the log by updating journal
@@ -920,6 +971,7 @@
transaction_t *cp_transaction;
struct buffer_head *bh;
int try_to_free = 0;
+ bool drop_ref;
jh = commit_transaction->t_forget;
spin_unlock(&journal->j_list_lock);
@@ -929,7 +981,7 @@
* done with it.
*/
get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
/*
@@ -1029,8 +1081,10 @@
try_to_free = 1;
}
JBUFFER_TRACE(jh, "refile or unfile buffer");
- __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ drop_ref = __jbd2_journal_refile_buffer(jh);
+ spin_unlock(&jh->b_state_lock);
+ if (drop_ref)
+ jbd2_journal_put_journal_head(jh);
if (try_to_free)
release_buffer_page(bh); /* Drops bh reference */
else
@@ -1115,12 +1169,16 @@
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
+ if (journal->j_fc_cleanup_callback)
+ journal->j_fc_cleanup_callback(journal, 1);
trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
write_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
+ journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
spin_lock(&journal->j_list_lock);
commit_transaction->t_state = T_FINISHED;
/* Check if the transaction can be dropped now that we are finished */
@@ -1132,6 +1190,7 @@
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit);
+ wake_up(&journal->j_fc_wait);
/*
* Calculate overall stats
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b7c5819..b748329 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -91,6 +91,8 @@
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
+EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers);
+EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
@@ -157,7 +159,9 @@
*
* 1) COMMIT: Every so often we need to commit the current state of the
* filesystem to disk. The journal thread is responsible for writing
- * all of the metadata buffers to disk.
+ * all of the metadata buffers to disk. If a fast commit is ongoing
+ * journal thread waits until it's done and then continues from
+ * there on.
*
* 2) CHECKPOINT: We cannot reuse a used section of the log file until all
* of the data in that part of the log has been rewritten elsewhere on
@@ -362,7 +366,7 @@
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
- jbd_lock_bh_state(bh_in);
+ spin_lock(&jh_in->b_state_lock);
repeat:
/*
* If a new transaction has already done a buffer copy-out, then
@@ -404,13 +408,13 @@
if (need_copy_out && !done_copy_out) {
char *tmp;
- jbd_unlock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
brelse(new_bh);
return -ENOMEM;
}
- jbd_lock_bh_state(bh_in);
+ spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
goto repeat;
@@ -463,7 +467,7 @@
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
set_buffer_shadow(bh_in);
- jbd_unlock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
return do_escape | (done_copy_out << 1);
}
@@ -562,12 +566,14 @@
}
/**
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
+ * jbd2_journal_force_commit_nested - Force and wait upon a commit if the
+ * calling process is not within transaction.
*
* @journal: journal to force
* Returns true if progress was made.
+ *
+ * This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
*/
int jbd2_journal_force_commit_nested(journal_t *journal)
{
@@ -578,7 +584,7 @@
}
/**
- * int journal_force_commit() - force any uncommitted transactions
+ * jbd2_journal_force_commit() - force any uncommitted transactions
* @journal: journal to force
*
* Caller want unconditional commit. We can only force the running transaction
@@ -714,6 +720,85 @@
return err;
}
+/*
+ * Start a fast commit. If there's an ongoing fast or full commit wait for
+ * it to complete. Returns 0 if a new fast commit was started. Returns -EALREADY
+ * if a fast commit is not needed, either because there's an already a commit
+ * going on or this tid has already been committed. Returns -EINVAL if no jbd2
+ * commit has yet been performed.
+ */
+int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
+{
+ if (unlikely(is_journal_aborted(journal)))
+ return -EIO;
+ /*
+ * Fast commits only allowed if at least one full commit has
+ * been processed.
+ */
+ if (!journal->j_stats.ts_tid)
+ return -EINVAL;
+
+ write_lock(&journal->j_state_lock);
+ if (tid <= journal->j_commit_sequence) {
+ write_unlock(&journal->j_state_lock);
+ return -EALREADY;
+ }
+
+ if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_fc_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ write_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_fc_wait, &wait);
+ return -EALREADY;
+ }
+ journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
+ write_unlock(&journal->j_state_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_begin_commit);
+
+/*
+ * Stop a fast commit. If fallback is set, this function starts commit of
+ * TID tid before any other fast commit can start.
+ */
+static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
+{
+ if (journal->j_fc_cleanup_callback)
+ journal->j_fc_cleanup_callback(journal, 0);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
+ if (fallback)
+ journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
+ write_unlock(&journal->j_state_lock);
+ wake_up(&journal->j_fc_wait);
+ if (fallback)
+ return jbd2_complete_transaction(journal, tid);
+ return 0;
+}
+
+int jbd2_fc_end_commit(journal_t *journal)
+{
+ return __jbd2_fc_end_commit(journal, 0, false);
+}
+EXPORT_SYMBOL(jbd2_fc_end_commit);
+
+int jbd2_fc_end_commit_fallback(journal_t *journal)
+{
+ tid_t tid;
+
+ read_lock(&journal->j_state_lock);
+ tid = journal->j_running_transaction ?
+ journal->j_running_transaction->t_tid : 0;
+ read_unlock(&journal->j_state_lock);
+ return __jbd2_fc_end_commit(journal, tid, true);
+}
+EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
+
/* Return 1 when transaction with given tid has already committed. */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
@@ -782,6 +867,100 @@
return jbd2_journal_bmap(journal, blocknr, retp);
}
+/* Map one fast commit buffer for use by the file system */
+int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
+{
+ unsigned long long pblock;
+ unsigned long blocknr;
+ int ret = 0;
+ struct buffer_head *bh;
+ int fc_off;
+
+ *bh_out = NULL;
+
+ if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) {
+ fc_off = journal->j_fc_off;
+ blocknr = journal->j_fc_first + fc_off;
+ journal->j_fc_off++;
+ } else {
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return ret;
+
+ ret = jbd2_journal_bmap(journal, blocknr, &pblock);
+ if (ret)
+ return ret;
+
+ bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
+ if (!bh)
+ return -ENOMEM;
+
+
+ journal->j_fc_wbuf[fc_off] = bh;
+
+ *bh_out = bh;
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_get_buf);
+
+/*
+ * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf
+ * for completion.
+ */
+int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
+{
+ struct buffer_head *bh;
+ int i, j_fc_off;
+
+ j_fc_off = journal->j_fc_off;
+
+ /*
+ * Wait in reverse order to minimize chances of us being woken up before
+ * all IOs have completed
+ */
+ for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
+ bh = journal->j_fc_wbuf[i];
+ wait_on_buffer(bh);
+ put_bh(bh);
+ journal->j_fc_wbuf[i] = NULL;
+ if (unlikely(!buffer_uptodate(bh)))
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_wait_bufs);
+
+/*
+ * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf
+ * for completion.
+ */
+int jbd2_fc_release_bufs(journal_t *journal)
+{
+ struct buffer_head *bh;
+ int i, j_fc_off;
+
+ j_fc_off = journal->j_fc_off;
+
+ /*
+ * Wait in reverse order to minimize chances of us being woken up before
+ * all IOs have completed
+ */
+ for (i = j_fc_off - 1; i >= 0; i--) {
+ bh = journal->j_fc_wbuf[i];
+ if (!bh)
+ break;
+ put_bh(bh);
+ journal->j_fc_wbuf[i] = NULL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_fc_release_bufs);
+
/*
* Conversion of logical to physical block numbers for the journal
*
@@ -794,18 +973,22 @@
{
int err = 0;
unsigned long long ret;
+ sector_t block = 0;
if (journal->j_inode) {
- ret = bmap(journal->j_inode, blocknr);
- if (ret)
- *retp = ret;
- else {
+ block = blocknr;
+ ret = bmap(journal->j_inode, &block);
+
+ if (ret || !block) {
printk(KERN_ALERT "%s: journal block not found "
"at offset %lu on %s\n",
__func__, blocknr, journal->j_devname);
err = -EIO;
jbd2_journal_abort(journal, err);
+ } else {
+ *retp = block;
}
+
} else {
*retp = blocknr; /* +journal->j_blk_offset */
}
@@ -839,6 +1022,7 @@
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh)
return NULL;
+ atomic_dec(&transaction->t_outstanding_credits);
lock_buffer(bh);
memset(bh->b_data, 0, journal->j_blocksize);
header = (journal_header_t *)bh->b_data;
@@ -1073,12 +1257,11 @@
return seq_release(inode, file);
}
-static const struct file_operations jbd2_seq_info_fops = {
- .owner = THIS_MODULE,
- .open = jbd2_seq_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = jbd2_seq_info_release,
+static const struct proc_ops jbd2_info_proc_ops = {
+ .proc_open = jbd2_seq_info_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = jbd2_seq_info_release,
};
static struct proc_dir_entry *proc_jbd2_stats;
@@ -1088,7 +1271,7 @@
journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
if (journal->j_proc_entry) {
proc_create_data("info", S_IRUGO, journal->j_proc_entry,
- &jbd2_seq_info_fops, journal);
+ &jbd2_info_proc_ops, journal);
}
}
@@ -1098,6 +1281,16 @@
remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
+/* Minimum size of descriptor tag */
+static int jbd2_min_tag_size(void)
+{
+ /*
+ * Tag with 32-bit block numbers does not use last four bytes of the
+ * structure
+ */
+ return sizeof(journal_block_tag_t) - 4;
+}
+
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
@@ -1126,6 +1319,8 @@
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
init_waitqueue_head(&journal->j_wait_reserved);
+ init_waitqueue_head(&journal->j_fc_wait);
+ mutex_init(&journal->j_abort_mutex);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -1155,9 +1350,11 @@
journal->j_dev = bdev;
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
- journal->j_maxlen = len;
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
+ journal->j_total_len = len;
+ /* We need enough buffers to write out full descriptor block. */
+ n = journal->j_blocksize / jbd2_min_tag_size();
journal->j_wbufsize = n;
+ journal->j_fc_wbuf = NULL;
journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!journal->j_wbuf)
@@ -1232,11 +1429,14 @@
journal_t *jbd2_journal_init_inode(struct inode *inode)
{
journal_t *journal;
+ sector_t blocknr;
char *p;
- unsigned long long blocknr;
+ int err = 0;
- blocknr = bmap(inode, 0);
- if (!blocknr) {
+ blocknr = 0;
+ err = bmap(inode, &blocknr);
+
+ if (err || !blocknr) {
pr_err("%s: Cannot locate journal superblock\n",
__func__);
return NULL;
@@ -1266,7 +1466,7 @@
* superblock as being NULL to prevent the journal destroy from writing
* back a bogus superblock.
*/
-static void journal_fail_superblock (journal_t *journal)
+static void journal_fail_superblock(journal_t *journal)
{
struct buffer_head *bh = journal->j_sb_buffer;
brelse(bh);
@@ -1297,15 +1497,22 @@
journal->j_first = first;
journal->j_last = last;
- journal->j_head = first;
- journal->j_tail = first;
- journal->j_free = last - first;
+ journal->j_head = journal->j_first;
+ journal->j_tail = journal->j_first;
+ journal->j_free = journal->j_last - journal->j_first;
journal->j_tail_sequence = journal->j_transaction_sequence;
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;
- journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+ journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
+
+ /*
+ * Now that journal recovery is done, turn fast commits off here. This
+ * way, if fast commit was enabled before the crash but if now FS has
+ * disabled it, we don't enable fast commits.
+ */
+ jbd2_clear_feature_fast_commit(journal);
/*
* As a special case, if the on-disk copy is already marked as needing
@@ -1386,7 +1593,8 @@
printk(KERN_ERR "JBD2: Error %d detected when updating "
"journal superblock for %s.\n", ret,
journal->j_devname);
- jbd2_journal_abort(journal, ret);
+ if (!is_journal_aborted(journal))
+ jbd2_journal_abort(journal, ret);
}
return ret;
@@ -1444,6 +1652,7 @@
static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
{
journal_superblock_t *sb = journal->j_superblock;
+ bool had_fast_commit = false;
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
lock_buffer(journal->j_sb_buffer);
@@ -1457,9 +1666,20 @@
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
sb->s_start = cpu_to_be32(0);
+ if (jbd2_has_feature_fast_commit(journal)) {
+ /*
+ * When journal is clean, no need to commit fast commit flag and
+ * make file system incompatible with older kernels.
+ */
+ jbd2_clear_feature_fast_commit(journal);
+ had_fast_commit = true;
+ }
jbd2_write_superblock(journal, write_op);
+ if (had_fast_commit)
+ jbd2_set_feature_fast_commit(journal);
+
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
journal->j_flags |= JBD2_FLUSHED;
@@ -1490,6 +1710,21 @@
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
+static int journal_revoke_records_per_block(journal_t *journal)
+{
+ int record_size;
+ int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
+
+ if (jbd2_has_feature_64bit(journal))
+ record_size = 8;
+ else
+ record_size = 4;
+
+ if (jbd2_journal_has_csum_v2or3(journal))
+ space -= sizeof(struct jbd2_journal_block_tail);
+ return space / record_size;
+}
+
/*
* Read the superblock for a given journal, performing initial
* validation of the format.
@@ -1538,15 +1773,15 @@
goto out;
}
- if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
- journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
- else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
+ if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
+ journal->j_total_len = be32_to_cpu(sb->s_maxlen);
+ else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
printk(KERN_WARNING "JBD2: journal file too short\n");
goto out;
}
if (be32_to_cpu(sb->s_first) == 0 ||
- be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
+ be32_to_cpu(sb->s_first) >= journal->j_total_len) {
printk(KERN_WARNING
"JBD2: Invalid start block of journal: %u\n",
be32_to_cpu(sb->s_first));
@@ -1598,6 +1833,8 @@
sizeof(sb->s_uuid));
}
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
set_buffer_verified(bh);
return 0;
@@ -1616,6 +1853,7 @@
{
int err;
journal_superblock_t *sb;
+ int num_fc_blocks;
err = journal_get_superblock(journal);
if (err)
@@ -1626,15 +1864,26 @@
journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
journal->j_tail = be32_to_cpu(sb->s_start);
journal->j_first = be32_to_cpu(sb->s_first);
- journal->j_last = be32_to_cpu(sb->s_maxlen);
journal->j_errno = be32_to_cpu(sb->s_errno);
+ journal->j_last = be32_to_cpu(sb->s_maxlen);
+
+ if (jbd2_has_feature_fast_commit(journal)) {
+ journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
+ num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks);
+ if (!num_fc_blocks)
+ num_fc_blocks = JBD2_MIN_FC_BLOCKS;
+ if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
+ journal->j_last = journal->j_fc_last - num_fc_blocks;
+ journal->j_fc_first = journal->j_last + 1;
+ journal->j_fc_off = 0;
+ }
return 0;
}
/**
- * int jbd2_journal_load() - Read journal from disk.
+ * jbd2_journal_load() - Read journal from disk.
* @journal: Journal to act on.
*
* Given a journal_t structure which tells us which disk blocks contain
@@ -1704,7 +1953,7 @@
}
/**
- * void jbd2_journal_destroy() - Release a journal_t structure.
+ * jbd2_journal_destroy() - Release a journal_t structure.
* @journal: Journal to act on.
*
* Release a journal_t structure once it is no longer in use by the
@@ -1772,6 +2021,7 @@
jbd2_journal_destroy_revoke(journal);
if (journal->j_chksum_driver)
crypto_free_shash(journal->j_chksum_driver);
+ kfree(journal->j_fc_wbuf);
kfree(journal->j_wbuf);
kfree(journal);
@@ -1780,7 +2030,7 @@
/**
- *int jbd2_journal_check_used_features () - Check if features specified are used.
+ * jbd2_journal_check_used_features() - Check if features specified are used.
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1790,7 +2040,7 @@
* features. Return true (non-zero) if it does.
**/
-int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
journal_superblock_t *sb;
@@ -1815,7 +2065,7 @@
}
/**
- * int jbd2_journal_check_available_features() - Check feature set in journalling layer
+ * jbd2_journal_check_available_features() - Check feature set in journalling layer
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1825,7 +2075,7 @@
* all of a given set of features on this journal. Return true
* (non-zero) if it can. */
-int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
if (!compat && !ro && !incompat)
@@ -1846,8 +2096,39 @@
return 0;
}
+static int
+jbd2_journal_initialize_fast_commit(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+ unsigned long long num_fc_blks;
+
+ num_fc_blks = be32_to_cpu(sb->s_num_fc_blks);
+ if (num_fc_blks == 0)
+ num_fc_blks = JBD2_MIN_FC_BLOCKS;
+ if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
+ return -ENOSPC;
+
+ /* Are we called twice? */
+ WARN_ON(journal->j_fc_wbuf != NULL);
+ journal->j_fc_wbuf = kmalloc_array(num_fc_blks,
+ sizeof(struct buffer_head *), GFP_KERNEL);
+ if (!journal->j_fc_wbuf)
+ return -ENOMEM;
+
+ journal->j_fc_wbufsize = num_fc_blks;
+ journal->j_fc_last = journal->j_last;
+ journal->j_last = journal->j_fc_last - num_fc_blks;
+ journal->j_fc_first = journal->j_last + 1;
+ journal->j_fc_off = 0;
+ journal->j_free = journal->j_last - journal->j_first;
+ journal->j_max_transaction_buffers =
+ jbd2_journal_get_max_txn_bufs(journal);
+
+ return 0;
+}
+
/**
- * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
+ * jbd2_journal_set_features() - Mark a given journal feature in the superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1858,7 +2139,7 @@
*
*/
-int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
#define INCOMPAT_FEATURE_ON(f) \
@@ -1889,6 +2170,13 @@
sb = journal->j_superblock;
+ if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) {
+ if (jbd2_journal_initialize_fast_commit(journal)) {
+ pr_err("JBD2: Cannot enable fast commits.\n");
+ return 0;
+ }
+ }
+
/* Load the checksum driver if necessary */
if ((journal->j_chksum_driver == NULL) &&
INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
@@ -1922,6 +2210,8 @@
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
unlock_buffer(journal->j_sb_buffer);
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
return 1;
#undef COMPAT_FEATURE_ON
@@ -1929,7 +2219,7 @@
}
/*
- * jbd2_journal_clear_features () - Clear a given journal feature in the
+ * jbd2_journal_clear_features() - Clear a given journal feature in the
* superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
@@ -1952,11 +2242,13 @@
sb->s_feature_compat &= ~cpu_to_be32(compat);
sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
sb->s_feature_incompat &= ~cpu_to_be32(incompat);
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);
/**
- * int jbd2_journal_flush () - Flush journal
+ * jbd2_journal_flush() - Flush journal
* @journal: Journal to act on.
*
* Flush all data for a given journal to disk and empty the journal.
@@ -2031,7 +2323,7 @@
}
/**
- * int jbd2_journal_wipe() - Wipe journal contents
+ * jbd2_journal_wipe() - Wipe journal contents
* @journal: Journal to act on.
* @write: flag (see below)
*
@@ -2072,7 +2364,7 @@
}
/**
- * void jbd2_journal_abort () - Shutdown the journal immediately.
+ * jbd2_journal_abort () - Shutdown the journal immediately.
* @journal: the journal to shutdown.
* @errno: an error number to record in the journal indicating
* the reason for the shutdown.
@@ -2117,6 +2409,13 @@
transaction_t *transaction;
/*
+ * Lock the aborting procedure until everything is done, this avoid
+ * races between filesystem's error handling flow (e.g. ext4_abort()),
+ * ensure panic after the error info is written into journal's
+ * superblock.
+ */
+ mutex_lock(&journal->j_abort_mutex);
+ /*
* ESHUTDOWN always takes precedence because a file system check
* caused by any other journal abort error is not required after
* a shutdown triggered.
@@ -2130,6 +2429,7 @@
journal->j_errno = errno;
jbd2_journal_update_sb_errno(journal);
}
+ mutex_unlock(&journal->j_abort_mutex);
return;
}
@@ -2151,14 +2451,11 @@
* layer could realise that a filesystem check is needed.
*/
jbd2_journal_update_sb_errno(journal);
-
- write_lock(&journal->j_state_lock);
- journal->j_flags |= JBD2_REC_ERR;
- write_unlock(&journal->j_state_lock);
+ mutex_unlock(&journal->j_abort_mutex);
}
/**
- * int jbd2_journal_errno () - returns the journal's error state.
+ * jbd2_journal_errno() - returns the journal's error state.
* @journal: journal to examine.
*
* This is the errno number set with jbd2_journal_abort(), the last
@@ -2182,7 +2479,7 @@
}
/**
- * int jbd2_journal_clear_err () - clears the journal's error state
+ * jbd2_journal_clear_err() - clears the journal's error state
* @journal: journal to act on.
*
* An error must be cleared or acked to take a FS out of readonly
@@ -2202,7 +2499,7 @@
}
/**
- * void jbd2_journal_ack_err() - Ack journal err.
+ * jbd2_journal_ack_err() - Ack journal err.
* @journal: journal to act on.
*
* An error must be cleared or acked to take a FS out of readonly
@@ -2390,6 +2687,8 @@
ret = kmem_cache_zalloc(jbd2_journal_head_cache,
GFP_NOFS | __GFP_NOFAIL);
}
+ if (ret)
+ spin_lock_init(&ret->b_state_lock);
return ret;
}
@@ -2496,12 +2795,12 @@
jbd_unlock_bh_journal_head(bh);
return jh;
}
+EXPORT_SYMBOL(jbd2_journal_grab_journal_head);
static void __journal_remove_journal_head(struct buffer_head *bh)
{
struct journal_head *jh = bh2jh(bh);
- J_ASSERT_JH(jh, jh->b_jcount >= 0);
J_ASSERT_JH(jh, jh->b_transaction == NULL);
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
@@ -2509,17 +2808,23 @@
J_ASSERT_BH(bh, buffer_jbd(bh));
J_ASSERT_BH(bh, jh2bh(jh) == bh);
BUFFER_TRACE(bh, "remove journal_head");
- if (jh->b_frozen_data) {
- printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
- jbd2_free(jh->b_frozen_data, bh->b_size);
- }
- if (jh->b_committed_data) {
- printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
- jbd2_free(jh->b_committed_data, bh->b_size);
- }
+
+ /* Unlink before dropping the lock */
bh->b_private = NULL;
jh->b_bh = NULL; /* debug, really */
clear_buffer_jbd(bh);
+}
+
+static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
+{
+ if (jh->b_frozen_data) {
+ printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
+ jbd2_free(jh->b_frozen_data, b_size);
+ }
+ if (jh->b_committed_data) {
+ printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
+ jbd2_free(jh->b_committed_data, b_size);
+ }
journal_free_journal_head(jh);
}
@@ -2537,10 +2842,13 @@
if (!jh->b_jcount) {
__journal_remove_journal_head(bh);
jbd_unlock_bh_journal_head(bh);
+ journal_release_journal_head(jh, bh->b_size);
__brelse(bh);
- } else
+ } else {
jbd_unlock_bh_journal_head(bh);
+ }
}
+EXPORT_SYMBOL(jbd2_journal_put_journal_head);
/*
* Initialize jbd inode head
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index a4967b2..1e07dfa 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -35,7 +35,6 @@
int nr_revoke_hits;
};
-enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
@@ -75,8 +74,8 @@
/* Do up to 128K of readahead */
max = start + (128 * 1024 / journal->j_blocksize);
- if (max > journal->j_maxlen)
- max = journal->j_maxlen;
+ if (max > journal->j_total_len)
+ max = journal->j_total_len;
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
* a time to the block device IO layer. */
@@ -135,7 +134,7 @@
*bhp = NULL;
- if (offset >= journal->j_maxlen) {
+ if (offset >= journal->j_total_len) {
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
return -EFSCORRUPTED;
}
@@ -225,10 +224,50 @@
/* Make sure we wrap around the log correctly! */
#define wrap(journal, var) \
do { \
- if (var >= (journal)->j_last) \
- var -= ((journal)->j_last - (journal)->j_first); \
+ unsigned long _wrap_last = \
+ jbd2_has_feature_fast_commit(journal) ? \
+ (journal)->j_fc_last : (journal)->j_last; \
+ \
+ if (var >= _wrap_last) \
+ var -= (_wrap_last - (journal)->j_first); \
} while (0)
+static int fc_do_one_pass(journal_t *journal,
+ struct recovery_info *info, enum passtype pass)
+{
+ unsigned int expected_commit_id = info->end_transaction;
+ unsigned long next_fc_block;
+ struct buffer_head *bh;
+ int err = 0;
+
+ next_fc_block = journal->j_fc_first;
+ if (!journal->j_fc_replay_callback)
+ return 0;
+
+ while (next_fc_block <= journal->j_fc_last) {
+ jbd_debug(3, "Fast commit replay: next block %ld\n",
+ next_fc_block);
+ err = jread(&bh, journal, next_fc_block);
+ if (err) {
+ jbd_debug(3, "Fast commit replay: read error\n");
+ break;
+ }
+
+ err = journal->j_fc_replay_callback(journal, bh, pass,
+ next_fc_block - journal->j_fc_first,
+ expected_commit_id);
+ next_fc_block++;
+ if (err < 0 || err == JBD2_FC_REPLAY_STOP)
+ break;
+ err = 0;
+ }
+
+ if (err)
+ jbd_debug(3, "Fast commit replay failed, err = %d\n", err);
+
+ return err;
+}
+
/**
* jbd2_journal_recover - recovers a on-disk journal
* @journal: the journal to recover
@@ -286,7 +325,7 @@
err = err2;
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER) {
- err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL);
if (!err)
err = err2;
}
@@ -428,6 +467,8 @@
__u32 crc32_sum = ~0; /* Transactional Checksums */
int descr_csum_size = 0;
int block_error = 0;
+ bool need_check_commit_time = false;
+ __u64 last_trans_commit_time = 0, commit_time;
/*
* First thing is to establish what we expect to find in the log
@@ -470,7 +511,9 @@
break;
jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
- next_commit_ID, next_log_block, journal->j_last);
+ next_commit_ID, next_log_block,
+ jbd2_has_feature_fast_commit(journal) ?
+ journal->j_fc_last : journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
@@ -520,12 +563,21 @@
if (descr_csum_size > 0 &&
!jbd2_descriptor_block_csum_verify(journal,
bh->b_data)) {
- printk(KERN_ERR "JBD2: Invalid checksum "
- "recovering block %lu in log\n",
- next_log_block);
- err = -EFSBADCRC;
- brelse(bh);
- goto failed;
+ /*
+ * PASS_SCAN can see stale blocks due to lazy
+ * journal init. Don't error out on those yet.
+ */
+ if (pass != PASS_SCAN) {
+ pr_err("JBD2: Invalid checksum recovering block %lu in log\n",
+ next_log_block);
+ err = -EFSBADCRC;
+ brelse(bh);
+ goto failed;
+ }
+ need_check_commit_time = true;
+ jbd_debug(1,
+ "invalid descriptor block found in %lu\n",
+ next_log_block);
}
/* If it is a valid descriptor block, replay it
@@ -535,6 +587,7 @@
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
jbd2_has_feature_checksum(journal) &&
+ !need_check_commit_time &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
@@ -683,21 +736,48 @@
* mentioned conditions. Hence assume
* "Interrupted Commit".)
*/
+ commit_time = be64_to_cpu(
+ ((struct commit_header *)bh->b_data)->h_commit_sec);
+ /*
+ * If need_check_commit_time is set, it means we are in
+ * PASS_SCAN and csum verify failed before. If
+ * commit_time is increasing, it's the same journal,
+ * otherwise it is stale journal block, just end this
+ * recovery.
+ */
+ if (need_check_commit_time) {
+ if (commit_time >= last_trans_commit_time) {
+ pr_err("JBD2: Invalid checksum found in transaction %u\n",
+ next_commit_ID);
+ err = -EFSBADCRC;
+ brelse(bh);
+ goto failed;
+ }
+ ignore_crc_mismatch:
+ /*
+ * It likely does not belong to same journal,
+ * just end this recovery with success.
+ */
+ jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
+ next_commit_ID);
+ err = 0;
+ brelse(bh);
+ goto done;
+ }
- /* Found an expected commit block: if checksums
- * are present verify them in PASS_SCAN; else not
+ /*
+ * Found an expected commit block: if checksums
+ * are present, verify them in PASS_SCAN; else not
* much to do other than move on to the next sequence
- * number. */
+ * number.
+ */
if (pass == PASS_SCAN &&
jbd2_has_feature_checksum(journal)) {
- int chksum_err, chksum_seen;
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
unsigned found_chksum =
be32_to_cpu(cbh->h_chksum[0]);
- chksum_err = chksum_seen = 0;
-
if (info->end_transaction) {
journal->j_failed_commit =
info->end_transaction;
@@ -705,42 +785,25 @@
break;
}
- if (crc32_sum == found_chksum &&
- cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
- cbh->h_chksum_size ==
- JBD2_CRC32_CHKSUM_SIZE)
- chksum_seen = 1;
- else if (!(cbh->h_chksum_type == 0 &&
- cbh->h_chksum_size == 0 &&
- found_chksum == 0 &&
- !chksum_seen))
- /*
- * If fs is mounted using an old kernel and then
- * kernel with journal_chksum is used then we
- * get a situation where the journal flag has
- * checksum flag set but checksums are not
- * present i.e chksum = 0, in the individual
- * commit blocks.
- * Hence to avoid checksum failures, in this
- * situation, this extra check is added.
- */
- chksum_err = 1;
+ /* Neither checksum match nor unused? */
+ if (!((crc32_sum == found_chksum &&
+ cbh->h_chksum_type ==
+ JBD2_CRC32_CHKSUM &&
+ cbh->h_chksum_size ==
+ JBD2_CRC32_CHKSUM_SIZE) ||
+ (cbh->h_chksum_type == 0 &&
+ cbh->h_chksum_size == 0 &&
+ found_chksum == 0)))
+ goto chksum_error;
- if (chksum_err) {
- info->end_transaction = next_commit_ID;
-
- if (!jbd2_has_feature_async_commit(journal)) {
- journal->j_failed_commit =
- next_commit_ID;
- brelse(bh);
- break;
- }
- }
crc32_sum = ~0;
}
if (pass == PASS_SCAN &&
!jbd2_commit_block_csum_verify(journal,
bh->b_data)) {
+ chksum_error:
+ if (commit_time < last_trans_commit_time)
+ goto ignore_crc_mismatch;
info->end_transaction = next_commit_ID;
if (!jbd2_has_feature_async_commit(journal)) {
@@ -750,11 +813,24 @@
break;
}
}
+ if (pass == PASS_SCAN)
+ last_trans_commit_time = commit_time;
brelse(bh);
next_commit_ID++;
continue;
case JBD2_REVOKE_BLOCK:
+ /*
+ * Check revoke block crc in pass_scan, if csum verify
+ * failed, check commit block time later.
+ */
+ if (pass == PASS_SCAN &&
+ !jbd2_descriptor_block_csum_verify(journal,
+ bh->b_data)) {
+ jbd_debug(1, "JBD2: invalid revoke block found in %lu\n",
+ next_log_block);
+ need_check_commit_time = true;
+ }
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
@@ -799,6 +875,13 @@
success = -EIO;
}
}
+
+ if (jbd2_has_feature_fast_commit(journal) && pass != PASS_REVOKE) {
+ err = fc_do_one_pass(journal, info, pass);
+ if (err)
+ success = err;
+ }
+
if (block_error && success == 0)
success = -EIO;
return success;
@@ -822,9 +905,6 @@
offset = sizeof(jbd2_journal_revoke_header_t);
rcount = be32_to_cpu(header->r_count);
- if (!jbd2_descriptor_block_csum_verify(journal, header))
- return -EFSBADCRC;
-
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_block_tail);
if (rcount > journal->j_blocksize - csum_size)
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f08073d..fa60878 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -371,6 +371,11 @@
}
#endif
+ if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) {
+ if (!bh_in)
+ brelse(bh);
+ return -EIO;
+ }
/* We really ought not ever to revoke twice in a row without
first having the revoke cancelled: it's illegal to free a
block twice without allocating it in between! */
@@ -391,6 +396,7 @@
__brelse(bh);
}
}
+ handle->h_revoke_credits--;
jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index be05fb9..e8fc45f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -63,6 +63,28 @@
}
/*
+ * Base amount of descriptor blocks we reserve for each transaction.
+ */
+static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+{
+ int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+ int tags_per_block;
+
+ /* Subtract UUID */
+ tag_space -= 16;
+ if (jbd2_journal_has_csum_v2or3(journal))
+ tag_space -= sizeof(struct jbd2_journal_block_tail);
+ /* Commit code leaves a slack space of 16 bytes at the end of block */
+ tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+ /*
+ * Revoke descriptors are accounted separately so we need to reserve
+ * space for commit block and normal transaction descriptor blocks.
+ */
+ return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
+ tags_per_block);
+}
+
+/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
* Simply initialise a new transaction. Initialize it in
@@ -88,7 +110,9 @@
spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
+ jbd2_descriptor_blocks_per_trans(journal) +
atomic_read(&journal->j_reserved_credits));
+ atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
@@ -260,12 +284,13 @@
* *before* starting to dirty potentially checkpointed buffers
* in the new transaction.
*/
- if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
- if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+ if (jbd2_log_space_left(journal) <
+ journal->j_max_transaction_buffers)
__jbd2_log_wait_for_space(journal);
write_unlock(&journal->j_state_lock);
return 1;
@@ -301,12 +326,12 @@
gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
- int blocks = handle->h_buffer_credits;
+ int blocks = handle->h_total_credits;
int rsv_blocks = 0;
unsigned long ts = jiffies;
if (handle->h_rsv_handle)
- rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+ rsv_blocks = handle->h_rsv_handle->h_total_credits;
/*
* Limit the number of reserved credits to 1/2 of maximum transaction
@@ -324,7 +349,12 @@
}
alloc_transaction:
- if (!journal->j_running_transaction) {
+ /*
+ * This check is racy but it is just an optimization of allocating new
+ * transaction early if there are high chances we'll need it. If we
+ * guess wrong, we'll retry or free unused transaction.
+ */
+ if (!data_race(journal->j_running_transaction)) {
/*
* If __GFP_FS is not present, then we may be being called from
* inside the fs writeback layer, so we MUST NOT fail.
@@ -407,6 +437,7 @@
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
handle->h_requested_credits = blocks;
+ handle->h_revoke_credits_requested = handle->h_revoke_credits;
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
@@ -433,15 +464,15 @@
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
- handle->h_buffer_credits = nblocks;
+ handle->h_total_credits = nblocks;
handle->h_ref = 1;
return handle;
}
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
- gfp_t gfp_mask, unsigned int type,
- unsigned int line_no)
+ int revoke_records, gfp_t gfp_mask,
+ unsigned int type, unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -455,6 +486,8 @@
return handle;
}
+ nblocks += DIV_ROUND_UP(revoke_records,
+ journal->j_revoke_records_per_block);
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
@@ -470,6 +503,7 @@
rsv_handle->h_journal = journal;
handle->h_rsv_handle = rsv_handle;
}
+ handle->h_revoke_credits = revoke_records;
err = start_this_handle(journal, handle, gfp_mask);
if (err < 0) {
@@ -490,7 +524,7 @@
/**
- * handle_t *jbd2_journal_start() - Obtain a new handle.
+ * jbd2_journal_start() - Obtain a new handle.
* @journal: Journal to start transaction on.
* @nblocks: number of block buffer we might modify
*
@@ -498,7 +532,7 @@
* modified buffers in the log. We block until the log can guarantee
* that much space. Additionally, if rsv_blocks > 0, we also create another
* handle with rsv_blocks reserved blocks in the journal. This handle is
- * is stored in h_rsv_handle. It is not attached to any particular transaction
+ * stored in h_rsv_handle. It is not attached to any particular transaction
* and thus doesn't block transaction commit. If the caller uses this reserved
* handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
* on the parent handle will dispose the reserved one. Reserved handle has to
@@ -510,22 +544,34 @@
*/
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
+ return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
+static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
+{
+ journal_t *journal = handle->h_journal;
+
+ WARN_ON(!handle->h_reserved);
+ sub_reserved_credits(journal, handle->h_total_credits);
+ if (t)
+ atomic_sub(handle->h_total_credits, &t->t_outstanding_credits);
+}
+
void jbd2_journal_free_reserved(handle_t *handle)
{
journal_t *journal = handle->h_journal;
- WARN_ON(!handle->h_reserved);
- sub_reserved_credits(journal, handle->h_buffer_credits);
+ /* Get j_state_lock to pin running transaction if it exists */
+ read_lock(&journal->j_state_lock);
+ __jbd2_journal_unreserve_handle(handle, journal->j_running_transaction);
+ read_unlock(&journal->j_state_lock);
jbd2_free_handle(handle);
}
EXPORT_SYMBOL(jbd2_journal_free_reserved);
/**
- * int jbd2_journal_start_reserved() - start reserved handle
+ * jbd2_journal_start_reserved() - start reserved handle
* @handle: handle to start
* @type: for handle statistics
* @line_no: for handle statistics
@@ -573,15 +619,16 @@
handle->h_line_no = line_no;
trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
handle->h_transaction->t_tid, type,
- line_no, handle->h_buffer_credits);
+ line_no, handle->h_total_credits);
return 0;
}
EXPORT_SYMBOL(jbd2_journal_start_reserved);
/**
- * int jbd2_journal_extend() - extend buffer credits.
+ * jbd2_journal_extend() - extend buffer credits.
* @handle: handle to 'extend'
* @nblocks: nr blocks to try to extend by.
+ * @revoke_records: number of revoke records to try to extend by.
*
* Some transactions, such as large extends and truncates, can be done
* atomically all at once or in several stages. The operation requests
@@ -598,7 +645,7 @@
* return code < 0 implies an error
* return code > 0 implies normal transaction-full status.
*/
-int jbd2_journal_extend(handle_t *handle, int nblocks)
+int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -620,6 +667,12 @@
goto error_out;
}
+ nblocks += DIV_ROUND_UP(
+ handle->h_revoke_credits_requested + revoke_records,
+ journal->j_revoke_records_per_block) -
+ DIV_ROUND_UP(
+ handle->h_revoke_credits_requested,
+ journal->j_revoke_records_per_block);
spin_lock(&transaction->t_handle_lock);
wanted = atomic_add_return(nblocks,
&transaction->t_outstanding_credits);
@@ -631,22 +684,16 @@
goto unlock;
}
- if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
- jbd2_log_space_left(journal)) {
- jbd_debug(3, "denied handle %p %d blocks: "
- "insufficient log space\n", handle, nblocks);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- goto unlock;
- }
-
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
transaction->t_tid,
handle->h_type, handle->h_line_no,
- handle->h_buffer_credits,
+ handle->h_total_credits,
nblocks);
- handle->h_buffer_credits += nblocks;
+ handle->h_total_credits += nblocks;
handle->h_requested_credits += nblocks;
+ handle->h_revoke_credits += revoke_records;
+ handle->h_revoke_credits_requested += revoke_records;
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -657,11 +704,56 @@
return result;
}
+static void stop_this_handle(handle_t *handle)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ int revokes;
+
+ J_ASSERT(journal_current_handle() == handle);
+ J_ASSERT(atomic_read(&transaction->t_updates) > 0);
+ current->journal_info = NULL;
+ /*
+ * Subtract necessary revoke descriptor blocks from handle credits. We
+ * take care to account only for revoke descriptor blocks the
+ * transaction will really need as large sequences of transactions with
+ * small numbers of revokes are relatively common.
+ */
+ revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
+ if (revokes) {
+ int t_revokes, revoke_descriptors;
+ int rr_per_blk = journal->j_revoke_records_per_block;
+
+ WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
+ > handle->h_total_credits);
+ t_revokes = atomic_add_return(revokes,
+ &transaction->t_outstanding_revokes);
+ revoke_descriptors =
+ DIV_ROUND_UP(t_revokes, rr_per_blk) -
+ DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
+ handle->h_total_credits -= revoke_descriptors;
+ }
+ atomic_sub(handle->h_total_credits,
+ &transaction->t_outstanding_credits);
+ if (handle->h_rsv_handle)
+ __jbd2_journal_unreserve_handle(handle->h_rsv_handle,
+ transaction);
+ if (atomic_dec_and_test(&transaction->t_updates))
+ wake_up(&journal->j_wait_updates);
+
+ rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+ /*
+ * Scope of the GFP_NOFS context is over here and so we can restore the
+ * original alloc context.
+ */
+ memalloc_nofs_restore(handle->saved_alloc_context);
+}
/**
- * int jbd2_journal_restart() - restart a handle .
+ * jbd2__journal_restart() - restart a handle .
* @handle: handle to restart
* @nblocks: nr credits requested
+ * @revoke_records: number of revoke record credits requested
* @gfp_mask: memory allocation flags (for start_this_handle)
*
* Restart a handle for a multi-transaction filesystem
@@ -674,56 +766,48 @@
* credits. We preserve reserved handle if there's any attached to the
* passed in handle.
*/
-int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
+int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
+ gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
tid_t tid;
- int need_to_start, ret;
+ int need_to_start;
+ int ret;
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
journal = transaction->t_journal;
+ tid = transaction->t_tid;
/*
* First unlink the handle from its current transaction, and start the
* commit on that.
*/
- J_ASSERT(atomic_read(&transaction->t_updates) > 0);
- J_ASSERT(journal_current_handle() == handle);
-
- read_lock(&journal->j_state_lock);
- spin_lock(&transaction->t_handle_lock);
- atomic_sub(handle->h_buffer_credits,
- &transaction->t_outstanding_credits);
- if (handle->h_rsv_handle) {
- sub_reserved_credits(journal,
- handle->h_rsv_handle->h_buffer_credits);
- }
- if (atomic_dec_and_test(&transaction->t_updates))
- wake_up(&journal->j_wait_updates);
- tid = transaction->t_tid;
- spin_unlock(&transaction->t_handle_lock);
- handle->h_transaction = NULL;
- current->journal_info = NULL;
-
jbd_debug(2, "restarting handle %p\n", handle);
+ stop_this_handle(handle);
+ handle->h_transaction = NULL;
+
+ /*
+ * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
+ * get rid of pointless j_state_lock traffic like this.
+ */
+ read_lock(&journal->j_state_lock);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
-
- rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
- handle->h_buffer_credits = nblocks;
- /*
- * Restore the original nofs context because the journal restart
- * is basically the same thing as journal stop and start.
- * start_this_handle will start a new nofs context.
- */
- memalloc_nofs_restore(handle->saved_alloc_context);
+ handle->h_total_credits = nblocks +
+ DIV_ROUND_UP(revoke_records,
+ journal->j_revoke_records_per_block);
+ handle->h_revoke_credits = revoke_records;
ret = start_this_handle(journal, handle, gfp_mask);
+ trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
+ ret ? 0 : handle->h_transaction->t_tid,
+ handle->h_type, handle->h_line_no,
+ handle->h_total_credits);
return ret;
}
EXPORT_SYMBOL(jbd2__journal_restart);
@@ -731,12 +815,12 @@
int jbd2_journal_restart(handle_t *handle, int nblocks)
{
- return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+ return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
}
EXPORT_SYMBOL(jbd2_journal_restart);
/**
- * void jbd2_journal_lock_updates () - establish a transaction barrier.
+ * jbd2_journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
*
* This locks out any further updates from being started, and blocks
@@ -795,7 +879,7 @@
}
/**
- * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
+ * jbd2_journal_unlock_updates () - release barrier
* @journal: Journal to release the barrier on.
*
* Release a transaction barrier obtained with jbd2_journal_lock_updates().
@@ -879,7 +963,7 @@
start_lock = jiffies;
lock_buffer(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
/* If it takes too long to lock the buffer, trace it */
time_lock = jbd2_time_diff(start_lock, jiffies);
@@ -929,7 +1013,7 @@
error = -EROFS;
if (is_handle_aborted(handle)) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
goto out;
}
error = 0;
@@ -993,7 +1077,7 @@
*/
if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
goto repeat;
}
@@ -1014,7 +1098,7 @@
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS | __GFP_NOFAIL);
goto repeat;
@@ -1033,7 +1117,7 @@
jh->b_next_transaction = transaction;
done:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* If we are about to journal a buffer, then any revoke pending on it is
@@ -1103,7 +1187,8 @@
}
/**
- * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
+ * jbd2_journal_get_write_access() - notify intent to modify a buffer
+ * for metadata (not data) update.
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
*
@@ -1147,7 +1232,7 @@
* unlocked buffer beforehand. */
/**
- * int jbd2_journal_get_create_access () - notify intent to use newly created bh
+ * jbd2_journal_get_create_access () - notify intent to use newly created bh
* @handle: transaction to new buffer to
* @bh: new buffer.
*
@@ -1175,7 +1260,7 @@
* that case: the transaction must have deleted the buffer for it to be
* reused here.
*/
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
jh->b_transaction == NULL ||
(jh->b_transaction == journal->j_committing_transaction &&
@@ -1210,7 +1295,7 @@
jh->b_next_transaction = transaction;
spin_unlock(&journal->j_list_lock);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* akpm: I added this. ext3_alloc_branch can pick up new indirect
@@ -1227,7 +1312,7 @@
}
/**
- * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
+ * jbd2_journal_get_undo_access() - Notify intent to modify metadata with
* non-rewindable consequences
* @handle: transaction
* @bh: buffer to undo
@@ -1281,13 +1366,13 @@
committed_data = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS|__GFP_NOFAIL);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (!jh->b_committed_data) {
/* Copy out the current buffer contents into the
* preserved, committed copy. */
JBUFFER_TRACE(jh, "generate b_committed data");
if (!committed_data) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
goto repeat;
}
@@ -1295,7 +1380,7 @@
committed_data = NULL;
memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
jbd2_journal_put_journal_head(jh);
if (unlikely(committed_data))
@@ -1304,7 +1389,7 @@
}
/**
- * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * jbd2_journal_set_triggers() - Add triggers for commit writeout
* @bh: buffer to trigger on
* @type: struct jbd2_buffer_trigger_type containing the trigger(s).
*
@@ -1346,7 +1431,7 @@
}
/**
- * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
+ * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
*
@@ -1394,18 +1479,18 @@
* crucial to catch bugs so let's do a reliable check until the
* lockless handling is fully proven.
*/
- if (jh->b_transaction != transaction &&
- jh->b_next_transaction != transaction) {
- jbd_lock_bh_state(bh);
+ if (data_race(jh->b_transaction != transaction &&
+ jh->b_next_transaction != transaction)) {
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
jh->b_next_transaction == transaction);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
if (jh->b_modified == 1) {
/* If it's in our transaction it must be in BJ_Metadata list. */
- if (jh->b_transaction == transaction &&
- jh->b_jlist != BJ_Metadata) {
- jbd_lock_bh_state(bh);
+ if (data_race(jh->b_transaction == transaction &&
+ jh->b_jlist != BJ_Metadata)) {
+ spin_lock(&jh->b_state_lock);
if (jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata)
pr_err("JBD2: assertion failure: h_type=%u "
@@ -1415,13 +1500,13 @@
jh->b_jlist);
J_ASSERT_JH(jh, jh->b_transaction != transaction ||
jh->b_jlist == BJ_Metadata);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
goto out;
}
journal = transaction->t_journal;
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (jh->b_modified == 0) {
/*
@@ -1429,12 +1514,12 @@
* of the transaction. This needs to be done
* once a transaction -bzzz
*/
- if (handle->h_buffer_credits <= 0) {
+ if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
ret = -ENOSPC;
goto out_unlock_bh;
}
jh->b_modified = 1;
- handle->h_buffer_credits--;
+ handle->h_total_credits--;
}
/*
@@ -1507,14 +1592,14 @@
__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
JBUFFER_TRACE(jh, "exit");
return ret;
}
/**
- * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
+ * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
* @handle: transaction handle
* @bh: bh to 'forget'
*
@@ -1530,7 +1615,7 @@
* Allow this call even if the handle has aborted --- it may be part of
* the caller's cleanup after an abort.
*/
-int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
+int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -1545,18 +1630,20 @@
BUFFER_TRACE(bh, "entry");
- jbd_lock_bh_state(bh);
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh) {
+ __bforget(bh);
+ return 0;
+ }
- if (!buffer_jbd(bh))
- goto not_jbd;
- jh = bh2jh(bh);
+ spin_lock(&jh->b_state_lock);
/* Critical error: attempting to delete a bitmap buffer, maybe?
* Don't do any jbd operations, and return an error. */
if (!J_EXPECT_JH(jh, !jh->b_committed_data,
"inconsistent data on disk")) {
err = -EIO;
- goto not_jbd;
+ goto drop;
}
/* keep track of whether or not this transaction modified us */
@@ -1604,10 +1691,7 @@
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
} else {
__jbd2_journal_unfile_buffer(jh);
- if (!buffer_jbd(bh)) {
- spin_unlock(&journal->j_list_lock);
- goto not_jbd;
- }
+ jbd2_journal_put_journal_head(jh);
}
spin_unlock(&journal->j_list_lock);
} else if (jh->b_transaction) {
@@ -1649,7 +1733,7 @@
if (!jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "belongs to none transaction");
spin_unlock(&journal->j_list_lock);
- goto not_jbd;
+ goto drop;
}
/*
@@ -1659,7 +1743,7 @@
if (!buffer_dirty(bh)) {
__jbd2_journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
- goto not_jbd;
+ goto drop;
}
/*
@@ -1672,24 +1756,19 @@
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
spin_unlock(&journal->j_list_lock);
}
-
- jbd_unlock_bh_state(bh);
- __brelse(bh);
drop:
+ __brelse(bh);
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
if (drop_reserve) {
/* no need to reserve log space for this block -bzzz */
- handle->h_buffer_credits++;
+ handle->h_total_credits++;
}
return err;
-
-not_jbd:
- jbd_unlock_bh_state(bh);
- __bforget(bh);
- goto drop;
}
/**
- * int jbd2_journal_stop() - complete a transaction
+ * jbd2_journal_stop() - complete a transaction
* @handle: transaction to complete.
*
* All done for a particular handle.
@@ -1712,45 +1791,34 @@
tid_t tid;
pid_t pid;
+ if (--handle->h_ref > 0) {
+ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ handle->h_ref);
+ if (is_handle_aborted(handle))
+ return -EIO;
+ return 0;
+ }
if (!transaction) {
/*
- * Handle is already detached from the transaction so
- * there is nothing to do other than decrease a refcount,
- * or free the handle if refcount drops to zero
+ * Handle is already detached from the transaction so there is
+ * nothing to do other than free the handle.
*/
- if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
- handle->h_ref);
- return err;
- } else {
- if (handle->h_rsv_handle)
- jbd2_free_handle(handle->h_rsv_handle);
- goto free_and_exit;
- }
+ memalloc_nofs_restore(handle->saved_alloc_context);
+ goto free_and_exit;
}
journal = transaction->t_journal;
-
- J_ASSERT(journal_current_handle() == handle);
+ tid = transaction->t_tid;
if (is_handle_aborted(handle))
err = -EIO;
- else
- J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-
- if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
- handle->h_ref);
- return err;
- }
jbd_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
- transaction->t_tid,
- handle->h_type, handle->h_line_no,
+ tid, handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
handle->h_sync, handle->h_requested_credits,
(handle->h_requested_credits -
- handle->h_buffer_credits));
+ handle->h_total_credits));
/*
* Implement synchronous transaction batching. If the handle
@@ -1810,19 +1878,13 @@
if (handle->h_sync)
transaction->t_synchronous_commit = 1;
- current->journal_info = NULL;
- atomic_sub(handle->h_buffer_credits,
- &transaction->t_outstanding_credits);
/*
* If the handle is marked SYNC, we need to set another commit
- * going! We also want to force a commit if the current
- * transaction is occupying too much of the log, or if the
- * transaction is too old now.
+ * going! We also want to force a commit if the transaction is too
+ * old now.
*/
if (handle->h_sync ||
- (atomic_read(&transaction->t_outstanding_credits) >
- journal->j_max_transaction_buffers) ||
time_after_eq(jiffies, transaction->t_expires)) {
/* Do this even for aborted journals: an abort still
* completes the commit thread, it just doesn't write
@@ -1831,7 +1893,7 @@
jbd_debug(2, "transaction too old, requesting commit for "
"handle %p\n", handle);
/* This is non-blocking */
- jbd2_log_start_commit(journal, transaction->t_tid);
+ jbd2_log_start_commit(journal, tid);
/*
* Special case: JBD2_SYNC synchronous updates require us
@@ -1842,31 +1904,19 @@
}
/*
- * Once we drop t_updates, if it goes to zero the transaction
- * could start committing on us and eventually disappear. So
- * once we do this, we must not dereference transaction
- * pointer again.
+ * Once stop_this_handle() drops t_updates, the transaction could start
+ * committing on us and eventually disappear. So we must not
+ * dereference transaction pointer again after calling
+ * stop_this_handle().
*/
- tid = transaction->t_tid;
- if (atomic_dec_and_test(&transaction->t_updates)) {
- wake_up(&journal->j_wait_updates);
- if (journal->j_barrier_count)
- wake_up(&journal->j_wait_transaction_locked);
- }
-
- rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
+ stop_this_handle(handle);
if (wait_for_commit)
err = jbd2_log_wait_commit(journal, tid);
- if (handle->h_rsv_handle)
- jbd2_journal_free_reserved(handle->h_rsv_handle);
free_and_exit:
- /*
- * Scope of the GFP_NOFS context is over here and so we can restore the
- * original alloc context.
- */
- memalloc_nofs_restore(handle->saved_alloc_context);
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
jbd2_free_handle(handle);
return err;
}
@@ -1884,7 +1934,7 @@
*
* j_list_lock is held.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1908,7 +1958,7 @@
*
* Called with j_list_lock held, and the journal may not be locked.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1940,7 +1990,7 @@
transaction_t *transaction;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
transaction = jh->b_transaction;
if (transaction)
assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1977,11 +2027,10 @@
}
/*
- * Remove buffer from all transactions.
+ * Remove buffer from all transactions. The caller is responsible for dropping
+ * the jh reference that belonged to the transaction.
*
* Called with bh_state lock and j_list_lock
- *
- * jh and bh may be already freed when this function returns.
*/
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
@@ -1990,7 +2039,6 @@
__jbd2_journal_temp_unlink_buffer(jh);
jh->b_transaction = NULL;
- jbd2_journal_put_journal_head(jh);
}
void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
@@ -1999,18 +2047,19 @@
/* Get reference so that buffer cannot be freed before we unlock it */
get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
__jbd2_journal_unfile_buffer(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
__brelse(bh);
}
/*
* Called from jbd2_journal_try_to_free_buffers().
*
- * Called under jbd_lock_bh_state(bh)
+ * Called under jh->b_state_lock
*/
static void
__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
@@ -2037,13 +2086,9 @@
}
/**
- * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
+ * jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
- * code to release the buffers.
- *
*
* For all the buffers on this page,
* if they are fully written out ordered data, move them onto BUF_CLEAN
@@ -2074,8 +2119,7 @@
*
* Return 0 on failure, 1 on success
*/
-int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t gfp_mask)
+int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -2098,10 +2142,10 @@
if (!jh)
continue;
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
__journal_try_to_free_buffer(journal, bh);
+ spin_unlock(&jh->b_state_lock);
jbd2_journal_put_journal_head(jh);
- jbd_unlock_bh_state(bh);
if (buffer_jbd(bh))
goto busy;
@@ -2137,7 +2181,7 @@
*
* Called under j_list_lock.
*
- * Called under jbd_lock_bh_state(bh).
+ * Called under jh->b_state_lock.
*/
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
@@ -2158,6 +2202,7 @@
} else {
JBUFFER_TRACE(jh, "on running transaction");
__jbd2_journal_unfile_buffer(jh);
+ jbd2_journal_put_journal_head(jh);
}
return may_free;
}
@@ -2224,18 +2269,15 @@
* holding the page lock. --sct
*/
- if (!buffer_jbd(bh))
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh)
goto zap_buffer_unlocked;
/* OK, we have data buffer in journaled mode */
write_lock(&journal->j_state_lock);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
- jh = jbd2_journal_grab_journal_head(bh);
- if (!jh)
- goto zap_buffer_no_jh;
-
/*
* We cannot remove the buffer from checkpoint lists until the
* transaction adding inode to orphan list (let's call it T)
@@ -2314,10 +2356,10 @@
* for commit and try again.
*/
if (partial_page) {
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
return -EBUSY;
}
/*
@@ -2331,10 +2373,10 @@
if (journal->j_running_transaction && buffer_jbddirty(bh))
jh->b_next_transaction = journal->j_running_transaction;
jh->b_modified = 0;
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
return 0;
} else {
/* Good, the buffer belongs to the running transaction.
@@ -2358,11 +2400,10 @@
* here.
*/
jh->b_modified = 0;
- jbd2_journal_put_journal_head(jh);
-zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
zap_buffer_unlocked:
clear_buffer_dirty(bh);
J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2376,7 +2417,7 @@
}
/**
- * void jbd2_journal_invalidatepage()
+ * jbd2_journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
* @offset: start of the range to invalidate
@@ -2449,7 +2490,7 @@
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
assert_spin_locked(&transaction->t_journal->j_list_lock);
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
@@ -2511,11 +2552,11 @@
void jbd2_journal_file_buffer(struct journal_head *jh,
transaction_t *transaction, int jlist)
{
- jbd_lock_bh_state(jh2bh(jh));
+ spin_lock(&jh->b_state_lock);
spin_lock(&transaction->t_journal->j_list_lock);
__jbd2_journal_file_buffer(jh, transaction, jlist);
spin_unlock(&transaction->t_journal->j_list_lock);
- jbd_unlock_bh_state(jh2bh(jh));
+ spin_unlock(&jh->b_state_lock);
}
/*
@@ -2525,23 +2566,25 @@
* buffer on that transaction's metadata list.
*
* Called under j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh))
+ * Called under jh->b_state_lock
*
- * jh and bh may be already free when this function returns
+ * When this function returns true, there's no next transaction to refile to
+ * and the caller has to drop jh reference through
+ * jbd2_journal_put_journal_head().
*/
-void __jbd2_journal_refile_buffer(struct journal_head *jh)
+bool __jbd2_journal_refile_buffer(struct journal_head *jh)
{
int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
if (jh->b_transaction)
assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
/* If the buffer is now unused, just drop it. */
if (jh->b_next_transaction == NULL) {
__jbd2_journal_unfile_buffer(jh);
- return;
+ return true;
}
/*
@@ -2576,6 +2619,7 @@
if (was_dirty)
set_buffer_jbddirty(bh);
+ return false;
}
/*
@@ -2586,16 +2630,15 @@
*/
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
- struct buffer_head *bh = jh2bh(jh);
+ bool drop;
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
- __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ drop = __jbd2_journal_refile_buffer(jh);
+ spin_unlock(&jh->b_state_lock);
spin_unlock(&journal->j_list_lock);
- __brelse(bh);
+ if (drop)
+ jbd2_journal_put_journal_head(jh);
}
/*