Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8a6cc60..8ea4b3d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -167,13 +167,14 @@
if (ret)
goto out;
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
}
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -193,6 +194,9 @@
{
int ret = -ENOENT;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+ return ret;
+
mutex_lock(&root->log_mutex);
if (root->log_root) {
ret = 0;
@@ -715,7 +719,9 @@
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
- if (ret == 0) {
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
ins.objectid, ins.offset, 0);
@@ -808,7 +814,8 @@
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_del_csums(trans, fs_info,
+ ret = btrfs_del_csums(trans,
+ fs_info->csum_root,
sums->bytenr,
sums->len);
if (!ret)
@@ -1770,6 +1777,7 @@
break;
if (ret == 1) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -1782,17 +1790,19 @@
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ break;
btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
ret = fixup_inode_link_count(trans, root, inode);
iput(inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
@@ -1801,8 +1811,6 @@
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
btrfs_release_path(path);
return ret;
}
@@ -1841,8 +1849,6 @@
ret = btrfs_update_inode(trans, root, inode);
} else if (ret == -EEXIST) {
ret = 0;
- } else {
- BUG(); /* Logic Error */
}
iput(inode);
@@ -3135,29 +3141,17 @@
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
- atomic_inc(&log_root_tree->log_batch);
- atomic_inc(&log_root_tree->log_writers);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
- mutex_unlock(&log_root_tree->log_mutex);
-
- mutex_lock(&log_root_tree->log_mutex);
-
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
* open until we drop the log_mutex.
*/
ret = update_log_root(trans, log, &new_root_item);
-
- if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /* atomic_dec_and_test implies a barrier */
- cond_wake_up_nomb(&log_root_tree->log_writer_wait);
- }
-
if (ret) {
if (!list_empty(&root_log_ctx.list))
list_del_init(&root_log_ctx.list);
@@ -3203,8 +3197,6 @@
root_log_ctx.log_transid - 1);
}
- wait_for_writer(log_root_tree);
-
/*
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
@@ -3326,6 +3318,7 @@
if (root->log_root) {
free_log_tree(trans, root->log_root);
root->log_root = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
}
return 0;
}
@@ -3481,11 +3474,13 @@
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (ret == -ENOSPC) {
+ if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, ret);
+ err = 0;
+ } else if (err < 0 && err != -ENOENT) {
+ /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ btrfs_abort_transaction(trans, err);
+ }
btrfs_end_log_trans(root);
@@ -3645,6 +3640,7 @@
* search and this search we'll not find the key again and can just
* bail.
*/
+search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret != 0)
goto done;
@@ -3664,6 +3660,13 @@
if (min_key.objectid != ino || min_key.type != key_type)
goto done;
+
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
+
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
if (ret) {
@@ -3927,10 +3930,32 @@
return 0;
}
+static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_ordered_sum *sums)
+{
+ int ret;
+
+ /*
+ * Due to extent cloning, we might have logged a csum item that covers a
+ * subrange of a cloned extent, and later we can end up logging a csum
+ * item for a larger subrange of the same extent or the entire range.
+ * This would leave csum items in the log tree that cover the same range
+ * and break the searches for checksums in the log tree, resulting in
+ * some checksums missing in the fs/subvolume tree. So just delete (or
+ * trim and adjust) any existing csum items in the log for this range.
+ */
+ ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
+ if (ret)
+ return ret;
+
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+}
+
static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *dst_path,
- struct btrfs_path *src_path, u64 *last_extent,
+ struct btrfs_path *src_path,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
@@ -3941,7 +3966,6 @@
struct btrfs_file_extent_item *extent;
struct btrfs_inode_item *inode_item;
struct extent_buffer *src = src_path->nodes[0];
- struct btrfs_key first_key, last_key, key;
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -3949,9 +3973,6 @@
int i;
struct list_head ordered_sums;
int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
- bool has_extents = false;
- bool need_find_last_extent = true;
- bool done = false;
INIT_LIST_HEAD(&ordered_sums);
@@ -3960,8 +3981,6 @@
if (!ins_data)
return -ENOMEM;
- first_key.objectid = (u64)-1;
-
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
@@ -3982,9 +4001,6 @@
src_offset = btrfs_item_ptr_offset(src, start_slot + i);
- if (i == nr - 1)
- last_key = ins_keys[i];
-
if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
inode_item = btrfs_item_ptr(dst_path->nodes[0],
dst_path->slots[0],
@@ -3998,20 +4014,6 @@
src_offset, ins_sizes[i]);
}
- /*
- * We set need_find_last_extent here in case we know we were
- * processing other items and then walk into the first extent in
- * the inode. If we don't hit an extent then nothing changes,
- * we'll do the last search the next time around.
- */
- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
- has_extents = true;
- if (first_key.objectid == (u64)-1)
- first_key = ins_keys[i];
- } else {
- need_find_last_extent = false;
- }
-
/* take a reference on file data extents so that truncates
* or deletes of this inode don't have to relog the inode
* again
@@ -4049,11 +4051,8 @@
fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
- if (ret) {
- btrfs_release_path(dst_path);
- kfree(ins_data);
- return ret;
- }
+ if (ret)
+ break;
}
}
}
@@ -4066,178 +4065,16 @@
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
- ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log, sums);
+ ret = log_csums(trans, log, sums);
list_del(&sums->list);
kfree(sums);
}
- if (!has_extents)
- return ret;
-
- if (need_find_last_extent && *last_extent == first_key.offset) {
- /*
- * We don't have any leafs between our current one and the one
- * we processed before that can have file extent items for our
- * inode (and have a generation number smaller than our current
- * transaction id).
- */
- need_find_last_extent = false;
- }
-
- /*
- * Because we use btrfs_search_forward we could skip leaves that were
- * not modified and then assume *last_extent is valid when it really
- * isn't. So back up to the previous leaf and read the end of the last
- * extent before we go and fill in holes.
- */
- if (need_find_last_extent) {
- u64 len;
-
- ret = btrfs_prev_leaf(inode->root, src_path);
- if (ret < 0)
- return ret;
- if (ret)
- goto fill_holes;
- if (src_path->slots[0])
- src_path->slots[0]--;
- src = src_path->nodes[0];
- btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
- if (key.objectid != btrfs_ino(inode) ||
- key.type != BTRFS_EXTENT_DATA_KEY)
- goto fill_holes;
- extent = btrfs_item_ptr(src, src_path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(src, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(src, extent);
- *last_extent = ALIGN(key.offset + len,
- fs_info->sectorsize);
- } else {
- len = btrfs_file_extent_num_bytes(src, extent);
- *last_extent = key.offset + len;
- }
- }
-fill_holes:
- /* So we did prev_leaf, now we need to move to the next leaf, but a few
- * things could have happened
- *
- * 1) A merge could have happened, so we could currently be on a leaf
- * that holds what we were copying in the first place.
- * 2) A split could have happened, and now not all of the items we want
- * are on the same leaf.
- *
- * So we need to adjust how we search for holes, we need to drop the
- * path and re-search for the first extent key we found, and then walk
- * forward until we hit the last one we copied.
- */
- if (need_find_last_extent) {
- /* btrfs_prev_leaf could return 1 without releasing the path */
- btrfs_release_path(src_path);
- ret = btrfs_search_slot(NULL, inode->root, &first_key,
- src_path, 0, 0);
- if (ret < 0)
- return ret;
- ASSERT(ret == 0);
- src = src_path->nodes[0];
- i = src_path->slots[0];
- } else {
- i = start_slot;
- }
-
- /*
- * Ok so here we need to go through and fill in any holes we may have
- * to make sure that holes are punched for those areas in case they had
- * extents previously.
- */
- while (!done) {
- u64 offset, len;
- u64 extent_end;
-
- if (i >= btrfs_header_nritems(src_path->nodes[0])) {
- ret = btrfs_next_leaf(inode->root, src_path);
- if (ret < 0)
- return ret;
- ASSERT(ret == 0);
- src = src_path->nodes[0];
- i = 0;
- need_find_last_extent = true;
- }
-
- btrfs_item_key_to_cpu(src, &key, i);
- if (!btrfs_comp_cpu_keys(&key, &last_key))
- done = true;
- if (key.objectid != btrfs_ino(inode) ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
- i++;
- continue;
- }
- extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(src, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(src, extent);
- extent_end = ALIGN(key.offset + len,
- fs_info->sectorsize);
- } else {
- len = btrfs_file_extent_num_bytes(src, extent);
- extent_end = key.offset + len;
- }
- i++;
-
- if (*last_extent == key.offset) {
- *last_extent = extent_end;
- continue;
- }
- offset = *last_extent;
- len = key.offset - *last_extent;
- ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
- offset, 0, 0, len, 0, len, 0, 0, 0);
- if (ret)
- break;
- *last_extent = extent_end;
- }
-
- /*
- * Check if there is a hole between the last extent found in our leaf
- * and the first extent in the next leaf. If there is one, we need to
- * log an explicit hole so that at replay time we can punch the hole.
- */
- if (ret == 0 &&
- key.objectid == btrfs_ino(inode) &&
- key.type == BTRFS_EXTENT_DATA_KEY &&
- i == btrfs_header_nritems(src_path->nodes[0])) {
- ret = btrfs_next_leaf(inode->root, src_path);
- need_find_last_extent = true;
- if (ret > 0) {
- ret = 0;
- } else if (ret == 0) {
- btrfs_item_key_to_cpu(src_path->nodes[0], &key,
- src_path->slots[0]);
- if (key.objectid == btrfs_ino(inode) &&
- key.type == BTRFS_EXTENT_DATA_KEY &&
- *last_extent < key.offset) {
- const u64 len = key.offset - *last_extent;
-
- ret = btrfs_insert_file_extent(trans, log,
- btrfs_ino(inode),
- *last_extent, 0,
- 0, len, 0, len,
- 0, 0, 0);
- *last_extent += len;
- }
- }
- }
- /*
- * Need to let the callers know we dropped the path so they should
- * re-search.
- */
- if (!ret && need_find_last_extent)
- ret = 1;
return ret;
}
@@ -4292,7 +4129,7 @@
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log_root, sums);
+ ret = log_csums(trans, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4402,7 +4239,10 @@
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
struct btrfs_path *dst_path = NULL;
- u64 last_extent = (u64)-1;
+ bool dropped_extents = false;
+ u64 truncate_offset = i_size;
+ struct extent_buffer *leaf;
+ int slot;
int ins_nr = 0;
int start_slot;
int ret;
@@ -4417,15 +4257,48 @@
if (ret < 0)
goto out;
+ /*
+ * We must check if there is a prealloc extent that starts before the
+ * i_size and crosses the i_size boundary. This is to ensure later we
+ * truncate down to the end of that extent and not to the i_size, as
+ * otherwise we end up losing part of the prealloc extent after a log
+ * replay and with an implicit hole if there is another prealloc extent
+ * that starts at an offset beyond i_size.
+ */
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ struct btrfs_file_extent_item *ei;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 extent_end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, ei);
+
+ if (extent_end > i_size)
+ truncate_offset = extent_end;
+ }
+ } else {
+ ret = 0;
+ }
+
while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
+ leaf = path->nodes[0];
+ slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, start_slot,
- ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0);
if (ret < 0)
goto out;
ins_nr = 0;
@@ -4449,8 +4322,7 @@
path->slots[0]++;
continue;
}
- if (last_extent == (u64)-1) {
- last_extent = key.offset;
+ if (!dropped_extents) {
/*
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
@@ -4459,11 +4331,12 @@
ret = btrfs_truncate_inode_items(trans,
root->log_root,
&inode->vfs_inode,
- i_size,
+ truncate_offset,
BTRFS_EXTENT_DATA_KEY);
} while (ret == -EAGAIN);
if (ret)
goto out;
+ dropped_extents = true;
}
if (ins_nr == 0)
start_slot = slot;
@@ -4478,7 +4351,7 @@
}
}
if (ins_nr > 0) {
- ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
if (ret > 0)
ret = 0;
@@ -4665,13 +4538,8 @@
if (slot >= nritems) {
if (ins_nr > 0) {
- u64 last_extent = 0;
-
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, start_slot,
- ins_nr, 1, 0);
- /* can't be 1, extent items aren't processed */
- ASSERT(ret <= 0);
+ start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -4695,13 +4563,8 @@
cond_resched();
}
if (ins_nr > 0) {
- u64 last_extent = 0;
-
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, start_slot,
- ins_nr, 1, 0);
- /* can't be 1, extent items aren't processed */
- ASSERT(ret <= 0);
+ start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
}
@@ -4710,100 +4573,119 @@
}
/*
- * If the no holes feature is enabled we need to make sure any hole between the
- * last extent and the i_size of our inode is explicitly marked in the log. This
- * is to make sure that doing something like:
- *
- * 1) create file with 128Kb of data
- * 2) truncate file to 64Kb
- * 3) truncate file to 256Kb
- * 4) fsync file
- * 5) <crash/power failure>
- * 6) mount fs and trigger log replay
- *
- * Will give us a file with a size of 256Kb, the first 64Kb of data match what
- * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
- * file correspond to a hole. The presence of explicit holes in a log tree is
- * what guarantees that log replay will remove/adjust file extent items in the
- * fs/subvol tree.
- *
- * Here we do not need to care about holes between extents, that is already done
- * by copy_items(). We also only need to do this in the full sync path, where we
- * lookup for extents from the fs/subvol tree only. In the fast path case, we
- * lookup the list of modified extent maps and if any represents a hole, we
- * insert a corresponding extent representing a hole in the log tree.
+ * When using the NO_HOLES feature if we punched a hole that causes the
+ * deletion of entire leafs or all the extent items of the first leaf (the one
+ * that contains the inode item and references) we may end up not processing
+ * any extents, because there are no leafs with a generation matching the
+ * current transaction that have extent items for our inode. So we need to find
+ * if any holes exist and then log them. We also need to log holes after any
+ * truncate operation that changes the inode's size.
*/
-static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode,
- struct btrfs_path *path)
+static int btrfs_log_holes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
struct btrfs_key key;
- u64 hole_start;
- u64 hole_size;
- struct extent_buffer *leaf;
- struct btrfs_root *log = root->log_root;
const u64 ino = btrfs_ino(inode);
const u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 prev_extent_end = 0;
+ int ret;
- if (!btrfs_fs_incompat(fs_info, NO_HOLES))
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
return 0;
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = (u64)-1;
+ key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- ASSERT(ret != 0);
if (ret < 0)
return ret;
- ASSERT(path->slots[0] > 0);
- path->slots[0]--;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
- /* inode does not have any extents */
- hole_start = 0;
- hole_size = i_size;
- } else {
+ while (true) {
struct btrfs_file_extent_item *extent;
+ struct extent_buffer *leaf = path->nodes[0];
u64 len;
- /*
- * If there's an extent beyond i_size, an explicit hole was
- * already inserted by copy_items().
- */
- if (key.offset >= i_size)
- return 0;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ /* We have a hole, log it. */
+ if (prev_extent_end < key.offset) {
+ const u64 hole_len = key.offset - prev_extent_end;
+
+ /*
+ * Release the path to avoid deadlocks with other code
+ * paths that search the root while holding locks on
+ * leafs from the log root.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_insert_file_extent(trans, root->log_root,
+ ino, prev_extent_end, 0,
+ 0, hole_len, 0, hole_len,
+ 0, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Search for the same key again in the root. Since it's
+ * an extent item and we are holding the inode lock, the
+ * key must still exist. If it doesn't just emit warning
+ * and return an error to fall back to a transaction
+ * commit.
+ */
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ if (WARN_ON(ret > 0))
+ return -ENOENT;
+ leaf = path->nodes[0];
+ }
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
-
if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE)
- return 0;
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_ram_bytes(leaf, extent);
+ prev_extent_end = ALIGN(key.offset + len,
+ fs_info->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ prev_extent_end = key.offset + len;
+ }
- len = btrfs_file_extent_num_bytes(leaf, extent);
- /* Last extent goes beyond i_size, no need to log a hole. */
- if (key.offset + len > i_size)
- return 0;
- hole_start = key.offset + len;
- hole_size = i_size - hole_start;
+ path->slots[0]++;
+ cond_resched();
}
- btrfs_release_path(path);
- /* Last extent ends at i_size. */
- if (hole_size == 0)
- return 0;
+ if (prev_extent_end < i_size) {
+ u64 hole_len;
- hole_size = ALIGN(hole_size, fs_info->sectorsize);
- ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
- hole_size, 0, hole_size, 0, 0, 0);
- return ret;
+ btrfs_release_path(path);
+ hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
+ ret = btrfs_insert_file_extent(trans, root->log_root,
+ ino, prev_extent_end, 0, 0,
+ hole_len, 0, hole_len,
+ 0, 0, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
}
/*
@@ -5008,6 +4890,50 @@
continue;
}
/*
+ * If the inode was already logged skip it - otherwise we can
+ * hit an infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the
+ * following inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ */
+ spin_lock(&BTRFS_I(inode)->lock);
+ /*
+ * Check the inode's logged_trans only instead of
+ * btrfs_inode_in_log(). This is because the last_log_commit of
+ * the inode is not updated when we only log that it exists and
+ * it has the full sync bit set (see btrfs_log_inode()).
+ */
+ if (BTRFS_I(inode)->logged_trans == trans->transid) {
+ spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_add_delayed_iput(inode);
+ continue;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+ /*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
* are safe against concurrent renames of the other inode as
@@ -5079,6 +5005,138 @@
return ret;
}
+static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_key *min_key,
+ const struct btrfs_key *max_key,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ const u64 logged_isize,
+ const bool recursive_logging,
+ const int inode_only,
+ struct btrfs_log_ctx *ctx,
+ bool *need_log_inode_item)
+{
+ struct btrfs_root *root = inode->root;
+ int ins_start_slot = 0;
+ int ins_nr = 0;
+ int ret;
+
+ while (1) {
+ ret = btrfs_search_forward(root, min_key, path, trans->transid);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+again:
+ /* Note, ins_nr might be > 0 here, cleanup outside the loop */
+ if (min_key->objectid != max_key->objectid)
+ break;
+ if (min_key->type > max_key->type)
+ break;
+
+ if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ *need_log_inode_item = false;
+
+ if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0], min_key, inode,
+ &other_ino, &other_parent);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0 && ctx &&
+ other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+ if (ins_nr > 0) {
+ ins_nr++;
+ } else {
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+ }
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr,
+ inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+
+ ret = log_conflicting_inodes(trans, root, path,
+ ctx, other_ino, other_parent);
+ if (ret)
+ return ret;
+ btrfs_release_path(path);
+ goto next_key;
+ }
+ }
+
+ /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+ if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ goto next_slot;
+ }
+
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+ ins_nr++;
+ goto next_slot;
+ } else if (!ins_nr) {
+ ins_start_slot = path->slots[0];
+ ins_nr = 1;
+ goto next_slot;
+ }
+
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+next_slot:
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key_to_cpu(path->nodes[0], min_key,
+ path->slots[0]);
+ goto again;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ btrfs_release_path(path);
+next_key:
+ if (min_key->offset < (u64)-1) {
+ min_key->offset++;
+ } else if (min_key->type < max_key->type) {
+ min_key->type++;
+ min_key->offset = 0;
+ } else {
+ break;
+ }
+ }
+ if (ins_nr)
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -5100,18 +5158,13 @@
const loff_t end,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
- u64 last_extent = 0;
int err = 0;
- int ret;
- int nritems;
- int ins_start_slot = 0;
- int ins_nr;
+ int ret = 0;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
@@ -5147,15 +5200,19 @@
max_key.offset = (u64)-1;
/*
- * Only run delayed items if we are a dir or a new file.
- * Otherwise commit the delayed inode only, which is needed in
- * order for the log replay code to mark inodes for link count
- * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ * Only run delayed items if we are a directory. We want to make sure
+ * all directory indexes hit the fs/subvolume tree so we can find them
+ * and figure out which index ranges have to be logged.
+ *
+ * Otherwise commit the delayed inode only if the full sync flag is set,
+ * as we want to make sure an up to date version is in the subvolume
+ * tree so copy_inode_items_to_log() / copy_items() can find it and copy
+ * it to the log tree. For a non full sync, we always log the inode item
+ * based on the in-memory struct btrfs_inode which is always up to date.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode) ||
- inode->generation > fs_info->last_trans_committed)
+ if (S_ISDIR(inode->vfs_inode.i_mode))
ret = btrfs_commit_inode_delayed_items(trans, inode);
- else
+ else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
ret = btrfs_commit_inode_delayed_inode(inode);
if (ret) {
@@ -5242,150 +5299,12 @@
goto out_unlock;
}
- while (1) {
- ins_nr = 0;
- ret = btrfs_search_forward(root, &min_key,
- path, trans->transid);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- if (ret != 0)
- break;
-again:
- /* note, ins_nr might be > 0 here, cleanup outside the loop */
- if (min_key.objectid != ino)
- break;
- if (min_key.type > max_key.type)
- break;
-
- if (min_key.type == BTRFS_INODE_ITEM_KEY)
- need_log_inode_item = false;
-
- if ((min_key.type == BTRFS_INODE_REF_KEY ||
- min_key.type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
- u64 other_ino = 0;
- u64 other_parent = 0;
-
- ret = btrfs_check_ref_name_override(path->nodes[0],
- path->slots[0], &min_key, inode,
- &other_ino, &other_parent);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- } else if (ret > 0 && ctx &&
- other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
- if (ins_nr > 0) {
- ins_nr++;
- } else {
- ins_nr = 1;
- ins_start_slot = path->slots[0];
- }
- ret = copy_items(trans, inode, dst_path, path,
- &last_extent, ins_start_slot,
- ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
-
- err = log_conflicting_inodes(trans, root, path,
- ctx, other_ino, other_parent);
- if (err)
- goto out_unlock;
- btrfs_release_path(path);
- goto next_key;
- }
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
- if (ins_nr == 0)
- goto next_slot;
- ret = copy_items(trans, inode, dst_path, path,
- &last_extent, ins_start_slot,
- ins_nr, inode_only, logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- if (ret) {
- btrfs_release_path(path);
- continue;
- }
- goto next_slot;
- }
-
- if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
- ins_nr++;
- goto next_slot;
- } else if (!ins_nr) {
- ins_start_slot = path->slots[0];
- ins_nr = 1;
- goto next_slot;
- }
-
- ret = copy_items(trans, inode, dst_path, path, &last_extent,
- ins_start_slot, ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- if (ret) {
- ins_nr = 0;
- btrfs_release_path(path);
- continue;
- }
- ins_nr = 1;
- ins_start_slot = path->slots[0];
-next_slot:
-
- nritems = btrfs_header_nritems(path->nodes[0]);
- path->slots[0]++;
- if (path->slots[0] < nritems) {
- btrfs_item_key_to_cpu(path->nodes[0], &min_key,
- path->slots[0]);
- goto again;
- }
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path,
- &last_extent, ins_start_slot,
- ins_nr, inode_only, logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ret = 0;
- ins_nr = 0;
- }
- btrfs_release_path(path);
-next_key:
- if (min_key.offset < (u64)-1) {
- min_key.offset++;
- } else if (min_key.type < max_key.type) {
- min_key.type++;
- min_key.offset = 0;
- } else {
- break;
- }
- }
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path, &last_extent,
- ins_start_slot, ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ret = 0;
- ins_nr = 0;
- }
+ err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ path, dst_path, logged_isize,
+ recursive_logging, inode_only, ctx,
+ &need_log_inode_item);
+ if (err)
+ goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
@@ -5396,7 +5315,7 @@
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_trailing_hole(trans, root, inode, path);
+ err = btrfs_log_holes(trans, root, inode, path);
if (err)
goto out_unlock;
}
@@ -5462,19 +5381,34 @@
}
/*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
+ * If we are logging that an ancestor inode exists as part of logging a
+ * new name from a link or rename operation, don't mark the inode as
+ * logged - otherwise if an explicit fsync is made against an ancestor,
+ * the fsync considers the inode in the log and doesn't sync the log,
+ * resulting in the ancestor missing after a power failure unless the
+ * log was synced as part of an fsync against any other unrelated inode.
+ * So keep it simple for this case and just don't flag the ancestors as
+ * logged.
*/
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
+ if (!ctx ||
+ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
+ &inode->vfs_inode != ctx->inode)) {
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists
+ * after it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode,
+ * then the inode gets evicted after all delalloc was flushed,
+ * then we log it exists (due to a rename for example) and then
+ * fsync it. This last fsync would do nothing (not logging the
+ * extents previously written).
+ */
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+ }
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -6314,9 +6248,28 @@
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
+
+ /*
+ * We didn't find the subvol, likely because it was
+ * deleted. This is ok, simply skip this log and go to
+ * the next one.
+ *
+ * We need to exclude the root because we can't have
+ * other log replays overwriting this log as we'll read
+ * it back in a few more times. This will keep our
+ * block from being modified, and we'll just bail for
+ * each subsequent pass.
+ */
+ if (ret == -ENOENT)
+ ret = btrfs_pin_extent_for_log_replay(fs_info,
+ log->node->start,
+ log->node->len);
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
+
+ if (!ret)
+ goto next;
btrfs_handle_fs_error(fs_info, ret,
"Couldn't read target root for tree log recovery.");
goto error;
@@ -6348,7 +6301,6 @@
&root->highest_objectid);
}
- key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
@@ -6356,9 +6308,10 @@
if (ret)
goto error;
-
+next:
if (found_key.offset == 0)
break;
+ key.offset = found_key.offset - 1;
}
btrfs_release_path(path);
@@ -6391,6 +6344,7 @@
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
}
@@ -6480,26 +6434,12 @@
/*
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
- *
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
- *
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+ struct dentry *parent)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- int ret;
+ struct btrfs_log_ctx ctx;
/*
* this will force the logging code to walk the dentry chain
@@ -6512,36 +6452,20 @@
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans <= fs_info->last_trans_committed &&
- (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
+ if (!inode_logged(trans, inode) &&
+ (!old_dir || !inode_logged(trans, old_dir)))
+ return;
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
- if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- }
-
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- return BTRFS_NEED_LOG_SYNC;
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, &ctx);
}