Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 16ecb76..8a6cc60 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -8,6 +8,7 @@
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
+#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
@@ -24,9 +25,12 @@
* LOG_INODE_EXISTS means to log just enough to recreate the inode
* during log replay
*/
-#define LOG_INODE_ALL 0
-#define LOG_INODE_EXISTS 1
-#define LOG_OTHER_INODE 2
+enum {
+ LOG_INODE_ALL,
+ LOG_INODE_EXISTS,
+ LOG_OTHER_INODE,
+ LOG_OTHER_INODE_ALL,
+};
/*
* directory trouble cases
@@ -80,10 +84,12 @@
* The last stage is to deal with directories and links and extents
* and all the other fun semantics
*/
-#define LOG_WALK_PIN_ONLY 0
-#define LOG_WALK_REPLAY_INODES 1
-#define LOG_WALK_REPLAY_DIR_INDEX 2
-#define LOG_WALK_REPLAY_ALL 3
+enum {
+ LOG_WALK_PIN_ONLY,
+ LOG_WALK_REPLAY_INODES,
+ LOG_WALK_REPLAY_DIR_INDEX,
+ LOG_WALK_REPLAY_ALL,
+};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
@@ -138,7 +144,7 @@
mutex_lock(&root->log_mutex);
if (root->log_root) {
- if (btrfs_need_log_full_commit(fs_info, trans)) {
+ if (btrfs_need_log_full_commit(trans)) {
ret = -EAGAIN;
goto out;
}
@@ -187,10 +193,6 @@
{
int ret = -ENOENT;
- smp_mb();
- if (!root->log_root)
- return -ENOENT;
-
mutex_lock(&root->log_mutex);
if (root->log_root) {
ret = 0;
@@ -205,14 +207,11 @@
* until you call btrfs_end_log_trans() or it makes any future
* log transactions wait until you call btrfs_end_log_trans()
*/
-int btrfs_pin_log_trans(struct btrfs_root *root)
+void btrfs_pin_log_trans(struct btrfs_root *root)
{
- int ret = -ENOENT;
-
mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
- return ret;
}
/*
@@ -227,6 +226,17 @@
}
}
+static int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+ return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
+ buf->start + buf->len - 1);
+}
+
+static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+ filemap_fdatawait_range(buf->pages[0]->mapping,
+ buf->start, buf->start + buf->len - 1);
+}
/*
* the walk control struct is used to pass state down the chain when
@@ -306,7 +316,7 @@
if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
if (wc->pin && btrfs_header_level(eb) == 0)
- ret = btrfs_exclude_logged_extents(fs_info, eb);
+ ret = btrfs_exclude_logged_extents(eb);
if (wc->write)
btrfs_write_tree_block(eb);
if (wc->wait)
@@ -335,7 +345,6 @@
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
u32 item_size;
u64 saved_i_size = 0;
@@ -456,10 +465,9 @@
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
- btrfs_truncate_item(fs_info, path, item_size, 1);
+ btrfs_truncate_item(path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(fs_info, path,
- item_size - found_size);
+ btrfs_extend_item(path, item_size - found_size);
} else if (ret) {
return ret;
}
@@ -498,7 +506,7 @@
ino_size != 0) {
struct btrfs_map_token token;
- btrfs_init_map_token(&token);
+ btrfs_init_map_token(&token, dst_eb);
btrfs_set_token_inode_size(dst_eb, dst_item,
ino_size, &token);
}
@@ -696,9 +704,11 @@
goto out;
if (ins.objectid > 0) {
+ struct btrfs_ref ref = { 0 };
u64 csum_start;
u64 csum_end;
LIST_HEAD(ordered_sums);
+
/*
* is this extent already allocated in the extent
* allocation tree? If so, just add a reference
@@ -706,10 +716,13 @@
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
if (ret == 0) {
- ret = btrfs_inc_extent_ref(trans, root,
- ins.objectid, ins.offset,
- 0, root->root_key.objectid,
+ btrfs_init_generic_ref(&ref,
+ BTRFS_ADD_DELAYED_REF,
+ ins.objectid, ins.offset, 0);
+ btrfs_init_data_ref(&ref,
+ root->root_key.objectid,
key->objectid, offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
} else {
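The hunk above switches to the generic delayed-ref descriptor: a struct btrfs_ref is filled in first and then handed to btrfs_inc_extent_ref(). A minimal sketch of the pattern, assuming the v5.4 signatures and using placeholder names (bytenr, num_bytes, ref_root, owner_ino, file_offset); illustrative only, not part of the patch:

	struct btrfs_ref ref = { 0 };

	/* Which extent and what action: bytenr/length, parent 0 for a non-shared ref. */
	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, num_bytes, 0);
	/* Who owns it: the owning tree's objectid, inode number and file offset. */
	btrfs_init_data_ref(&ref, ref_root, owner_ino, file_offset);
	ret = btrfs_inc_extent_ref(trans, &ref);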
@@ -955,7 +968,7 @@
if (btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0],
ref_objectid,
- name, namelen, NULL))
+ name, namelen))
match = 1;
goto out;
@@ -1147,7 +1160,7 @@
}
btrfs_release_path(path);
- /* look for a conflicing name */
+ /* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
if (di && !IS_ERR(di)) {
@@ -1254,12 +1267,12 @@
goto out;
if (key->type == BTRFS_INODE_EXTREF_KEY)
- ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
- parent_id, name,
- namelen, NULL);
+ ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
+ parent_id, name,
+ namelen);
else
- ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
- namelen, NULL);
+ ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
+ name, namelen);
if (!ret) {
struct inode *dir;
@@ -1321,18 +1334,78 @@
goto out;
}
if (key.type == BTRFS_INODE_EXTREF_KEY)
- ret = btrfs_find_name_in_ext_backref(path->nodes[0],
- path->slots[0], parent_id,
- name, namelen, NULL);
+ ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0], parent_id, name, namelen);
else
- ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, namelen, NULL);
+ ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+ name, namelen);
out:
btrfs_free_path(path);
return ret;
}
+static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct inode *dir, struct inode *inode, const char *name,
+ int namelen, u64 ref_index)
+{
+ struct btrfs_dir_item *dir_item;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct inode *other_inode = NULL;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ dir_item = btrfs_lookup_dir_item(NULL, root, path,
+ btrfs_ino(BTRFS_I(dir)),
+ name, namelen, 0);
+ if (!dir_item) {
+ btrfs_release_path(path);
+ goto add_link;
+ } else if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ goto out;
+ }
+
+ /*
+ * Our inode's dentry collides with the dentry of another inode which is
+ * in the log but not yet processed since it has a higher inode number.
+ * So delete that other dentry.
+ */
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
+ btrfs_release_path(path);
+ other_inode = read_one_inode(root, key.objectid);
+ if (!other_inode) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+ name, namelen);
+ if (ret)
+ goto out;
+ /*
+ * If we dropped the link count to 0, bump it so that later the iput()
+ * on the inode will not free it. We will fixup the link count later.
+ */
+ if (other_inode->i_nlink == 0)
+ inc_nlink(other_inode);
+
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ goto out;
+add_link:
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, namelen, 0, ref_index);
+out:
+ iput(other_inode);
+ btrfs_free_path(path);
+
+ return ret;
+}
+
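add_link() above handles a log replay corner case: the name being replayed for our inode may already exist in the directory but point at a different inode, one that is also in the log and has a higher inode number, so it has not been processed yet. A condensed sketch of what the helper does in that case, with hypothetical inode numbers; illustrative only, not part of the patch:

	/*
	 * dir entry "name" currently points at inode 260 (not yet replayed);
	 * the back reference being replayed says inode 257 should own "name".
	 *
	 * 1) btrfs_unlink_inode() removes "name" -> 260; if that drops the link
	 *    count to 0, inc_nlink() keeps the inode alive past the iput().
	 * 2) btrfs_run_delayed_items() flushes the pending dir index deletion.
	 * 3) btrfs_add_link() then adds "name" -> 257.
	 *
	 * Inode 260 gets its correct names back when its own back references
	 * are replayed, and link counts are fixed up at the end of replay.
	 */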
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1469,9 +1542,8 @@
goto out;
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen, 0, ref_index);
+ ret = add_link(trans, root, dir, inode, name, namelen,
+ ref_index);
if (ret)
goto out;
@@ -2666,8 +2738,8 @@
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
- clean_tree_block(fs_info, next);
+ btrfs_set_lock_blocking_write(next);
+ btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
} else {
@@ -2750,8 +2822,8 @@
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
- clean_tree_block(fs_info, next);
+ btrfs_set_lock_blocking_write(next);
+ btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
} else {
@@ -2832,8 +2904,8 @@
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
- clean_tree_block(fs_info, next);
+ btrfs_set_lock_blocking_write(next);
+ btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
} else {
@@ -2860,7 +2932,8 @@
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *log)
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
@@ -2868,10 +2941,10 @@
if (log->log_transid == 1) {
/* insert root item on the first sync */
ret = btrfs_insert_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
} else {
ret = btrfs_update_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
}
return ret;
}
@@ -2969,6 +3042,7 @@
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
@@ -3008,7 +3082,7 @@
}
/* bail out if we need to do a full commit */
- if (btrfs_need_log_full_commit(fs_info, trans)) {
+ if (btrfs_need_log_full_commit(trans)) {
ret = -EAGAIN;
mutex_unlock(&root->log_mutex);
goto out;
@@ -3027,12 +3101,26 @@
if (ret) {
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, ret);
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
mutex_unlock(&root->log_mutex);
goto out;
}
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
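The new_root_item snapshot introduced above exists because, as the comment explains, the root item is finalized under root->log_mutex but only written into the log root tree later, under log_root_tree->log_mutex. A rough illustration of the ordering, based on the comments in this hunk and the update_log_root() call further down; not part of the patch:

	/*
	 * under root->log_mutex:
	 *     btrfs_set_root_node(&log->root_item, log->node);
	 *     memcpy(&new_root_item, &log->root_item, ...);  consistent snapshot
	 *
	 * root->log_mutex is dropped; a later fsync of the same root may modify
	 * log->root_item again.
	 *
	 * under log_root_tree->log_mutex (this task is counted in log_writers,
	 * so the log_root_tree commit is held open):
	 *     update_log_root(trans, log, &new_root_item);
	 *
	 * Updating straight from log->root_item at that point could publish a
	 * root item pointing at blocks this task never wrote out.
	 */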
@@ -3056,9 +3144,15 @@
mutex_unlock(&log_root_tree->log_mutex);
- ret = update_log_root(trans, log);
-
mutex_lock(&log_root_tree->log_mutex);
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
+
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
/* atomic_dec_and_test implies a barrier */
cond_wake_up_nomb(&log_root_tree->log_writer_wait);
@@ -3069,7 +3163,7 @@
list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
@@ -3115,7 +3209,7 @@
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (btrfs_need_log_full_commit(fs_info, trans)) {
+ if (btrfs_need_log_full_commit(trans)) {
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3128,7 +3222,7 @@
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
if (ret) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
@@ -3138,7 +3232,7 @@
ret = btrfs_wait_tree_log_extents(log_root_tree,
EXTENT_NEW | EXTENT_DIRTY);
if (ret) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
@@ -3152,7 +3246,7 @@
mutex_unlock(&log_root_tree->log_mutex);
/*
- * nobody else is going to jump in and write the the ctree
+ * Nobody else is going to jump in and write the ctree
* super here because the log_commit atomic below is protecting
* us. We must be called with a transaction handle pinning
* the running transaction open, so a full commit can't hop
@@ -3160,7 +3254,7 @@
*/
ret = write_all_supers(fs_info, 1);
if (ret) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
}
@@ -3204,8 +3298,6 @@
struct btrfs_root *log)
{
int ret;
- u64 start;
- u64 end;
struct walk_control wc = {
.free = 1,
.process_func = process_one_buffer
@@ -3219,18 +3311,8 @@
btrfs_handle_fs_error(log->fs_info, ret, NULL);
}
- while (1) {
- ret = find_first_extent_bit(&log->dirty_log_pages,
- 0, &start, &end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
- NULL);
- if (ret)
- break;
-
- clear_extent_bits(&log->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
- }
-
+ clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
free_extent_buffer(log->node);
kfree(log);
}
@@ -3259,6 +3341,30 @@
}
/*
+ * Check if an inode was logged in the current transaction. We can't always rely
+ * on an inode's logged_trans value, because it's an in-memory only field and
+ * therefore not persisted. This means that its value is lost if the inode gets
+ * evicted and loaded again from disk (in which case it has a value of 0, and
+ * certainly it is smaller then any possible transaction ID), when that happens
+ * the full_sync flag is set in the inode's runtime flags, so on that case we
+ * assume eviction happened and ignore the logged_trans value, assuming the
+ * worst case, that the inode was logged before in the current transaction.
+ */
+static bool inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ if (inode->logged_trans == trans->transid)
+ return true;
+
+ if (inode->last_trans == trans->transid &&
+ test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
+ return true;
+
+ return false;
+}
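inode_logged() above is used by the two deletion paths below in place of a direct logged_trans comparison. Since logged_trans is in-memory only, an evicted-and-reloaded inode reads 0 there, and the old comparison would wrongly conclude it was never logged, so the deletion would be skipped in the log. A hedged sketch of the sequence this guards against, inferred from the comment above; illustrative only:

	/*
	 * fsync(file)            file and its directory are logged;
	 *                        dir->logged_trans is set in memory only
	 * (eviction + reload)    dir->logged_trans == 0, full_sync bit set
	 * unlink(dir/file)       btrfs_del_dir_entries_in_log() must still
	 *                        remove the entry from the existing log tree;
	 *                        inode_logged() assumes the worst case
	 *                        ("may have been logged") instead of bailing
	 */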
+
+/*
* If both a file and directory are logged, and unlinks or renames are
* mixed in, we have a few interesting corners:
*
@@ -3292,7 +3398,7 @@
int bytes_del = 0;
u64 dir_ino = btrfs_ino(dir);
- if (dir->logged_trans < trans->transid)
+ if (!inode_logged(trans, dir))
return 0;
ret = join_running_log_trans(root);
@@ -3376,7 +3482,7 @@
out_unlock:
mutex_unlock(&dir->log_mutex);
if (ret == -ENOSPC) {
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(trans);
ret = 0;
} else if (ret < 0)
btrfs_abort_transaction(trans, ret);
@@ -3392,12 +3498,11 @@
const char *name, int name_len,
struct btrfs_inode *inode, u64 dirid)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log;
u64 index;
int ret;
- if (inode->logged_trans < trans->transid)
+ if (!inode_logged(trans, inode))
return 0;
ret = join_running_log_trans(root);
@@ -3410,7 +3515,7 @@
dirid, &index);
mutex_unlock(&inode->log_mutex);
if (ret == -ENOSPC) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
ret = 0;
} else if (ret < 0 && ret != -ENOENT)
btrfs_abort_transaction(trans, ret);
@@ -3532,9 +3637,16 @@
}
btrfs_release_path(path);
- /* find the first key from this transaction again */
+ /*
+ * Find the first key from this transaction again. See the note for
+ * log_new_dir_dentries: if we're logging a directory recursively we
+ * won't be holding its i_mutex, which means we can modify the directory
+ * while we're logging it. If we remove an entry between our first
+ * search and this search we'll not find the key again and can just
+ * bail.
+ */
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (WARN_ON(ret != 0))
+ if (ret != 0)
goto done;
/*
@@ -3721,6 +3833,8 @@
found_key.type = 0;
ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
&start_slot);
+ if (ret < 0)
+ break;
ret = btrfs_del_items(trans, log, path, start_slot,
path->slots[0] - start_slot + 1);
@@ -3746,7 +3860,7 @@
{
struct btrfs_map_token token;
- btrfs_init_map_token(&token);
+ btrfs_init_map_token(&token, leaf);
if (log_inode_only) {
/* set the generation to zero so the recover code
@@ -4114,6 +4228,7 @@
*last_extent, 0,
0, len, 0, len,
0, 0, 0);
+ *last_extent += len;
}
}
}
@@ -4205,8 +4320,6 @@
if (ret)
return ret;
- btrfs_init_map_token(&token);
-
ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
em->start + em->len, NULL, 0, 1,
sizeof(*fi), &extent_inserted);
@@ -4224,6 +4337,7 @@
return ret;
}
leaf = path->nodes[0];
+ btrfs_init_map_token(&token, leaf);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
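In this version btrfs_init_map_token() binds the token to one extent buffer, so the initialization moves to after the leaf holding the file extent item is known. A minimal sketch of the resulting pattern, assuming the v5.4-era token helpers that still take the leaf explicitly; illustrative only, not part of the patch:

	struct btrfs_map_token token;

	btrfs_init_map_token(&token, leaf);   /* token caches mappings for this leaf */
	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, &token);
	btrfs_set_token_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG, &token);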
@@ -4386,7 +4500,6 @@
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
- u64 logged_start, logged_end;
u64 test_gen;
int ret = 0;
int num = 0;
@@ -4395,8 +4508,6 @@
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
- logged_start = start;
- logged_end = end;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
/*
@@ -4437,11 +4548,6 @@
em->start >= i_size_read(&inode->vfs_inode))
continue;
- if (em->start < logged_start)
- logged_start = em->start;
- if ((em->start + em->len - 1) > logged_end)
- logged_end = em->start + em->len - 1;
-
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -4504,6 +4610,19 @@
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
*size_ret = btrfs_inode_size(path->nodes[0], item);
+ /*
+ * If the in-memory inode's i_size is smaller than the inode
+ * size stored in the btree, return the inode's i_size, so
+ * that we get a correct inode size after replaying the log
+ * when, before a power failure, we had a shrinking truncate
+ * followed by addition of a new name (rename / new hard link).
+ * Otherwise return the inode size from the btree, to avoid
+ * data loss when replaying a log due to previously doing a
+ * write that expands the inode's size and logging a new name
+ * immediately after.
+ */
+ if (*size_ret > inode->vfs_inode.i_size)
+ *size_ret = inode->vfs_inode.i_size;
}
btrfs_release_path(path);
@@ -4665,15 +4784,8 @@
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(leaf, extent);
- ASSERT(len == i_size ||
- (len == fs_info->sectorsize &&
- btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE) ||
- (len < i_size && i_size < fs_info->sectorsize));
+ BTRFS_FILE_EXTENT_INLINE)
return 0;
- }
len = btrfs_file_extent_num_bytes(leaf, extent);
/* Last extent goes beyond i_size, no need to log a hole. */
@@ -4740,7 +4852,7 @@
const int slot,
const struct btrfs_key *key,
struct btrfs_inode *inode,
- u64 *other_ino)
+ u64 *other_ino, u64 *other_parent)
{
int ret;
struct btrfs_path *search_path;
@@ -4803,8 +4915,13 @@
btrfs_dir_item_key_to_cpu(search_path->nodes[0],
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
- ret = 1;
- *other_ino = di_key.objectid;
+ if (di_key.objectid != key->objectid) {
+ ret = 1;
+ *other_ino = di_key.objectid;
+ *other_parent = parent;
+ } else {
+ ret = 0;
+ }
} else {
ret = -EAGAIN;
}
@@ -4824,6 +4941,144 @@
return ret;
}
+struct btrfs_ino_list {
+ u64 ino;
+ u64 parent;
+ struct list_head list;
+};
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ u64 ino, u64 parent)
+{
+ struct btrfs_ino_list *ino_elem;
+ LIST_HEAD(inode_list);
+ int ret = 0;
+
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+
+ while (!list_empty(&inode_list)) {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct inode *inode;
+
+ ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
+ list);
+ ino = ino_elem->ino;
+ parent = ino_elem->parent;
+ list_del(&ino_elem->list);
+ kfree(ino_elem);
+ if (ret)
+ continue;
+
+ btrfs_release_path(path);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ /*
+ * If the other inode that had a conflicting dir entry was
+ * deleted in the current transaction, we need to log its parent
+ * directory.
+ */
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ if (ret == -ENOENT) {
+ key.objectid = parent;
+ inode = btrfs_iget(fs_info->sb, &key, root,
+ NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ } else {
+ ret = btrfs_log_inode(trans, root,
+ BTRFS_I(inode),
+ LOG_OTHER_INODE_ALL,
+ 0, LLONG_MAX, ctx);
+ btrfs_add_delayed_iput(inode);
+ }
+ }
+ continue;
+ }
+ /*
+ * We are safe logging the other inode without acquiring its
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
+ * are safe against concurrent renames of the other inode as
+ * well because during a rename we pin the log and update the
+ * log with the new name before we unpin it.
+ */
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ continue;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ btrfs_add_delayed_iput(inode);
+ continue;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_check_ref_name_override(leaf, slot, &key,
+ BTRFS_I(inode), &other_ino,
+ &other_parent);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem) {
+ ret = -ENOMEM;
+ break;
+ }
+ ino_elem->ino = other_ino;
+ ino_elem->parent = other_parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+ ret = 0;
+ }
+ path->slots[0]++;
+ }
+ btrfs_add_delayed_iput(inode);
+ }
+
+ return ret;
+}
+
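log_conflicting_inodes() above replaces the old one-off handling of a conflicting directory entry with a worklist: each conflicting inode discovered while logging is queued, and a conflicting inode already deleted in this transaction causes its parent directory to be logged with the new LOG_OTHER_INODE_ALL mode instead. Note the error handling: once ret is non-zero the loop keeps running only to pop and kfree() the remaining entries. A stripped-down sketch of that drain idiom; illustrative only, not part of the patch:

	while (!list_empty(&inode_list)) {
		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
					    list);
		list_del(&ino_elem->list);
		kfree(ino_elem);
		if (ret)
			continue;	/* keep freeing entries, do no more work */

		/* ... log this inode, possibly queueing more conflicts ... */
	}
	return ret;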
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4863,6 +5118,7 @@
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
+ bool recursive_logging = false;
path = btrfs_alloc_path();
if (!path)
@@ -4908,8 +5164,12 @@
return ret;
}
- if (inode_only == LOG_OTHER_INODE) {
- inode_only = LOG_INODE_EXISTS;
+ if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
+ recursive_logging = true;
+ if (inode_only == LOG_OTHER_INODE)
+ inode_only = LOG_INODE_EXISTS;
+ else
+ inode_only = LOG_INODE_ALL;
mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&inode->log_mutex);
@@ -5004,20 +5264,19 @@
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid) {
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
+ u64 other_parent = 0;
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0], &min_key, inode,
- &other_ino);
+ &other_ino, &other_parent);
if (ret < 0) {
err = ret;
goto out_unlock;
} else if (ret > 0 && ctx &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
- struct btrfs_key inode_key;
- struct inode *other_inode;
-
if (ins_nr > 0) {
ins_nr++;
} else {
@@ -5033,43 +5292,13 @@
goto out_unlock;
}
ins_nr = 0;
- btrfs_release_path(path);
- inode_key.objectid = other_ino;
- inode_key.type = BTRFS_INODE_ITEM_KEY;
- inode_key.offset = 0;
- other_inode = btrfs_iget(fs_info->sb,
- &inode_key, root,
- NULL);
- /*
- * If the other inode that had a conflicting dir
- * entry was deleted in the current transaction,
- * we don't need to do more work nor fallback to
- * a transaction commit.
- */
- if (other_inode == ERR_PTR(-ENOENT)) {
- goto next_key;
- } else if (IS_ERR(other_inode)) {
- err = PTR_ERR(other_inode);
- goto out_unlock;
- }
- /*
- * We are safe logging the other inode without
- * acquiring its i_mutex as long as we log with
- * the LOG_INODE_EXISTS mode. We're safe against
- * concurrent renames of the other inode as well
- * because during a rename we pin the log and
- * update the log with the new name before we
- * unpin it.
- */
- err = btrfs_log_inode(trans, root,
- BTRFS_I(other_inode),
- LOG_OTHER_INODE, 0, LLONG_MAX,
- ctx);
- iput(other_inode);
+
+ err = log_conflicting_inodes(trans, root, path,
+ ctx, other_ino, other_parent);
if (err)
goto out_unlock;
- else
- goto next_key;
+ btrfs_release_path(path);
+ goto next_key;
}
}
@@ -5232,9 +5461,19 @@
}
}
+ /*
+ * Don't update last_log_commit if we logged that an inode exists after
+ * it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode, then
+ * the inode gets evicted after all delalloc was flushed, then we log
+ * it exists (due to a rename for example) and then fsync it. This last
+ * fsync would do nothing (not logging the extents previously written).
+ */
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
- inode->last_log_commit = inode->last_sub_trans;
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -5272,7 +5511,7 @@
* Make sure any commits to the log are forced to be full
* commits.
*/
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
ret = true;
}
mutex_unlock(&inode->log_mutex);
@@ -5294,7 +5533,6 @@
{
int ret = 0;
struct dentry *old_parent = NULL;
- struct btrfs_inode *orig_inode = inode;
/*
* for regular files, if its inode is already on disk, we don't
@@ -5314,16 +5552,6 @@
}
while (1) {
- /*
- * If we are logging a directory then we start with our inode,
- * not our parent's inode, so we need to skip setting the
- * logged_trans so that further down in the log code we don't
- * think this inode has already been logged.
- */
- if (inode != orig_inode)
- inode->logged_trans = trans->transid;
- smp_mb();
-
if (btrfs_must_commit_transaction(trans, inode)) {
ret = 1;
break;
@@ -5478,7 +5706,7 @@
}
if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
break;
}
@@ -5490,7 +5718,7 @@
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto next_dir_inode;
if (ctx->log_new_dentries) {
@@ -5637,7 +5865,7 @@
if (!ret && ctx && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
- iput(dir_inode);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
@@ -5649,6 +5877,190 @@
return ret;
}
+static int log_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+
+ while (true) {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ const u64 last_committed = fs_info->last_trans_committed;
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key search_key;
+ struct inode *inode;
+ int ret = 0;
+
+ btrfs_release_path(path);
+
+ search_key.objectid = found_key.offset;
+ search_key.type = BTRFS_INODE_ITEM_KEY;
+ search_key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &search_key, root, NULL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (BTRFS_I(inode)->generation > last_committed)
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ LOG_INODE_EXISTS,
+ 0, LLONG_MAX, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ return ret;
+
+ if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
+ break;
+
+ search_key.type = BTRFS_INODE_REF_KEY;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return -ENOENT;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != search_key.objectid ||
+ found_key.type != BTRFS_INODE_REF_KEY)
+ return -ENOENT;
+ }
+ return 0;
+}
+
+static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct dentry *old_parent = NULL;
+ struct super_block *sb = inode->vfs_inode.i_sb;
+ int ret = 0;
+
+ while (true) {
+ if (!parent || d_really_is_negative(parent) ||
+ sb != parent->d_sb)
+ break;
+
+ inode = BTRFS_I(d_inode(parent));
+ if (root != inode->root)
+ break;
+
+ if (inode->generation > fs_info->last_trans_committed) {
+ ret = btrfs_log_inode(trans, root, inode,
+ LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ if (ret)
+ break;
+ }
+ if (IS_ROOT(parent))
+ break;
+
+ parent = dget_parent(parent);
+ dput(old_parent);
+ old_parent = parent;
+ }
+ dput(old_parent);
+
+ return ret;
+}
+
+static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_path *path;
+ struct btrfs_key search_key;
+ int ret;
+
+ /*
+ * For a single hard link case, go through a fast path that does not
+ * need to iterate the fs/subvolume tree.
+ */
+ if (inode->vfs_inode.i_nlink < 2)
+ return log_new_ancestors_fast(trans, inode, parent, ctx);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ path->slots[0]++;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key found_key;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != ino ||
+ found_key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ /*
+ * Don't deal with extended references because they are rare
+ * cases and too complex to deal with (we would need to keep
+ * track of which subitem we are processing for each item in
+ * this loop, etc). So just return some error to fallback to
+ * a transaction commit.
+ */
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = -EMLINK;
+ goto out;
+ }
+
+ /*
+ * Logging ancestors needs to do more searches on the fs/subvol
+ * tree, so it releases the path as needed to avoid deadlocks.
+ * Keep track of the last inode ref key and resume from that key
+ * after logging all new ancestors for the current hard link.
+ */
+ memcpy(&search_key, &found_key, sizeof(search_key));
+
+ ret = log_new_ancestors(trans, root, path, ctx);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ goto again;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
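log_all_new_ancestors() above picks one of two strategies: with a single hard link it walks up the dentry chain (log_new_ancestors_fast()); with several links it iterates the inode's BTRFS_INODE_REF_KEY items in the subvolume tree and, for each referencing directory, walks upward with log_new_ancestors(), falling back to a transaction commit (-EMLINK) when extended refs are present. A rough sketch of the key space scanned for a file with two hard links, using hypothetical inode numbers; illustrative only, not part of the patch:

	/*
	 * (261, BTRFS_INODE_ITEM_KEY,   0)
	 * (261, BTRFS_INODE_REF_KEY,  256)   link in directory 256
	 * (261, BTRFS_INODE_REF_KEY,  259)   link in directory 259
	 *
	 * For each INODE_REF item, found_key.offset is the parent directory's
	 * objectid; log_new_ancestors() keeps looking up parents until it
	 * reaches BTRFS_FIRST_FREE_OBJECTID (the subvolume's root directory).
	 */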
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -5666,11 +6078,9 @@
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct super_block *sb;
- struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = fs_info->last_trans_committed;
bool log_dentries = false;
- struct btrfs_inode *orig_inode = inode;
sb = inode->vfs_inode.i_sb;
@@ -5776,40 +6186,22 @@
* and has a link count of 2.
*/
if (inode->last_unlink_trans > last_committed) {
- ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+ ret = btrfs_log_all_parents(trans, inode, ctx);
if (ret)
goto end_trans;
}
- while (1) {
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- break;
+ ret = log_all_new_ancestors(trans, inode, parent, ctx);
+ if (ret)
+ goto end_trans;
- inode = BTRFS_I(d_inode(parent));
- if (root != inode->root)
- break;
-
- if (inode->generation > last_committed) {
- ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
- if (ret)
- goto end_trans;
- }
- if (IS_ROOT(parent))
- break;
-
- parent = dget_parent(parent);
- dput(old_parent);
- old_parent = parent;
- }
if (log_dentries)
- ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+ ret = log_new_dir_dentries(trans, root, inode, ctx);
else
ret = 0;
end_trans:
- dput(old_parent);
if (ret < 0) {
- btrfs_set_log_full_commit(fs_info, trans);
+ btrfs_set_log_full_commit(trans);
ret = 1;
}
@@ -5858,7 +6250,7 @@
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
.process_func = process_one_buffer,
- .stage = 0,
+ .stage = LOG_WALK_PIN_ONLY,
};
path = btrfs_alloc_path();
@@ -6036,7 +6428,6 @@
* if this directory was already logged any new
* names for this file/dir will get recorded
*/
- smp_mb();
if (dir->logged_trans == trans->transid)
return;