Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dac30b0..5addd1e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -31,9 +31,14 @@
static const struct btrfs_csums {
u16 size;
- const char *name;
+ const char name[10];
+ const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
+ [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
+ [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
+ [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
+ .driver = "blake2b-256" },
};
int btrfs_super_csum_size(const struct btrfs_super_block *s)
@@ -51,36 +56,28 @@
return btrfs_csums[csum_type].name;
}
+/*
+ * Return driver name if defined, otherwise the name that's also a valid driver
+ * name
+ */
+const char *btrfs_super_csum_driver(u16 csum_type)
+{
+ /* csum type is validated at mount time */
+ return btrfs_csums[csum_type].driver[0] ?
+ btrfs_csums[csum_type].driver :
+ btrfs_csums[csum_type].name;
+}
+
+size_t __attribute_const__ btrfs_get_num_csums(void)
+{
+ return ARRAY_SIZE(btrfs_csums);
+}
+
struct btrfs_path *btrfs_alloc_path(void)
{
return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
-/*
- * set all locked nodes in the path to blocking locks. This should
- * be done before scheduling
- */
-noinline void btrfs_set_path_blocking(struct btrfs_path *p)
-{
- int i;
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- if (!p->nodes[i] || !p->locks[i])
- continue;
- /*
- * If we currently have a spinning reader or writer lock this
- * will bump the count of blocking holders and drop the
- * spinlock.
- */
- if (p->locks[i] == BTRFS_READ_LOCK) {
- btrfs_set_lock_blocking_read(p->nodes[i]);
- p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
- btrfs_set_lock_blocking_write(p->nodes[i]);
- p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
- }
- }
-}
-
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
@@ -147,47 +144,10 @@
return eb;
}
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
-/* cowonly root (everything not a reference counted cow subvolume), just get
- * put onto a simple dirty list. transaction.c walks this to make sure they
- * get properly updated on disk.
+/*
+ * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
+ * just get put onto a simple dirty list. Transaction walks this list to make
+ * sure they get properly updated on disk.
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
@@ -226,9 +186,9 @@
int level;
struct btrfs_disk_key disk_key;
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -238,7 +198,8 @@
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
- &disk_key, level, buf->start, 0);
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -348,7 +309,6 @@
struct rb_root *tm_root;
struct rb_node *node;
struct rb_node *next;
- struct seq_list *cur_elem;
struct tree_mod_elem *tm;
u64 min_seq = (u64)-1;
u64 seq_putting = elem->seq;
@@ -360,18 +320,20 @@
list_del(&elem->list);
elem->seq = 0;
- list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
- if (cur_elem->seq < min_seq) {
- if (seq_putting > cur_elem->seq) {
- /*
- * blocker with lower sequence number exists, we
- * cannot remove anything from the log
- */
- write_unlock(&fs_info->tree_mod_log_lock);
- return;
- }
- min_seq = cur_elem->seq;
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *first;
+
+ first = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ if (seq_putting > first->seq) {
+ /*
+ * Blocker with lower sequence number exists, we
+ * cannot remove anything from the log.
+ */
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return;
}
+ min_seq = first->seq;
}
/*
@@ -869,12 +831,11 @@
struct extent_buffer *buf)
{
/*
- * Tree blocks not in reference counted trees and tree roots
- * are never shared. If a block was allocated after the last
- * snapshot and the block was not allocated by tree relocation,
- * we know the block is not shared.
+ * Tree blocks not in shareable trees and tree roots are never shared.
+ * If a block was allocated after the last snapshot and the block was
+ * not allocated by tree relocation, we know the block is not shared.
*/
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
buf != root->node && buf != root->commit_root &&
(btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item) ||
@@ -969,9 +930,7 @@
if (new_flags != 0) {
int level = btrfs_header_level(buf);
- ret = btrfs_set_disk_extent_flags(trans,
- buf->start,
- buf->len,
+ ret = btrfs_set_disk_extent_flags(trans, buf,
new_flags, level, 0);
if (ret)
return ret;
@@ -1002,7 +961,8 @@
const struct btrfs_disk_key *disk_key,
int level,
u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *ret;
@@ -1031,7 +991,7 @@
ret = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, disk_key, level,
- hint, empty_size);
+ hint, empty_size, nest);
trans->can_flush_pending_bgs = true;
return ret;
@@ -1054,7 +1014,8 @@
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size)
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -1069,9 +1030,9 @@
btrfs_assert_tree_locked(buf);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -1085,7 +1046,7 @@
parent_start = parent->start;
cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size);
+ level, search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1112,7 +1073,7 @@
return ret;
}
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
btrfs_tree_unlock(cow);
@@ -1128,7 +1089,7 @@
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
parent_start = buf->start;
- extent_buffer_get(cow);
+ atomic_inc(&cow->refs);
ret = tree_mod_log_insert_root(root->node, cow, 1);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
@@ -1247,7 +1208,7 @@
switch (tm->op) {
case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
- /* Fallthrough */
+ fallthrough;
case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case MOD_LOG_KEY_REMOVE:
btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1519,7 +1480,8 @@
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret)
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
@@ -1558,7 +1520,7 @@
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0);
+ parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -1578,6 +1540,22 @@
return 0;
}
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is same as CPU order and
+ * we can avoid the conversion.
+ */
+static int comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
/*
* compare two keys in a memcmp fashion
*/
@@ -1590,11 +1568,12 @@
return btrfs_comp_cpu_keys(&k1, k2);
}
+#endif
/*
* same as comp_keys only with two btrfs_key's
*/
-int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
if (k1->objectid > k2->objectid)
return 1;
@@ -1713,7 +1692,8 @@
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize));
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -1745,15 +1725,8 @@
{
int low = 0;
int high = max;
- int mid;
int ret;
- struct btrfs_disk_key *tmp = NULL;
- struct btrfs_disk_key unaligned;
- unsigned long offset;
- char *kaddr = NULL;
- unsigned long map_start = 0;
- unsigned long map_len = 0;
- int err;
+ const int key_size = sizeof(struct btrfs_disk_key);
if (low > high) {
btrfs_err(eb->fs_info,
@@ -1764,32 +1737,26 @@
}
while (low < high) {
+ unsigned long oip;
+ unsigned long offset;
+ struct btrfs_disk_key *tmp;
+ struct btrfs_disk_key unaligned;
+ int mid;
+
mid = (low + high) / 2;
offset = p + mid * item_size;
+ oip = offset_in_page(offset);
- if (!kaddr || offset < map_start ||
- (offset + sizeof(struct btrfs_disk_key)) >
- map_start + map_len) {
+ if (oip + key_size <= PAGE_SIZE) {
+ const unsigned long idx = offset >> PAGE_SHIFT;
+ char *kaddr = page_address(eb->pages[idx]);
- err = map_private_extent_buffer(eb, offset,
- sizeof(struct btrfs_disk_key),
- &kaddr, &map_start, &map_len);
-
- if (!err) {
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
- map_start);
- } else if (err == 1) {
- read_extent_buffer(eb, &unaligned,
- offset, sizeof(unaligned));
- tmp = &unaligned;
- } else {
- return err;
- }
-
+ tmp = (struct btrfs_disk_key *)(kaddr + oip);
} else {
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
- map_start);
+ read_extent_buffer(eb, &unaligned, offset, key_size);
+ tmp = &unaligned;
}
+
ret = comp_keys(tmp, key);
if (ret < 0)
@@ -1810,9 +1777,9 @@
* leaves vs nodes
*/
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot)
+ int *slot)
{
- if (level == 0)
+ if (btrfs_header_level(eb) == 0)
return generic_bin_search(eb,
offsetof(struct btrfs_leaf, items),
sizeof(struct btrfs_item),
@@ -1924,7 +1891,8 @@
btrfs_tree_lock(child);
btrfs_set_lock_blocking_write(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
@@ -1960,10 +1928,11 @@
left = NULL;
if (left) {
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left);
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1975,10 +1944,11 @@
right = NULL;
if (right) {
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right);
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -2067,7 +2037,7 @@
/* update the path */
if (left) {
if (btrfs_header_nritems(left) > orig_slot) {
- extent_buffer_get(left);
+ atomic_inc(&left->refs);
/* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
@@ -2138,7 +2108,7 @@
if (left) {
u32 left_nr;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
@@ -2146,7 +2116,8 @@
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left);
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
@@ -2192,7 +2163,7 @@
if (right) {
u32 right_nr;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
@@ -2201,7 +2172,7 @@
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right);
+ &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
@@ -2410,32 +2381,6 @@
}
/*
- * This releases any locks held in the path starting at level and
- * going all the way up to the root.
- *
- * btrfs_search_slot will keep the lock held on higher nodes in a few
- * corner cases, such as COW of the block at slot zero in the node. This
- * ignores those rules, and it should only be called when there are no
- * more updates to be done higher up in the tree.
- */
-noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
-{
- int i;
-
- if (path->keep_locks)
- return;
-
- for (i = level; i < BTRFS_MAX_LEVEL; i++) {
- if (!path->nodes[i])
- continue;
- if (!path->locks[i])
- continue;
- btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
- path->locks[i] = 0;
- }
-}
-
-/*
* helper function for btrfs_search_slot. The goal is to find a block
* in cache without setting the path to blocking. If we find the block
* we return zero and the path is unchanged.
@@ -2451,16 +2396,15 @@
struct btrfs_fs_info *fs_info = root->fs_info;
u64 blocknr;
u64 gen;
- struct extent_buffer *b = *eb_ret;
struct extent_buffer *tmp;
struct btrfs_key first_key;
int ret;
int parent_level;
- blocknr = btrfs_node_blockptr(b, slot);
- gen = btrfs_node_ptr_generation(b, slot);
- parent_level = btrfs_header_level(b);
- btrfs_node_key_to_cpu(b, &first_key, slot);
+ blocknr = btrfs_node_blockptr(*eb_ret, slot);
+ gen = btrfs_node_ptr_generation(*eb_ret, slot);
+ parent_level = btrfs_header_level(*eb_ret);
+ btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
@@ -2604,19 +2548,6 @@
return ret;
}
-static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
- int level, int *prev_cmp, int *slot)
-{
- if (*prev_cmp != 0) {
- *prev_cmp = btrfs_bin_search(b, key, level, slot);
- return *prev_cmp;
- }
-
- *slot = 0;
-
- return 0;
-}
-
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
u64 iobjectid, u64 ioff, u8 key_type,
struct btrfs_key *found_key)
@@ -2658,12 +2589,9 @@
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
/*
* The commit roots are read only so we always do read locks,
@@ -2683,7 +2611,7 @@
} else {
b = root->commit_root;
- extent_buffer_get(b);
+ atomic_inc(&b->refs);
}
level = btrfs_header_level(b);
/*
@@ -2701,6 +2629,9 @@
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -2710,7 +2641,7 @@
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ b = __btrfs_read_lock_root_node(root, p->recurse);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2727,6 +2658,17 @@
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
@@ -2816,12 +2758,10 @@
}
while (b) {
+ int dec = 0;
+
level = btrfs_header_level(b);
- /*
- * setup the path here so we can release it under lock
- * contention with the cow code
- */
if (cow) {
bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
@@ -2851,11 +2791,13 @@
btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b);
+ &b,
+ BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1], &b);
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
@@ -2888,77 +2830,25 @@
}
}
- ret = key_search(b, key, level, &prev_cmp, &slot);
- if (ret < 0)
- goto done;
-
- if (level != 0) {
- int dec = 0;
- if (ret && slot > 0) {
- dec = 1;
- slot -= 1;
- }
- p->slots[level] = slot;
- err = setup_nodes_for_search(trans, root, p, b, level,
- ins_len, &write_lock_level);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
- b = p->nodes[level];
- slot = p->slots[level];
-
- /*
- * slot 0 is special, if we change the key
- * we have to update the parent pointer
- * which means we must have a write lock
- * on the parent
- */
- if (slot == 0 && ins_len &&
- write_lock_level < level + 1) {
- write_lock_level = level + 1;
- btrfs_release_path(p);
- goto again;
- }
-
- unlock_up(p, level, lowest_unlock,
- min_write_lock_level, &write_lock_level);
-
- if (level == lowest_level) {
- if (dec)
- p->slots[level]++;
- goto done;
- }
-
- err = read_block_for_search(root, p, &b, level,
- slot, key);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
-
- if (!p->skip_locking) {
- level = btrfs_header_level(b);
- if (level <= write_lock_level) {
- if (!btrfs_try_tree_write_lock(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- }
- p->locks[level] = BTRFS_WRITE_LOCK;
- } else {
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- p->locks[level] = BTRFS_READ_LOCK;
- }
- p->nodes[level] = b;
- }
+ /*
+ * If btrfs_bin_search returns an exact match (prev_cmp == 0)
+ * we can safely assume the target key will always be in slot 0
+ * on lower levels due to the invariants BTRFS' btree provides,
+ * namely that a btrfs_key_ptr entry always points to the
+ * lowest key in the child node, thus we can skip searching
+ * lower levels
+ */
+ if (prev_cmp == 0) {
+ slot = 0;
+ ret = 0;
} else {
+ ret = btrfs_bin_search(b, key, &slot);
+ prev_cmp = ret;
+ if (ret < 0)
+ goto done;
+ }
+
+ if (level == 0) {
p->slots[level] = slot;
if (ins_len > 0 &&
btrfs_leaf_free_space(b) < ins_len) {
@@ -2983,6 +2873,68 @@
min_write_lock_level, NULL);
goto done;
}
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
+ &write_lock_level);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ b = p->nodes[level];
+ slot = p->slots[level];
+
+ /*
+ * Slot 0 is special, if we change the key we have to update
+ * the parent pointer which means we must have a write lock on
+ * the parent
+ */
+ if (slot == 0 && ins_len && write_lock_level < level + 1) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ unlock_up(p, level, lowest_unlock, min_write_lock_level,
+ &write_lock_level);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ if (!p->skip_locking) {
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ if (!btrfs_try_tree_write_lock(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_lock(b);
+ }
+ p->locks[level] = BTRFS_WRITE_LOCK;
+ } else {
+ if (!btrfs_tree_read_lock_atomic(b)) {
+ btrfs_set_path_blocking(p);
+ __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
+ p->recurse);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ }
+ p->nodes[level] = b;
+ }
}
ret = 1;
done:
@@ -3019,7 +2971,6 @@
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
- int prev_cmp = -1;
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
@@ -3039,6 +2990,8 @@
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
+ int dec = 0;
+
level = btrfs_header_level(b);
p->nodes[level] = b;
@@ -3050,56 +3003,49 @@
*/
btrfs_unlock_up_safe(p, level + 1);
- /*
- * Since we can unwind ebs we want to do a real search every
- * time.
- */
- prev_cmp = -1;
- ret = key_search(b, key, level, &prev_cmp, &slot);
+ ret = btrfs_bin_search(b, key, &slot);
if (ret < 0)
goto done;
- if (level != 0) {
- int dec = 0;
- if (ret && slot > 0) {
- dec = 1;
- slot -= 1;
- }
- p->slots[level] = slot;
- unlock_up(p, level, lowest_unlock, 0, NULL);
-
- if (level == lowest_level) {
- if (dec)
- p->slots[level]++;
- goto done;
- }
-
- err = read_block_for_search(root, p, &b, level,
- slot, key);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
-
- level = btrfs_header_level(b);
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- b = tree_mod_log_rewind(fs_info, p, b, time_seq);
- if (!b) {
- ret = -ENOMEM;
- goto done;
- }
- p->locks[level] = BTRFS_READ_LOCK;
- p->nodes[level] = b;
- } else {
+ if (level == 0) {
p->slots[level] = slot;
unlock_up(p, level, lowest_unlock, 0, NULL);
goto done;
}
+
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ if (!btrfs_tree_read_lock_atomic(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ }
+ b = tree_mod_log_rewind(fs_info, p, b, time_seq);
+ if (!b) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
}
ret = 1;
done:
@@ -3272,6 +3218,58 @@
}
/*
+ * Check key order of two sibling extent buffers.
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption can not be detected by tree-checker:
+ *
+ * Leaf @left | Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
+ *
+ * Key f6 in leaf @left itself is valid, but not valid when the next
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since tree checker has ensured all key order in each tree block
+ * is correct, we only need to bother the last key of @left and the first
+ * key of @right.
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* No key to check in one of the tree blocks */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
* try to push data from one node into the next node left in the
* tree.
*
@@ -3315,6 +3313,12 @@
} else
push_items = min(src_nritems - 8, push_items);
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3383,6 +3387,12 @@
if (max_push < push_items)
push_items = max_push;
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
@@ -3439,7 +3449,8 @@
btrfs_node_key(lower, &lower_key, 0);
c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0);
+ root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3464,7 +3475,7 @@
free_extent_buffer(old);
add_root_to_dirty_list(root);
- extent_buffer_get(c);
+ atomic_inc(&c->refs);
path->nodes[level] = c;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
path->slots[level] = 0;
@@ -3569,7 +3580,7 @@
btrfs_node_key(c, &disk_key, mid);
split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0);
+ c->start, 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3617,19 +3628,17 @@
{
struct btrfs_item *start_item;
struct btrfs_item *end_item;
- struct btrfs_map_token token;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- btrfs_init_map_token(&token, l);
start_item = btrfs_item_nr(start);
end_item = btrfs_item_nr(end);
- data_len = btrfs_token_item_offset(l, start_item, &token) +
- btrfs_token_item_size(l, start_item, &token);
- data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
+ data_len = btrfs_item_offset(l, start_item) +
+ btrfs_item_size(l, start_item);
+ data_len = data_len - btrfs_item_offset(l, end_item);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -3760,8 +3769,8 @@
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(i);
- push_space -= btrfs_token_item_size(right, item, &token);
- btrfs_set_token_item_offset(right, item, push_space, &token);
+ push_space -= btrfs_token_item_size(&token, item);
+ btrfs_set_token_item_offset(&token, item, push_space);
}
left_nritems -= push_items;
@@ -3840,7 +3849,7 @@
if (IS_ERR(right))
return 1;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(right);
@@ -3849,7 +3858,7 @@
/* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right);
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
@@ -3861,6 +3870,12 @@
if (left_nritems == 0)
goto out_unlock;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
@@ -3969,10 +3984,9 @@
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(left, item, &token);
- btrfs_set_token_item_offset(left, item,
- ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
- &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item,
+ ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -4002,9 +4016,8 @@
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(i);
- push_space = push_space - btrfs_token_item_size(right,
- item, &token);
- btrfs_set_token_item_offset(right, item, push_space, &token);
+ push_space = push_space - btrfs_token_item_size(&token, item);
+ btrfs_set_token_item_offset(&token, item, push_space);
}
btrfs_mark_buffer_dirty(left);
@@ -4075,7 +4088,7 @@
if (IS_ERR(left))
return 1;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(left);
@@ -4086,7 +4099,8 @@
/* cow and double check */
ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left);
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
@@ -4100,6 +4114,10 @@
goto out;
}
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
@@ -4146,9 +4164,8 @@
struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
- ioff = btrfs_token_item_offset(right, item, &token);
- btrfs_set_token_item_offset(right, item,
- ioff + rt_data_off, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -4349,8 +4366,18 @@
else
btrfs_item_key(l, &disk_key, mid);
+ /*
+ * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0);
+ l->start, 0, num_doubles ?
+ BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4595,9 +4622,7 @@
return ret;
path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, new_key, &item_size, 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4651,9 +4676,8 @@
u32 ioff;
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff + size_diff, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item, ioff + size_diff);
}
/* shift the data */
@@ -4750,9 +4774,8 @@
u32 ioff;
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - data_size, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item, ioff - data_size);
}
/* shift the data */
@@ -4772,14 +4795,20 @@
}
}
-/*
- * this is a helper for btrfs_insert_empty_items, the main goal here is
- * to save stack depth by doing the bulk of the work in a function
- * that doesn't call btrfs_search_slot
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @cpu_key: array of keys for items to be inserted
+ * @data_size: size of the body of each item we are going to insert
+ * @nr: size of @cpu_key/@data_size arrays
*/
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+ int nr)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -4790,6 +4819,12 @@
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
+ u32 total_size;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
@@ -4816,7 +4851,8 @@
if (old_data < data_end) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
@@ -4828,9 +4864,9 @@
u32 ioff;
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - total_data, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item,
+ ioff - total_data);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -4849,10 +4885,9 @@
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- btrfs_set_token_item_offset(leaf, item,
- data_end - data_size[i], &token);
data_end -= data_size[i];
- btrfs_set_token_item_size(leaf, item, data_size[i], &token);
+ btrfs_set_token_item_offset(&token, item, data_end);
+ btrfs_set_token_item_size(&token, item, data_size[i]);
}
btrfs_set_header_nritems(leaf, nritems + nr);
@@ -4893,8 +4928,7 @@
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size,
- total_data, total_size, nr);
+ setup_items_for_insert(root, path, cpu_key, data_size, nr);
return 0;
}
@@ -4997,7 +5031,7 @@
root_sub_used(root, leaf->len);
- extent_buffer_get(leaf);
+ atomic_inc(&leaf->refs);
btrfs_free_tree_block(trans, root, leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
@@ -5040,9 +5074,8 @@
u32 ioff;
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff + dsize, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item, ioff + dsize);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -5078,7 +5111,7 @@
* for possible call to del_ptr below
*/
slot = path->slots[1];
- extent_buffer_get(leaf);
+ atomic_inc(&leaf->refs);
btrfs_set_path_blocking(path);
wret = push_leaf_left(trans, root, path, 1, 1,
@@ -5213,7 +5246,7 @@
while (1) {
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
- sret = btrfs_bin_search(cur, min_key, level, &slot);
+ sret = btrfs_bin_search(cur, min_key, &slot);
if (sret < 0) {
ret = sret;
goto out;
@@ -5496,7 +5529,9 @@
}
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5531,7 +5566,9 @@
ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}