Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cceaf05..9108a73 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -647,9 +647,7 @@
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
- struct inode *inode = tree->private_data;
-
- btrfs_panic(btrfs_sb(inode->i_sb), err,
+ btrfs_panic(tree->fs_info, err,
"locking error: extent tree was modified by another thread while locked");
}
@@ -1583,21 +1581,25 @@
/* Find first extent with bits cleared */
while (1) {
node = __etree_search(tree, start, &next, &prev, NULL, NULL);
- if (!node) {
+ if (!node && !next && !prev) {
+ /*
+ * Tree is completely empty, send full range and let
+ * caller deal with it
+ */
+ *start_ret = 0;
+ *end_ret = -1;
+ goto out;
+ } else if (!node && !next) {
+ /*
+ * We are past the last allocated chunk, set start at
+ * the end of the last extent.
+ */
+ state = rb_entry(prev, struct extent_state, rb_node);
+ *start_ret = state->end + 1;
+ *end_ret = -1;
+ goto out;
+ } else if (!node) {
node = next;
- if (!node) {
- /*
- * We are past the last allocated chunk,
- * set start at the end of the last extent. The
- * device alloc tree should never be empty so
- * prev is always set.
- */
- ASSERT(prev);
- state = rb_entry(prev, struct extent_state, rb_node);
- *start_ret = state->end + 1;
- *end_ret = -1;
- goto out;
- }
}
/*
* At this point 'node' either contains 'start' or start is
@@ -1899,7 +1901,7 @@
if (page_ops & PAGE_SET_PRIVATE2)
SetPagePrivate2(pages[i]);
- if (pages[i] == locked_page) {
+ if (locked_page && pages[i] == locked_page) {
put_page(pages[i]);
pages_locked++;
continue;
@@ -1919,7 +1921,8 @@
if (!PageDirty(pages[i]) ||
pages[i]->mapping != mapping) {
unlock_page(pages[i]);
- put_page(pages[i]);
+ for (; i < ret; i++)
+ put_page(pages[i]);
err = -EAGAIN;
goto out;
}
@@ -3149,7 +3152,7 @@
/*
* If we have a file range that points to a compressed extent
- * and it's followed by a consecutive file range that points to
+ * and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
@@ -3924,6 +3927,7 @@
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@@ -3938,6 +3942,11 @@
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ /*
+ * Start from the beginning does not need to cycle over the
+ * range, mark it as scanned.
+ */
+ scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@@ -3955,7 +3964,6 @@
tag))) {
unsigned i;
- scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -4033,7 +4041,39 @@
end_write_bio(&epd, ret);
return ret;
}
- ret = flush_write_bio(&epd);
+ /*
+ * If something went wrong, don't allow any metadata write bio to be
+ * submitted.
+ *
+ * This would prevent use-after-free if we had dirty pages not
+ * cleaned up, which can still happen by fuzzed images.
+ *
+ * - Bad extent tree
+ * Allowing existing tree block to be allocated for other trees.
+ *
+ * - Log tree operations
+ * Exiting tree blocks get allocated to log tree, bumps its
+ * generation, then get cleaned in tree re-balance.
+ * Such tree block will not be written back, since it's clean,
+ * thus no WRITTEN flag set.
+ * And after log writes back, this tree block is not traced by
+ * any dirty extent_io_tree.
+ *
+ * - Offending tree block gets re-dirtied from its original owner
+ * Since it has bumped generation, no WRITTEN flag, it can be
+ * reused without COWing. This tree block will not be traced
+ * by btrfs_transaction::dirty_pages.
+ *
+ * Now such dirty tree block will not be cleaned by any dirty
+ * extent io tree. Thus we don't want to submit such wild eb
+ * if the fs already has error.
+ */
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ ret = flush_write_bio(&epd);
+ } else {
+ ret = -EROFS;
+ end_write_bio(&epd, ret);
+ }
return ret;
}
@@ -4084,6 +4124,11 @@
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ /*
+ * Start from the beginning does not need to cycle over the
+ * range, mark it as scanned.
+ */
+ scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@@ -4117,11 +4162,10 @@
&index, end, tag))) {
unsigned i;
- scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- done_index = page->index;
+ done_index = page->index + 1;
/*
* At this point we hold neither the i_pages lock nor
* the page lock: the page may be truncated or
@@ -4156,16 +4200,6 @@
ret = __extent_writepage(page, wbc, epd);
if (ret < 0) {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
- done_index = page->index + 1;
done = 1;
break;
}
@@ -4187,7 +4221,16 @@
*/
scanned = 1;
index = 0;
- goto retry;
+
+ /*
+ * If we're looping we could run into a page that is locked by a
+ * writer and that writer could be waiting on writeback for a
+ * page in our current bio, and thus deadlock, so flush the
+ * write bio here.
+ */
+ ret = flush_write_bio(epd);
+ if (!ret)
+ goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
@@ -4422,20 +4465,32 @@
free_extent_map(em);
break;
}
- if (!test_range_bit(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED, 0, NULL)) {
+ if (test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED, 0, NULL))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used
+ * by a fast fsync, we can remove it. If it's being
+ * logged we can safely remove it since fsync took an
+ * extra reference on the em.
+ */
+ if (list_empty(&em->list) ||
+ test_bit(EXTENT_FLAG_LOGGING, &em->flags)) {
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&btrfs_inode->runtime_flags);
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
}
+next:
start = extent_map_end(em);
write_unlock(&map->lock);
/* once for us */
free_extent_map(em);
+
+ cond_resched(); /* Allow large-extent preemption. */
}
}
return try_release_extent_state(tree, page, mask);
@@ -4593,7 +4648,7 @@
__u64 start, __u64 len)
{
int ret = 0;
- u64 off = start;
+ u64 off;
u64 max = start + len;
u32 flags = 0;
u32 found_type;
@@ -4629,6 +4684,11 @@
goto out_free_ulist;
}
+ /*
+ * We can't initialize that to 'start' as this could miss extents due
+ * to extent item merging
+ */
+ off = 0;
start = round_down(start, btrfs_inode_sectorsize(inode));
len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
@@ -4981,25 +5041,28 @@
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
- /* the ref bit is tricky. We have to make sure it is set
- * if we have the buffer dirty. Otherwise the
- * code to free a buffer can end up dropping a dirty
- * page
+ /*
+ * The TREE_REF bit is first set when the extent_buffer is added
+ * to the radix tree. It is also reset, if unset, when a new reference
+ * is created by find_extent_buffer.
*
- * Once the ref bit is set, it won't go away while the
- * buffer is dirty or in writeback, and it also won't
- * go away while we have the reference count on the
- * eb bumped.
+ * It is only cleared in two cases: freeing the last non-tree
+ * reference to the extent_buffer when its STALE bit is set or
+ * calling releasepage when the tree reference is the only reference.
*
- * We can't just set the ref bit without bumping the
- * ref on the eb because free_extent_buffer might
- * see the ref bit and try to clear it. If this happens
- * free_extent_buffer might end up dropping our original
- * ref by mistake and freeing the page before we are able
- * to add one more ref.
+ * In both cases, care is taken to ensure that the extent_buffer's
+ * pages are not under io. However, releasepage can be concurrently
+ * called with creating new references, which is prone to race
+ * conditions between the calls to check_buffer_tree_ref in those
+ * codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * So bump the ref count first, then set the bit. If someone
- * beat us to it, drop the ref we added.
+ * The actual lifetime of the extent_buffer in the radix tree is
+ * adequately protected by the refcount, but the TREE_REF bit and
+ * its corresponding reference are not. To protect against this
+ * class of races, we call check_buffer_tree_ref from the codepaths
+ * which trigger io after they set eb->io_pages. Note that once io is
+ * initiated, TREE_REF can no longer be cleared, so that is the
+ * moment at which any such race is best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5076,12 +5139,14 @@
return eb;
eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
- return NULL;
+ return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret)
+ if (ret) {
+ exists = ERR_PTR(ret);
goto free_eb;
+ }
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
start >> PAGE_SHIFT, eb);
@@ -5447,6 +5512,11 @@
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
+ /*
+ * It is possible for releasepage to clear the TREE_REF bit before we
+ * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+ */
+ check_buffer_tree_ref(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
@@ -5540,9 +5610,9 @@
}
}
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dstv,
- unsigned long start, unsigned long len)
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5563,7 +5633,7 @@
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (copy_to_user(dst, kaddr + offset, cur)) {
+ if (probe_user_write(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}