Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9108a73..5fc65a7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -14,6 +14,7 @@
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
+#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
@@ -34,36 +35,59 @@
}
#ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(buffers);
static LIST_HEAD(states);
-
static DEFINE_SPINLOCK(leak_lock);
-static inline
-void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+static inline void btrfs_leak_debug_add(spinlock_t *lock,
+ struct list_head *new,
+ struct list_head *head)
{
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
+ spin_lock_irqsave(lock, flags);
list_add(new, head);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
-static inline
-void btrfs_leak_debug_del(struct list_head *entry)
+static inline void btrfs_leak_debug_del(spinlock_t *lock,
+ struct list_head *entry)
{
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
+ spin_lock_irqsave(lock, flags);
list_del(entry);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
-static inline
-void btrfs_leak_debug_check(void)
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
+{
+ struct extent_buffer *eb;
+ unsigned long flags;
+
+ /*
+ * If we didn't get into open_ctree our allocated_ebs will not be
+ * initialized, so just skip this.
+ */
+ if (!fs_info->allocated_ebs.next)
+ return;
+
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ while (!list_empty(&fs_info->allocated_ebs)) {
+ eb = list_first_entry(&fs_info->allocated_ebs,
+ struct extent_buffer, leak_list);
+ pr_err(
+ "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_header_owner(eb));
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
{
struct extent_state *state;
- struct extent_buffer *eb;
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
@@ -74,14 +98,6 @@
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
-
- while (!list_empty(&buffers)) {
- eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
- list_del(&eb->leak_list);
- kmem_cache_free(extent_buffer_cache, eb);
- }
}
#define btrfs_debug_check_extent_io_range(tree, start, end) \
@@ -103,9 +119,9 @@
}
}
#else
-#define btrfs_leak_debug_add(new, head) do {} while (0)
-#define btrfs_leak_debug_del(entry) do {} while (0)
-#define btrfs_leak_debug_check() do {} while (0)
+#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
+#define btrfs_leak_debug_del(lock, entry) do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
@@ -117,7 +133,6 @@
struct extent_page_data {
struct bio *bio;
- struct extent_io_tree *tree;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@@ -145,19 +160,20 @@
return ret;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
{
blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags);
+ if (is_data_inode(tree->private_data))
+ ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ bio_flags);
else
- btrfsic_submit_bio(bio);
+ ret = btrfs_submit_metadata_bio(tree->private_data, bio,
+ mirror_num, bio_flags);
return blk_status_to_errno(ret);
}
@@ -196,19 +212,23 @@
return ret;
}
-int __init extent_io_init(void)
+int __init extent_state_cache_init(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
+ return 0;
+}
+int __init extent_io_init(void)
+{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
- goto free_state_cache;
+ return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_io_bio, bio),
@@ -226,38 +246,47 @@
free_buffer_cache:
kmem_cache_destroy(extent_buffer_cache);
extent_buffer_cache = NULL;
-
-free_state_cache:
- kmem_cache_destroy(extent_state_cache);
- extent_state_cache = NULL;
return -ENOMEM;
}
+void __cold extent_state_cache_exit(void)
+{
+ btrfs_extent_state_leak_debug_check();
+ kmem_cache_destroy(extent_state_cache);
+}
+
void __cold extent_io_exit(void)
{
- btrfs_leak_debug_check();
-
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
*/
rcu_barrier();
- kmem_cache_destroy(extent_state_cache);
kmem_cache_destroy(extent_buffer_cache);
bioset_exit(&btrfs_bioset);
}
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data)
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
- tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
void extent_io_tree_release(struct extent_io_tree *tree)
@@ -304,7 +333,7 @@
state->state = 0;
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
- btrfs_leak_debug_add(&state->leak_list, &states);
+ btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
@@ -317,7 +346,7 @@
return;
if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
- btrfs_leak_debug_del(&state->leak_list);
+ btrfs_leak_debug_del(&leak_lock, &state->leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
@@ -1041,6 +1070,16 @@
goto out;
}
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it, not necessary to split it or do anything with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
@@ -1556,6 +1595,43 @@
}
/**
+ * find_contiguous_extent_bit: find a contiguous area of bits
+ * @tree - io tree to check
+ * @start - offset to start the search from
+ * @start_ret - the first offset we found with the bits set
+ * @end_ret - the final contiguous range of the bits that were set
+ * @bits - bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/**
* find_first_clear_extent_bit - find the first range that has @bits not set.
* This range could start before @start.
*
@@ -1678,9 +1754,9 @@
*
* true is returned if we find something, false if nothing was in the tree
*/
-static noinline bool find_delalloc_range(struct extent_io_tree *tree,
- u64 *start, u64 *end, u64 max_bytes,
- struct extent_state **cached_state)
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
{
struct rb_node *node;
struct extent_state *state;
@@ -1798,8 +1874,8 @@
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
- found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
- max_bytes, &cached_state);
+ found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ max_bytes, &cached_state);
if (!found || delalloc_end <= *start) {
*start = delalloc_start;
*end = delalloc_end;
@@ -1940,15 +2016,14 @@
return err;
}
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned clear_bits,
unsigned long page_ops)
{
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
- NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
- __process_pages_contig(inode->i_mapping, locked_page,
+ __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start >> PAGE_SHIFT, end >> PAGE_SHIFT,
page_ops, NULL);
}
@@ -2017,8 +2092,8 @@
* set the private field for a given byte offset in the tree. If there isn't
* an extent_state there already, this does nothing.
*/
-static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec)
+int set_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record *failrec)
{
struct rb_node *node;
struct extent_state *state;
@@ -2045,12 +2120,11 @@
return ret;
}
-static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec)
+struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
{
struct rb_node *node;
struct extent_state *state;
- int ret = 0;
+ struct io_failure_record *failrec;
spin_lock(&tree->lock);
/*
@@ -2059,18 +2133,19 @@
*/
node = tree_search(tree, start);
if (!node) {
- ret = -ENOENT;
+ failrec = ERR_PTR(-ENOENT);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
if (state->start != start) {
- ret = -ENOENT;
+ failrec = ERR_PTR(-ENOENT);
goto out;
}
- *failrec = state->failrec;
+
+ failrec = state->failrec;
out:
spin_unlock(&tree->lock);
- return ret;
+ return failrec;
}
/*
@@ -2256,7 +2331,7 @@
return 0;
}
-int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
+int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 start = eb->start;
@@ -2300,8 +2375,8 @@
if (!ret)
return 0;
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret)
+ failrec = get_state_failrec(failure_tree, start);
+ if (IS_ERR(failrec))
return 0;
BUG_ON(!failrec->this_mirror);
@@ -2373,8 +2448,8 @@
spin_unlock(&failure_tree->lock);
}
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret)
+static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
+ u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct io_failure_record *failrec;
@@ -2385,65 +2460,8 @@
int ret;
u64 logical;
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret) {
- failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
- if (!failrec)
- return -ENOMEM;
-
- failrec->start = start;
- failrec->len = end - start + 1;
- failrec->this_mirror = 0;
- failrec->bio_flags = 0;
- failrec->in_validation = 0;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, failrec->len);
- if (!em) {
- read_unlock(&em_tree->lock);
- kfree(failrec);
- return -EIO;
- }
-
- if (em->start > start || em->start + em->len <= start) {
- free_extent_map(em);
- em = NULL;
- }
- read_unlock(&em_tree->lock);
- if (!em) {
- kfree(failrec);
- return -EIO;
- }
-
- logical = start - em->start;
- logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags,
- em->compress_type);
- }
-
- btrfs_debug(fs_info,
- "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
- logical, start, failrec->len);
-
- failrec->logical = logical;
- free_extent_map(em);
-
- /* set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret >= 0)
- ret = set_state_failrec(failure_tree, start, failrec);
- /* set the bits in the inode's tree */
- if (ret >= 0)
- ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
- if (ret < 0) {
- kfree(failrec);
- return ret;
- }
- } else {
+ failrec = get_state_failrec(failure_tree, start);
+ if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
failrec->logical, failrec->start, failrec->len,
@@ -2453,15 +2471,71 @@
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
+
+ return failrec;
}
- *failrec_ret = failrec;
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return ERR_PTR(-ENOMEM);
- return 0;
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->this_mirror = 0;
+ failrec->bio_flags = 0;
+ failrec->in_validation = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ if (em->start > start || em->start + em->len <= start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags, em->compress_type);
+ }
+
+ btrfs_debug(fs_info,
+ "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
+ logical, start, failrec->len);
+
+ failrec->logical = logical;
+ free_extent_map(em);
+
+ /* Set the bits in the private failure tree */
+ ret = set_extent_bits(failure_tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY);
+ if (ret >= 0) {
+ ret = set_state_failrec(failure_tree, start, failrec);
+ /* Set the bits in the inode's tree */
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
+ } else if (ret < 0) {
+ kfree(failrec);
+ return ERR_PTR(ret);
+ }
+
+ return failrec;
}
-bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
- struct io_failure_record *failrec, int failed_mirror)
+static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
+ struct io_failure_record *failrec,
+ int failed_mirror)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int num_copies;
@@ -2484,7 +2558,7 @@
* a) deliver good data to the caller
* b) correct the bad sectors on disk
*/
- if (failed_bio_pages > 1) {
+ if (needs_validation) {
/*
* to fulfill b), we need to know the exact failing sectors, as
* we don't want to rewrite any more than the failed ones. thus,
@@ -2523,95 +2597,114 @@
return true;
}
-
-struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
- struct io_failure_record *failrec,
- struct page *page, int pg_offset, int icsum,
- bio_end_io_t *endio_func, void *data)
+static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct bio *bio;
- struct btrfs_io_bio *btrfs_failed_bio;
- struct btrfs_io_bio *btrfs_bio;
+ u64 len = 0;
+ const u32 blocksize = inode->i_sb->s_blocksize;
- bio = btrfs_io_bio_alloc(1);
- bio->bi_end_io = endio_func;
- bio->bi_iter.bi_sector = failrec->logical >> 9;
- bio_set_dev(bio, fs_info->fs_devices->latest_bdev);
- bio->bi_iter.bi_size = 0;
- bio->bi_private = data;
+ /*
+ * If bi_status is BLK_STS_OK, then this was a checksum error, not an
+ * I/O error. In this case, we already know exactly which sector was
+ * bad, so we don't need to validate.
+ */
+ if (bio->bi_status == BLK_STS_OK)
+ return false;
- btrfs_failed_bio = btrfs_io_bio(failed_bio);
- if (btrfs_failed_bio->csum) {
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ /*
+ * We need to validate each sector individually if the failed I/O was
+ * for multiple sectors.
+ *
+ * There are a few possible bios that can end up here:
+ * 1. A buffered read bio, which is not cloned.
+ * 2. A direct I/O read bio, which is cloned.
+ * 3. A (buffered or direct) repair bio, which is not cloned.
+ *
+ * For cloned bios (case 2), we can get the size from
+ * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
+ * it from the bvecs.
+ */
+ if (bio_flagged(bio, BIO_CLONED)) {
+ if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
+ return true;
+ } else {
+ struct bio_vec *bvec;
+ int i;
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_bio->csum = btrfs_bio->csum_inline;
- icsum *= csum_size;
- memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
- csum_size);
+ bio_for_each_bvec_all(bvec, bio, i) {
+ len += bvec->bv_len;
+ if (len > blocksize)
+ return true;
+ }
}
-
- bio_add_page(bio, page, failrec->len, pg_offset);
-
- return bio;
+ return false;
}
-/*
- * This is a generic handler for readpage errors. If other copies exist, read
- * those and write back good data to the failed position. Does not investigate
- * in remapping the failed extent elsewhere, hoping the device will be smart
- * enough to do this as needed
- */
-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int failed_mirror)
+blk_status_t btrfs_submit_read_repair(struct inode *inode,
+ struct bio *failed_bio, u64 phy_offset,
+ struct page *page, unsigned int pgoff,
+ u64 start, u64 end, int failed_mirror,
+ submit_bio_hook_t *submit_bio_hook)
{
struct io_failure_record *failrec;
- struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct bio *bio;
- int read_mode = 0;
+ struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+ const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits;
+ bool need_validation;
+ struct bio *repair_bio;
+ struct btrfs_io_bio *repair_io_bio;
blk_status_t status;
- int ret;
- unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
+
+ btrfs_debug(fs_info,
+ "repair read error: read error at %llu", start);
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
- if (ret)
- return ret;
+ failrec = btrfs_get_io_failure_record(inode, start, end);
+ if (IS_ERR(failrec))
+ return errno_to_blk_status(PTR_ERR(failrec));
- if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
+ need_validation = btrfs_io_needs_validation(inode, failed_bio);
+
+ if (!btrfs_check_repairable(inode, need_validation, failrec,
failed_mirror)) {
free_io_failure(failure_tree, tree, failrec);
- return -EIO;
+ return BLK_STS_IOERR;
}
- if (failed_bio_pages > 1)
- read_mode |= REQ_FAILFAST_DEV;
+ repair_bio = btrfs_io_bio_alloc(1);
+ repair_io_bio = btrfs_io_bio(repair_bio);
+ repair_bio->bi_opf = REQ_OP_READ;
+ if (need_validation)
+ repair_bio->bi_opf |= REQ_FAILFAST_DEV;
+ repair_bio->bi_end_io = failed_bio->bi_end_io;
+ repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
+ repair_bio->bi_private = failed_bio->bi_private;
- phy_offset >>= inode->i_sb->s_blocksize_bits;
- bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- start - page_offset(page),
- (int)phy_offset, failed_bio->bi_end_io,
- NULL);
- bio->bi_opf = REQ_OP_READ | read_mode;
+ if (failed_io_bio->csum) {
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+ repair_io_bio->csum = repair_io_bio->csum_inline;
+ memcpy(repair_io_bio->csum,
+ failed_io_bio->csum + csum_size * icsum, csum_size);
+ }
+
+ bio_add_page(repair_bio, page, failrec->len, pgoff);
+ repair_io_bio->logical = failrec->start;
+ repair_io_bio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
- "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
- read_mode, failrec->this_mirror, failrec->in_validation);
+"repair read error: submitting new read to mirror %d, in_validation=%d",
+ failrec->this_mirror, failrec->in_validation);
- status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
- failrec->bio_flags);
+ status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
+ failrec->bio_flags);
if (status) {
free_io_failure(failure_tree, tree, failrec);
- bio_put(bio);
- ret = blk_status_to_errno(status);
+ bio_put(repair_bio);
}
-
- return ret;
+ return status;
}
/* lots and lots of room for performance fixes in the end_bio funcs */
@@ -2724,8 +2817,6 @@
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- bool data_inode = btrfs_ino(BTRFS_I(inode))
- != BTRFS_BTREE_INODE_OBJECTID;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2756,9 +2847,12 @@
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
- ret = tree->ops->readpage_end_io_hook(io_bio, offset,
- page, start, end,
- mirror);
+ if (is_data_inode(inode))
+ ret = btrfs_verify_data_csum(io_bio, offset, page,
+ start, end, mirror);
+ else
+ ret = btrfs_validate_metadata_buffer(io_bio,
+ offset, page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2771,7 +2865,7 @@
if (likely(uptodate))
goto readpage_ok;
- if (data_inode) {
+ if (is_data_inode(inode)) {
/*
* The generic bio_readpage_error handles errors the
@@ -2783,9 +2877,10 @@
* If it can't handle the error it will return -EIO and
* we remain responsible for that page.
*/
- ret = bio_readpage_error(bio, offset, page, start, end,
- mirror);
- if (ret == 0) {
+ if (!btrfs_submit_read_repair(inode, bio, offset, page,
+ start - page_offset(page),
+ start, end, mirror,
+ btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
offset += len;
continue;
@@ -2916,25 +3011,22 @@
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
- * @tree: tree so we can call our merge_bio hook
* @wbc: optional writeback control for io accounting
* @page: page to add to the bio
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
* @size: portion of page that we want to write
* @offset: starting offset in the page
- * @bdev: attach newly created bios to this bdev
* @bio_ret: must be valid pointer, newly allocated bio will be stored there
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
*/
-static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
+static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
struct page *page, u64 offset,
size_t size, unsigned long pg_offset,
- struct block_device *bdev,
struct bio **bio_ret,
bio_end_io_t end_io_func,
int mirror_num,
@@ -2946,6 +3038,7 @@
struct bio *bio;
size_t page_size = min_t(size_t, size, PAGE_SIZE);
sector_t sector = offset >> 9;
+ struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
ASSERT(bio_ret);
@@ -2959,7 +3052,6 @@
else
contig = bio_end_sector(bio) == sector;
- ASSERT(tree->ops);
if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
can_merge = false;
@@ -2980,13 +3072,16 @@
}
bio = btrfs_bio_alloc(offset);
- bio_set_dev(bio, bdev);
bio_add_page(bio, page, page_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
bio->bi_opf = opf;
if (wbc) {
+ struct block_device *bdev;
+
+ bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
+ bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
wbc_account_cgroup_owner(wbc, page, page_size);
}
@@ -2999,28 +3094,21 @@
static void attach_extent_buffer_page(struct extent_buffer *eb,
struct page *page)
{
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, (unsigned long)eb);
- } else {
+ if (!PagePrivate(page))
+ attach_page_private(page, eb);
+ else
WARN_ON(page->private != (unsigned long)eb);
- }
}
void set_page_extent_mapped(struct page *page)
{
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, EXTENT_PAGE_PRIVATE);
- }
+ if (!PagePrivate(page))
+ attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
}
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
- u64 start, u64 len, get_extent_t *get_extent,
- struct extent_map **em_cached)
+ u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
@@ -3036,7 +3124,7 @@
*em_cached = NULL;
}
- em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
@@ -3051,13 +3139,9 @@
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct extent_io_tree *tree,
- struct page *page,
- get_extent_t *get_extent,
- struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags, unsigned int read_flags,
- u64 *prev_em_start)
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -3068,7 +3152,6 @@
u64 block_start;
u64 cur_end;
struct extent_map *em;
- struct block_device *bdev;
int ret = 0;
int nr = 0;
size_t pg_offset = 0;
@@ -3076,6 +3159,7 @@
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
set_page_extent_mapped(page);
@@ -3119,7 +3203,7 @@
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, get_extent, em_cached);
+ end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
unlock_extent(tree, cur, end);
@@ -3145,7 +3229,6 @@
offset = em->block_start + extent_offset;
disk_io_size = iosize;
}
- bdev = em->bdev;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
@@ -3233,10 +3316,10 @@
continue;
}
- ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
+ ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
- pg_offset, bdev, bio,
- end_bio_extent_readpage, mirror_num,
+ pg_offset, bio,
+ end_bio_extent_readpage, 0,
*bio_flags,
this_bio_flag,
force_bio_submit);
@@ -3260,8 +3343,7 @@
return ret;
}
-static inline void contiguous_readpages(struct extent_io_tree *tree,
- struct page *pages[], int nr_pages,
+static inline void contiguous_readpages(struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
struct bio **bio,
@@ -3271,48 +3353,15 @@
struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
+ btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
+ REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
-static int __extent_read_full_page(struct extent_io_tree *tree,
- struct page *page,
- get_extent_t *get_extent,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
- unsigned int read_flags)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- int ret;
-
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
-
- ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
- bio_flags, read_flags, NULL);
- return ret;
-}
-
-int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
- int ret;
-
- ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
- &bio_flags, 0);
- if (bio)
- ret = submit_one_bio(bio, mirror_num, bio_flags);
- return ret;
-}
-
static void update_nr_written(struct writeback_control *wbc,
unsigned long nr_written)
{
@@ -3329,7 +3378,7 @@
* This returns 0 if all went well (page still locked)
* This returns < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int writepage_delalloc(struct inode *inode,
+static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
u64 delalloc_start, unsigned long *nr_written)
{
@@ -3342,7 +3391,7 @@
while (delalloc_end < page_end) {
- found = find_lock_delalloc_range(inode, page,
+ found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
if (!found) {
@@ -3359,8 +3408,7 @@
* started, so we don't want to return > 0 unless
* things are going well.
*/
- ret = ret < 0 ? ret : -EIO;
- goto done;
+ return ret < 0 ? ret : -EIO;
}
/*
* delalloc_end is already one less than the total length, so
@@ -3392,10 +3440,7 @@
return 1;
}
- ret = 0;
-
-done:
- return ret;
+ return 0;
}
/*
@@ -3406,15 +3451,15 @@
* 0 if all went well (page still locked)
* < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
struct writeback_control *wbc,
struct extent_page_data *epd,
loff_t i_size,
unsigned long nr_written,
- unsigned int write_flags, int *nr_ret)
+ int *nr_ret)
{
- struct extent_io_tree *tree = epd->tree;
+ struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
u64 end;
@@ -3423,21 +3468,17 @@
u64 block_start;
u64 iosize;
struct extent_map *em;
- struct block_device *bdev;
size_t pg_offset = 0;
size_t blocksize;
int ret = 0;
int nr = 0;
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
ret = btrfs_writepage_cow_fixup(page, start, page_end);
if (ret) {
/* Fixup worker will requeue */
- if (ret == -EBUSY)
- wbc->pages_skipped++;
- else
- redirty_page_for_writepage(wbc, page);
-
+ redirty_page_for_writepage(wbc, page);
update_nr_written(wbc, nr_written);
unlock_page(page);
return 1;
@@ -3450,12 +3491,7 @@
update_nr_written(wbc, nr_written + 1);
end = page_end;
- if (i_size <= start) {
- btrfs_writepage_endio_finish_ordered(page, start, page_end, 1);
- goto done;
- }
-
- blocksize = inode->i_sb->s_blocksize;
+ blocksize = inode->vfs_inode.i_sb->s_blocksize;
while (cur <= end) {
u64 em_end;
@@ -3466,8 +3502,7 @@
page_end, 1);
break;
}
- em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
- end - cur + 1, 1);
+ em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
ret = PTR_ERR_OR_ZERO(em);
@@ -3481,7 +3516,6 @@
iosize = min(em_end - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
offset = em->block_start + extent_offset;
- bdev = em->bdev;
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
free_extent_map(em);
@@ -3493,22 +3527,11 @@
*/
if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
- /*
- * end_io notification does not happen here for
- * compressed extents
- */
- if (!compressed)
- btrfs_writepage_endio_finish_ordered(page, cur,
- cur + iosize - 1,
- 1);
- else if (compressed) {
- /* we don't want to end_page_writeback on
- * a compressed extent. this happens
- * elsewhere
- */
+ if (compressed)
nr++;
- }
-
+ else
+ btrfs_writepage_endio_finish_ordered(page, cur,
+ cur + iosize - 1, 1);
cur += iosize;
pg_offset += iosize;
continue;
@@ -3516,14 +3539,14 @@
btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
- btrfs_err(BTRFS_I(inode)->root->fs_info,
+ btrfs_err(inode->root->fs_info,
"page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
page, offset, iosize, pg_offset,
- bdev, &epd->bio,
+ &epd->bio,
end_bio_extent_writepage,
0, 0, 0, false);
if (ret) {
@@ -3536,7 +3559,6 @@
pg_offset += iosize;
nr++;
}
-done:
*nr_ret = nr;
return ret;
}
@@ -3558,14 +3580,11 @@
u64 page_end = start + PAGE_SIZE - 1;
int ret;
int nr = 0;
- size_t pg_offset = 0;
+ size_t pg_offset;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- unsigned int write_flags = 0;
unsigned long nr_written = 0;
- write_flags = wbc_to_write_flags(wbc);
-
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
@@ -3590,22 +3609,21 @@
flush_dcache_page(page);
}
- pg_offset = 0;
-
set_page_extent_mapped(page);
if (!epd->extent_locked) {
- ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
+ &nr_written);
if (ret == 1)
- goto done_unlocked;
+ return 0;
if (ret)
goto done;
}
- ret = __extent_writepage_io(inode, page, wbc, epd,
- i_size, nr_written, write_flags, &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
+ nr_written, &nr);
if (ret == 1)
- goto done_unlocked;
+ return 0;
done:
if (nr == 0) {
@@ -3620,9 +3638,6 @@
unlock_page(page);
ASSERT(ret <= 0);
return ret;
-
-done_unlocked:
- return 0;
}
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
@@ -3755,6 +3770,12 @@
return;
/*
+ * A read may stumble upon this buffer later, make sure that it gets an
+ * error and knows there was an error.
+ */
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ /*
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
@@ -3851,9 +3872,6 @@
struct writeback_control *wbc,
struct extent_page_data *epd)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct block_device *bdev = fs_info->fs_devices->latest_bdev;
- struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
u32 nritems;
int i, num_pages;
@@ -3886,8 +3904,8 @@
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
- p, offset, PAGE_SIZE, 0, bdev,
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
+ p, offset, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
@@ -3919,11 +3937,9 @@
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
struct extent_buffer *eb, *prev_eb = NULL;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4245,7 +4261,6 @@
int ret;
struct extent_page_data epd = {
.bio = NULL,
- .tree = &BTRFS_I(page->mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4267,14 +4282,12 @@
{
int ret = 0;
struct address_space *mapping = inode->i_mapping;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *page;
unsigned long nr_pages = (end - start + PAGE_SIZE) >>
PAGE_SHIFT;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
@@ -4283,8 +4296,12 @@
.nr_to_write = nr_pages * 2,
.range_start = start,
.range_end = end + 1,
+ /* We're called from an async helper function */
+ .punt_to_cgroup = 1,
+ .no_cgroup_owner = 1,
};
+ wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
while (start <= end) {
page = find_get_page(mapping, start >> PAGE_SHIFT);
if (clear_page_dirty_for_io(page))
@@ -4299,11 +4316,12 @@
}
ASSERT(ret <= 0);
- if (ret < 0) {
+ if (ret == 0)
+ ret = flush_write_bio(&epd);
+ else
end_write_bio(&epd, ret);
- return ret;
- }
- ret = flush_write_bio(&epd);
+
+ wbc_detach_inode(&wbc_writepages);
return ret;
}
@@ -4313,7 +4331,6 @@
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
- .tree = &BTRFS_I(mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4328,52 +4345,32 @@
return ret;
}
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages)
+void extent_readahead(struct readahead_control *rac)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
- int nr = 0;
u64 prev_em_start = (u64)-1;
+ int nr;
- while (!list_empty(pages)) {
- u64 contig_end = 0;
+ while ((nr = readahead_page_batch(rac, pagepool))) {
+ u64 contig_start = page_offset(pagepool[0]);
+ u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
- for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
- struct page *page = lru_to_page(pages);
+ ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
- prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping))) {
- put_page(page);
- break;
- }
-
- pagepool[nr++] = page;
- contig_end = page_offset(page) + PAGE_SIZE - 1;
- }
-
- if (nr) {
- u64 contig_start = page_offset(pagepool[0]);
-
- ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
-
- contiguous_readpages(tree, pagepool, nr, contig_start,
- contig_end, &em_cached, &bio, &bio_flags,
- &prev_em_start);
- }
+ contiguous_readpages(pagepool, nr, contig_start, contig_end,
+ &em_cached, &bio, &bio_flags, &prev_em_start);
}
if (em_cached)
free_extent_map(em_cached);
- if (bio)
- return submit_one_bio(bio, 0, bio_flags);
- return 0;
+ if (bio) {
+ if (submit_one_bio(bio, 0, bio_flags))
+ return;
+ }
}
/*
@@ -4452,6 +4449,9 @@
page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
+ struct btrfs_fs_info *fs_info;
+ u64 cur_gen;
+
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
@@ -4476,13 +4476,33 @@
* extra reference on the em.
*/
if (list_empty(&em->list) ||
- test_bit(EXTENT_FLAG_LOGGING, &em->flags)) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &btrfs_inode->runtime_flags);
- remove_extent_mapping(map, em);
- /* once for the rb tree */
- free_extent_map(em);
- }
+ test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it
+ * only if its generation is older then the current one,
+ * in which case we don't need it for a fast fsync.
+ * Otherwise don't remove it, we could be racing with an
+ * ongoing fast fsync that could miss the new extent.
+ */
+ fs_info = btrfs_inode->root->fs_info;
+ spin_lock(&fs_info->trans_lock);
+ cur_gen = fs_info->generation;
+ spin_unlock(&fs_info->trans_lock);
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower then the current generation, so there
+ * is no need to set the full fsync flag on the inode (it
+ * hurts the fsync performance for workloads with a data
+ * size that exceeds or is close to the system's memory).
+ */
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
next:
start = extent_map_end(em);
write_unlock(&map->lock);
@@ -4500,7 +4520,7 @@
* helper function for fiemap, which doesn't want to see any holes.
* This maps until we find something past 'last'
*/
-static struct extent_map *get_extent_skip_holes(struct inode *inode,
+static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -4515,7 +4535,7 @@
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
+ em = btrfs_get_extent_fiemap(inode, offset, len);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4644,8 +4664,8 @@
return ret;
}
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
{
int ret = 0;
u64 off;
@@ -4655,12 +4675,12 @@
u64 last;
u64 last_for_get_extent = 0;
u64 disko = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
struct ulist *roots;
struct ulist *tmp_ulist;
@@ -4696,8 +4716,8 @@
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
goto out_free_ulist;
} else {
@@ -4711,7 +4731,7 @@
found_type = found_key.type;
/* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
@@ -4737,7 +4757,7 @@
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ lock_extent_bits(&inode->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent);
@@ -4806,8 +4826,7 @@
* then we're just getting a count and we can skip the
* lookup stuff.
*/
- ret = btrfs_check_shared(root,
- btrfs_ino(BTRFS_I(inode)),
+ ret = btrfs_check_shared(root, btrfs_ino(inode),
bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
@@ -4851,7 +4870,7 @@
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ unlock_extent_cached(&inode->io_tree, start, start + len - 1,
&cached_state);
out_free_ulist:
@@ -4863,11 +4882,10 @@
static void __free_extent_buffer(struct extent_buffer *eb)
{
- btrfs_leak_debug_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
-int extent_buffer_under_io(struct extent_buffer *eb)
+int extent_buffer_under_io(const struct extent_buffer *eb)
{
return (atomic_read(&eb->io_pages) ||
test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
@@ -4909,10 +4927,7 @@
* We need to make sure we haven't be attached
* to a new eb.
*/
- ClearPagePrivate(page);
- set_page_private(page, 0);
- /* One for the page private */
- put_page(page);
+ detach_page_private(page);
}
if (mapped)
@@ -4929,6 +4944,7 @@
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
+ btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
__free_extent_buffer(eb);
}
@@ -4946,11 +4962,12 @@
rwlock_init(&eb->lock);
atomic_set(&eb->blocking_readers, 0);
eb->blocking_writers = 0;
- eb->lock_nested = false;
+ eb->lock_recursed = false;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
- btrfs_leak_debug_add(&eb->leak_list, &buffers);
+ btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
+ &fs_info->allocated_ebs);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
@@ -4973,7 +4990,7 @@
return eb;
}
-struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
int i;
struct page *p;
@@ -5300,6 +5317,7 @@
}
static int release_extent_buffer(struct extent_buffer *eb)
+ __releases(&eb->refs_lock)
{
lockdep_assert_held(&eb->refs_lock);
@@ -5318,6 +5336,7 @@
spin_unlock(&eb->refs_lock);
}
+ btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -5380,7 +5399,7 @@
release_extent_buffer(eb);
}
-void clear_extent_buffer_dirty(struct extent_buffer *eb)
+void clear_extent_buffer_dirty(const struct extent_buffer *eb)
{
int i;
int num_pages;
@@ -5475,7 +5494,6 @@
unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
- struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -5528,20 +5546,19 @@
}
ClearPageError(page);
- err = __extent_read_full_page(tree, page,
- btree_get_extent, &bio,
- mirror_num, &bio_flags,
- REQ_META);
+ err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ page, page_offset(page), PAGE_SIZE, 0,
+ &bio, end_bio_extent_readpage,
+ mirror_num, 0, 0, false);
if (err) {
- ret = err;
/*
- * We use &bio in above __extent_read_full_page,
- * so we ensure that if it returns error, the
- * current page fails to add itself to bio and
- * it's been unlocked.
- *
- * We must dec io_pages by ourselves.
+ * We failed to submit the bio so it's the
+ * caller's responsibility to perform cleanup
+ * i.e unlock page/set error bit.
*/
+ ret = err;
+ SetPageError(page);
+ unlock_page(page);
atomic_dec(&eb->io_pages);
}
} else {
@@ -5576,6 +5593,36 @@
return ret;
}
+static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ btrfs_warn(eb->fs_info,
+ "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ eb->start, eb->len, start, len);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+
+ return true;
+}
+
+/*
+ * Check if the [start, start + len) range is valid before reading/writing
+ * the eb.
+ * NOTE: @start and @len are offset inside the eb, not logical address.
+ *
+ * Caller should not touch the dst/src memory if this function returns error.
+ */
+static inline int check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
+{
+ unsigned long offset;
+
+ /* start, start + len should not go beyond eb->len nor overflow */
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
+ return report_eb_range(eb, start, len);
+
+ return false;
+}
+
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
@@ -5584,17 +5631,12 @@
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
- if (start + len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, len);
- memset(dst, 0, len);
+ if (check_eb_range(eb, start, len))
return;
- }
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
@@ -5619,21 +5661,20 @@
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (probe_user_write(dst, kaddr + offset, cur)) {
+ if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
@@ -5647,48 +5688,6 @@
return ret;
}
-/*
- * return 0 if the item is found within a page.
- * return 1 if the item spans two pages.
- * return -EINVAL otherwise.
- */
-int map_private_extent_buffer(const struct extent_buffer *eb,
- unsigned long start, unsigned long min_len,
- char **map, unsigned long *map_start,
- unsigned long *map_len)
-{
- size_t offset;
- char *kaddr;
- struct page *p;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
- unsigned long end_i = (start_offset + start + min_len - 1) >>
- PAGE_SHIFT;
-
- if (start + min_len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, min_len);
- return -EINVAL;
- }
-
- if (i != end_i)
- return 1;
-
- if (i == 0) {
- offset = start_offset;
- *map_start = 0;
- } else {
- offset = 0;
- *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
- }
-
- p = eb->pages[i];
- kaddr = page_address(p);
- *map = kaddr + offset;
- *map_len = PAGE_SIZE - offset;
- return 0;
-}
-
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
@@ -5697,14 +5696,13 @@
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
int ret = 0;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return -EINVAL;
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
@@ -5724,7 +5722,7 @@
return ret;
}
-void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *srcv)
{
char *kaddr;
@@ -5735,7 +5733,7 @@
BTRFS_FSID_SIZE);
}
-void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
+void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
{
char *kaddr;
@@ -5745,7 +5743,7 @@
BTRFS_FSID_SIZE);
}
-void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
unsigned long start, unsigned long len)
{
size_t cur;
@@ -5753,13 +5751,12 @@
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
@@ -5776,20 +5773,19 @@
}
}
-void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len)
{
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
@@ -5805,8 +5801,8 @@
}
}
-void copy_extent_buffer_full(struct extent_buffer *dst,
- struct extent_buffer *src)
+void copy_extent_buffer_full(const struct extent_buffer *dst,
+ const struct extent_buffer *src)
{
int i;
int num_pages;
@@ -5819,7 +5815,8 @@
page_address(src->pages[i]));
}
-void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+void copy_extent_buffer(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
@@ -5828,12 +5825,15 @@
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = offset_in_page(dst->start);
- unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
+ unsigned long i = dst_offset >> PAGE_SHIFT;
+
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(src, src_offset, len))
+ return;
WARN_ON(src->len != dst_len);
- offset = offset_in_page(start_offset + dst_offset);
+ offset = offset_in_page(dst_offset);
while (len > 0) {
page = dst->pages[i];
@@ -5864,12 +5864,11 @@
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
*/
-static inline void eb_bitmap_offset(struct extent_buffer *eb,
+static inline void eb_bitmap_offset(const struct extent_buffer *eb,
unsigned long start, unsigned long nr,
unsigned long *page_index,
size_t *page_offset)
{
- size_t start_offset = offset_in_page(eb->start);
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -5878,7 +5877,7 @@
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start_offset + start + byte_offset;
+ offset = start + byte_offset;
*page_index = offset >> PAGE_SHIFT;
*page_offset = offset_in_page(offset);
@@ -5890,7 +5889,7 @@
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number to test
*/
-int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
u8 *kaddr;
@@ -5912,7 +5911,7 @@
* @pos: bit number of the first bit
* @len: number of bits to set
*/
-void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len)
{
u8 *kaddr;
@@ -5954,8 +5953,9 @@
* @pos: bit number of the first bit
* @len: number of bits to clear
*/
-void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
- unsigned long pos, unsigned long len)
+void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ unsigned long start, unsigned long pos,
+ unsigned long len)
{
u8 *kaddr;
struct page *page;
@@ -6016,36 +6016,26 @@
memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
-void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len)
+void memcpy_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
- size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu dst len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu dst len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
while (len > 0) {
- dst_off_in_page = offset_in_page(start_offset + dst_offset);
- src_off_in_page = offset_in_page(start_offset + src_offset);
+ dst_off_in_page = offset_in_page(dst_offset);
+ src_off_in_page = offset_in_page(src_offset);
- dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
- src_i = (start_offset + src_offset) >> PAGE_SHIFT;
+ dst_i = dst_offset >> PAGE_SHIFT;
+ src_i = src_offset >> PAGE_SHIFT;
cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
@@ -6061,41 +6051,31 @@
}
}
-void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len)
+void memmove_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_end = dst_offset + len - 1;
unsigned long src_end = src_offset + len - 1;
- size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
}
while (len > 0) {
- dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
- src_i = (start_offset + src_end) >> PAGE_SHIFT;
+ dst_i = dst_end >> PAGE_SHIFT;
+ src_i = src_end >> PAGE_SHIFT;
- dst_off_in_page = offset_in_page(start_offset + dst_end);
- src_off_in_page = offset_in_page(start_offset + src_end);
+ dst_off_in_page = offset_in_page(dst_end);
+ src_off_in_page = offset_in_page(src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);