Update Linux to v5.10.157
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.157.tar.xz
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
Change-Id: I7b30d9e98d8c465d6b44de8e7433b4a40b3289ba
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index baff31a..6942707 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -137,6 +137,7 @@
u64 root_objectid;
u64 inum;
int share_count;
+ bool have_delayed_delete_refs;
};
static inline int extent_is_shared(struct share_check *sc)
@@ -287,8 +288,10 @@
struct prelim_ref *ref, *next_ref;
rbtree_postorder_for_each_entry_safe(ref, next_ref,
- &preftree->root.rb_root, rbnode)
+ &preftree->root.rb_root, rbnode) {
+ free_inode_elem_list(ref->inode_list);
free_pref(ref);
+ }
preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
@@ -646,6 +649,18 @@
return (struct extent_inode_elem *)(uintptr_t)node->aux;
}
+static void free_leaf_list(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(ulist, &uiter)))
+ free_inode_elem_list(unode_aux_to_inode_list(node));
+
+ ulist_free(ulist);
+}
+
/*
* We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
@@ -760,7 +775,11 @@
cond_resched();
}
out:
- ulist_free(parents);
+ /*
+ * We may have inode lists attached to refs in the parents ulist, so we
+ * must free them before freeing the ulist and its refs.
+ */
+ free_leaf_list(parents);
return ret;
}
@@ -817,16 +836,11 @@
struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
- struct btrfs_key tmp_op_key;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-
spin_lock(&head->lock);
for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -852,10 +866,16 @@
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key *key_ptr = NULL;
+
+ if (head->extent_op && head->extent_op->update_key) {
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+ key_ptr = &key;
+ }
ref = btrfs_delayed_node_to_tree_ref(node);
ret = add_indirect_ref(fs_info, preftrees, ref->root,
- &tmp_op_key, ref->level + 1,
+ key_ptr, ref->level + 1,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -881,13 +901,22 @@
key.offset = ref->offset;
/*
- * Found a inum that doesn't match our known inum, we
- * know it's shared.
+ * If we have a share check context and a reference for
+ * another inode, we can't exit immediately. This is
+ * because even if this is a BTRFS_ADD_DELAYED_REF
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
+ * which cancels out this ADD reference.
+ *
+ * If this is a DROP reference and there was no previous
+ * ADD reference, then we need to signal that when we
+ * process references from the extent tree (through
+ * add_inline_refs() and add_keyed_refs()), we should
+ * not exit early if we find a reference for another
+ * inode, because one of the delayed DROP references
+ * may cancel that reference in the extent tree.
*/
- if (sc && sc->inum && ref->objectid != sc->inum) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
+ if (sc && count < 0)
+ sc->have_delayed_delete_refs = true;
ret = add_indirect_ref(fs_info, preftrees, ref->root,
&key, 0, node->bytenr, count, sc,
@@ -917,7 +946,7 @@
}
if (!ret)
ret = extent_is_shared(sc);
-out:
+
spin_unlock(&head->lock);
return ret;
}
@@ -1020,7 +1049,8 @@
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1030,6 +1060,7 @@
ret = add_indirect_ref(fs_info, preftrees, root,
&key, 0, bytenr, count,
sc, GFP_NOFS);
+
break;
}
default:
@@ -1119,7 +1150,8 @@
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1358,6 +1390,12 @@
if (ret < 0)
goto out;
ref->inode_list = eie;
+ /*
+ * We transferred the list ownership to the ref,
+ * so set to NULL to avoid a double free in case
+ * an error happens after this.
+ */
+ eie = NULL;
}
ret = ulist_add_merge_ptr(refs, ref->parent,
ref->inode_list,
@@ -1383,6 +1421,14 @@
eie->next = ref->inode_list;
}
eie = NULL;
+ /*
+ * We have transferred the inode list ownership from
+ * this ref to the ref we added to the 'refs' ulist.
+ * So set this ref's inode list to NULL to avoid
+ * use-after-free when our caller uses it or double
+ * frees in case an error happens before we return.
+ */
+ ref->inode_list = NULL;
}
cond_resched();
}
@@ -1399,24 +1445,6 @@
return ret;
}
-static void free_leaf_list(struct ulist *blocks)
-{
- struct ulist_node *node = NULL;
- struct extent_inode_elem *eie;
- struct ulist_iterator uiter;
-
- ULIST_ITER_INIT(&uiter);
- while ((node = ulist_next(blocks, &uiter))) {
- if (!node->aux)
- continue;
- eie = unode_aux_to_inode_list(node);
- free_inode_elem_list(eie);
- node->aux = 0;
- }
-
- ulist_free(blocks);
-}
-
/*
* Finds all leafs with a reference to the specified combination of bytenr and
* offset. key_list_head will point to a list of corresponding keys (caller must
@@ -1542,6 +1570,7 @@
.root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
+ .have_delayed_delete_refs = false,
};
ulist_init(roots);
@@ -1576,6 +1605,7 @@
break;
bytenr = node->val;
shared.share_count = 0;
+ shared.have_delayed_delete_refs = false;
cond_resched();
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c99e293..889a598 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2570,7 +2570,6 @@
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2636,7 +2635,6 @@
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -2737,7 +2735,6 @@
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -2795,7 +2792,6 @@
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -3130,6 +3126,7 @@
* attempt.
*/
wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d297804..be6935d 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -161,7 +161,7 @@
if (btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
btrfs_err(fs_info,
- "replace devid present without an active replace item");
+"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
} else {
dev_replace->srcdev = NULL;
@@ -954,8 +954,7 @@
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
- ret = btrfs_scrub_cancel(fs_info);
- ASSERT(ret != -ENOTCONN);
+ btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a5bcad0..f2abd8b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1596,9 +1596,10 @@
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
- btrfs_put_root(root);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
+ btrfs_put_root(root);
goto again;
+ }
goto fail;
}
return root;
@@ -3060,7 +3061,7 @@
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (%llx)",
+ "cannot mount because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -3098,11 +3099,25 @@
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (%llx)",
+ "cannot mount read-write because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
}
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata write, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (unlikely(features && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ features);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
@@ -4090,6 +4105,31 @@
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ /*
+ * After we parked the cleaner kthread, ordered extents may have
+ * completed and created new delayed iputs. If one of the async reclaim
+ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+ * can hang forever trying to stop it, because if a delayed iput is
+ * added after it ran btrfs_run_delayed_iputs() and before it called
+ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+ * no one else to run iputs.
+ *
+ * So wait for all ongoing ordered extents to complete and then run
+ * delayed iputs. This works because once we reach this point no one
+ * can create new ordered extents or create delayed iputs
+ * through some other means.
+ *
+ * Also note that btrfs_wait_ordered_roots() is not safe here, because
+ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+ * but the delayed iput for the respective inode is made only when doing
+ * the final btrfs_put_ordered_extent() (which must happen at
+ * btrfs_finish_ordered_io() when we are unmounting).
+ */
+ btrfs_flush_workqueue(fs_info->endio_write_workers);
+ /* Ordered extents for free space inodes. */
+ btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ btrfs_run_delayed_iputs(fs_info);
+
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1a8d419..bfa2bf4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -58,7 +58,7 @@
}
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index f32f411..5afb7ca 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -19,7 +19,7 @@
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation);
struct dentry *btrfs_get_parent(struct dentry *child);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f39d02e..16f44bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -121,7 +121,7 @@
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- unsigned int bytes_changed;
+ u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f59ec55..416a1b7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2833,8 +2833,9 @@
return ret;
}
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
@@ -2866,6 +2867,10 @@
goto out_only_mutex;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out_only_mutex;
+
lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
@@ -3301,7 +3306,7 @@
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return btrfs_punch_hole(inode, offset, len);
+ return btrfs_punch_hole(file, offset, len);
/*
* Only trigger disk allocation, don't trigger qgroup reserve
@@ -3323,6 +3328,10 @@
goto out;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
/*
* TODO: Move these two operations after we have checked
* accurate reserved space, or fallocate can still fail but
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1d9262a..779b774 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -995,7 +995,6 @@
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
- WARN_ON_ONCE(1);
ret = -EINVAL;
goto out_unlock;
}
@@ -4023,6 +4022,13 @@
dest->root_key.objectid);
return -EPERM;
}
+ if (atomic_read(&dest->nr_swapfiles)) {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu with active swapfile",
+ root->root_key.objectid);
+ return -EPERM;
+ }
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
@@ -7474,7 +7480,19 @@
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
- ret = -ENOTBLK;
+ /*
+ * If we are in a NOWAIT context, return -EAGAIN in order to
+ * fallback to buffered IO. This is not only because we can
+ * block with buffered IO (no support for NOWAIT semantics at
+ * the moment) but also to avoid returning short reads to user
+ * space - this happens if we were able to read some data from
+ * previous non-compressed extents and then when we fallback to
+ * buffered IO, at btrfs_file_read_iter() by calling
+ * filemap_read(), we fail to fault in pages for the read buffer,
+ * in which case filemap_read() returns a short read (the number
+ * of bytes previously read is > 0, so it does not return -EFAULT).
+ */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
goto unlock_err;
}
@@ -10215,8 +10233,23 @@
* set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents.
+ *
+ * It is possible that subvolume is marked for deletion but still not
+ * removed yet. To prevent this race, we check the root status before
+ * activating the swapfile.
*/
+ spin_lock(&root->root_item_lock);
+ if (btrfs_root_dead(root)) {
+ spin_unlock(&root->root_item_lock);
+
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because subvolume %llu is being deleted",
+ root->root_key.objectid);
+ return -EPERM;
+ }
atomic_inc(&root->nr_swapfiles);
+ spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b5e9bfe..d0c3165 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2811,6 +2811,8 @@
}
}
+ btrfs_free_path(path);
+ path = NULL;
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
ret = -EFAULT;
@@ -2903,6 +2905,8 @@
}
out:
+ btrfs_free_path(path);
+
if (!ret || ret == -EOVERFLOW) {
rootrefs->num_items = found;
/* update min_treeid for next search */
@@ -2914,7 +2918,6 @@
}
kfree(rootrefs);
- btrfs_free_path(path);
return ret;
}
@@ -3878,6 +3881,8 @@
ipath->fspath->val[i] = rel_ptr;
}
+ btrfs_free_path(path);
+ path = NULL;
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
ipath->fspath, size);
if (ret) {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a02e38f..36da775 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1158,6 +1158,21 @@
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ } else {
+ /*
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+ * -EINPROGRESS. That can happen because someone started the
+ * rescan worker by calling quota rescan ioctl before we
+ * attempted to initialize the rescan worker. Failure due to
+ * quotas disabled in the meanwhile is not possible, because
+ * we are holding a write lock on fs_info->subvol_sem, which
+ * is also acquired when disabling quotas.
+ * Ignore such error, and any other error would need to undo
+ * everything we did in the transaction we just committed.
+ */
+ ASSERT(ret == -EINPROGRESS);
+ ret = 0;
}
out_free_path:
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index e65d0fa..9678d7f 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -332,6 +332,9 @@
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap,
+ dest->stripe_npages);
dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -874,6 +877,12 @@
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * Do this before unlock_stripe() so there will be no new bio
+ * for this bio.
+ */
+ bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -1207,6 +1216,9 @@
else
BUG();
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages));
+
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
* recalculate the parity and write the new results.
@@ -1280,6 +1292,11 @@
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1304,6 +1321,11 @@
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1729,6 +1751,33 @@
run_plug(plug);
}
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bbio->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * rbio->stripe_len);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ PAGE_SHIFT) % rbio->stripe_npages;
+
+ set_bit(bit, rbio->dbitmap);
+ }
+}
+
/*
* our main entry point for writes from the rest of the FS.
*/
@@ -1745,9 +1794,8 @@
btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
+ rbio_add_bio(rbio, bio);
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
@@ -2046,9 +2094,12 @@
atomic_set(&rbio->error, 0);
/*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However, this time we will
+ * not trust any cached sector.
+ * We may have read out some stale data in a part that the higher
+ * layer is not reading.
+ *
+ * So here we always re-read everything in the recovery path.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
if (rbio->faila == stripe || rbio->failb == stripe) {
@@ -2057,16 +2108,6 @@
}
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
- continue;
-
ret = rbio_add_io_page(rbio, &bio_list,
rbio_stripe_page(rbio, stripe, pagenr),
stripe, pagenr, rbio->stripe_len);
@@ -2144,8 +2185,7 @@
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 3a3102b..4b3ae0f 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -503,8 +503,11 @@
*/
ASSERT(key.offset == 0);
ASSERT(datal <= fs_info->sectorsize);
- if (key.offset != 0 || datal > fs_info->sectorsize)
- return -EUCLEAN;
+ if (WARN_ON(key.offset != 0) ||
+ WARN_ON(datal > fs_info->sectorsize)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = clone_copy_inline_extent(inode, path, &new_key,
drop_start, datal, size,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index db37a37..e9e8ca4 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -336,9 +336,10 @@
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ if (ret < 0) {
+ err = ret;
goto out;
- if (ret == 0) {
+ } else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 0392c55..88b9a53 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3811,6 +3811,7 @@
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ bool need_commit = false;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -3924,6 +3925,12 @@
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
@@ -3932,6 +3939,16 @@
mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+ * Super block errors found, but we can not commit transaction
+ * at current context, since btrfs_commit_transaction() needs
+ * to pause the currently running scrub (held by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
}
if (!ret)
@@ -3958,6 +3975,25 @@
scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
return ret;
out:
scrub_workers_put(fs_info);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2663485..8bf8cdb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -652,6 +652,8 @@
compress_force = false;
no_compress++;
} else {
+ btrfs_err(info, "unrecognized compression value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -710,8 +712,11 @@
case Opt_thread_pool:
ret = match_int(&args[0], &intarg);
if (ret) {
+ btrfs_err(info, "unrecognized thread_pool value %s",
+ args[0].from);
goto out;
} else if (intarg == 0) {
+ btrfs_err(info, "invalid value 0 for thread_pool");
ret = -EINVAL;
goto out;
}
@@ -772,8 +777,11 @@
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized metadata_ratio value %s",
+ args[0].from);
goto out;
+ }
info->metadata_ratio = intarg;
btrfs_info(info, "metadata ratio %u",
info->metadata_ratio);
@@ -790,6 +798,8 @@
btrfs_set_and_info(info, DISCARD_ASYNC,
"turning on async discard");
} else {
+ btrfs_err(info, "unrecognized discard mode value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -814,6 +824,8 @@
btrfs_set_and_info(info, FREE_SPACE_TREE,
"enabling free space tree");
} else {
+ btrfs_err(info, "unrecognized space_cache value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -889,8 +901,12 @@
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info,
+ "unrecognized check_integrity_print_mask value %s",
+ args[0].from);
goto out;
+ }
info->check_integrity_print_mask = intarg;
btrfs_info(info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
@@ -905,13 +921,15 @@
goto out;
#endif
case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0)
+ if (strcmp(args[0].from, "panic") == 0) {
btrfs_set_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else if (strcmp(args[0].from, "bug") == 0)
+ } else if (strcmp(args[0].from, "bug") == 0) {
btrfs_clear_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else {
+ } else {
+ btrfs_err(info, "unrecognized fatal_errors value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -919,8 +937,12 @@
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized commit_interval value %s",
+ args[0].from);
+ ret = -EINVAL;
goto out;
+ }
if (intarg == 0) {
btrfs_info(info,
"using default commit interval %us",
@@ -934,8 +956,11 @@
break;
case Opt_rescue:
ret = parse_rescue_options(info, args[0].from);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_err(info, "unrecognized rescue value %s",
+ args[0].from);
goto out;
+ }
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 3bb6b68..ecf1902 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1767,8 +1767,11 @@
#ifdef CONFIG_BTRFS_DEBUG
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
- if (ret)
- goto out2;
+ if (ret) {
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ goto out_remove_group;
+ }
#endif
return 0;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 999c14e..0599566 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -192,7 +192,7 @@
void btrfs_free_dummy_root(struct btrfs_root *root)
{
- if (!root)
+ if (IS_ERR_OR_NULL(root))
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ce1ca8e..289366c 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -230,21 +230,21 @@
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -256,31 +256,33 @@
return ret;
}
+ /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */
+ old_roots = NULL;
+ new_roots = NULL;
+
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
- old_roots = NULL;
- new_roots = NULL;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_item(root, nodesize, nodesize);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return -EINVAL;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -331,21 +333,21 @@
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -366,21 +368,21 @@
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = add_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -407,21 +409,21 @@
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 62784b9..9a8dc16 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1075,7 +1075,9 @@
extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
inode_objectid, parent_objectid, 0,
0);
- if (!IS_ERR_OR_NULL(extref)) {
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
@@ -5335,6 +5337,18 @@
}
/*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because we create the
+ * inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
@@ -5724,7 +5738,7 @@
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
log_mode, ctx);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e462de9..d4d89e0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -540,15 +540,47 @@
return ret;
}
-static bool device_path_matched(const char *path, struct btrfs_device *device)
+/*
+ * Check if the device in the path matches the device in the given struct device.
+ *
+ * Returns:
+ * true If it is the same device.
+ * false If it is not the same device or on error.
+ */
+static bool device_matched(const struct btrfs_device *device, const char *path)
{
- int found;
+ char *device_name;
+ struct block_device *bdev_old;
+ struct block_device *bdev_new;
+
+ /*
+ * If we are looking for a device with the matching dev_t, then skip
+ * device without a name (a missing device).
+ */
+ if (!device->name)
+ return false;
+
+ device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
+ if (!device_name)
+ return false;
rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
+ scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
rcu_read_unlock();
- return found == 0;
+ bdev_old = lookup_bdev(device_name);
+ kfree(device_name);
+ if (IS_ERR(bdev_old))
+ return false;
+
+ bdev_new = lookup_bdev(path);
+ if (IS_ERR(bdev_new))
+ return false;
+
+ if (bdev_old == bdev_new)
+ return true;
+
+ return false;
}
/*
@@ -581,9 +613,7 @@
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
- if (path && !device->name)
- continue;
- if (path && !device_path_matched(path, device))
+ if (path && !device_matched(device, path))
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
@@ -4220,10 +4250,12 @@
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
return ret;
}
@@ -7189,12 +7221,12 @@
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
- btrfs_err(fs_info,
- "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
- ret = -EINVAL;
- goto error;
+ fs_info->fs_devices->total_devices = total_dev;
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f1a60bc..cd6049b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -389,6 +389,9 @@
const char *name, const void *buffer,
size_t size, int flags)
{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
name = xattr_full_name(handler, name);
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}