Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 0aa1bee..520a0f6 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,12 +5,16 @@
select CRYPTO
select CRYPTO_CRC32C
select LIBCRC32C
+ select CRYPTO_XXHASH
+ select CRYPTO_SHA256
+ select CRYPTO_BLAKE2B
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
select LZO_DECOMPRESS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
+ select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
select SRCU
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 82200db..e738f62 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -11,7 +11,7 @@
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
- block-rsv.o delalloc-space.o block-group.o
+ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 11be024..43c8995 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -53,14 +53,12 @@
struct __btrfs_workqueue *high;
};
-struct btrfs_fs_info *
-btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
{
return wq->fs_info;
}
-struct btrfs_fs_info *
-btrfs_work_owner(const struct btrfs_work *work)
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
{
return work->wq->fs_info;
}
@@ -226,7 +224,6 @@
struct btrfs_work *work;
spinlock_t *lock = &wq->list_lock;
unsigned long flags;
- void *wtag;
bool free_self = false;
while (1) {
@@ -237,6 +234,13 @@
ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
+ /*
+ * Orders all subsequent loads after reading WORK_DONE_BIT,
+ * paired with the smp_mb__before_atomic in btrfs_work_helper
+ * this guarantees that the ordered function will see all
+ * updates from ordinary work function.
+ */
+ smp_rmb();
/*
* we are going to call the ordered done function, but
@@ -281,21 +285,19 @@
} else {
/*
* We don't want to call the ordered free functions with
- * the lock held though. Save the work as tag for the
- * trace event, because the callback could free the
- * structure.
+ * the lock held.
*/
- wtag = work;
work->ordered_free(work);
- trace_btrfs_all_work_done(wq->fs_info, wtag);
+ /* NB: work must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, work);
}
}
spin_unlock_irqrestore(lock, flags);
if (free_self) {
- wtag = self;
self->ordered_free(self);
- trace_btrfs_all_work_done(wq->fs_info, wtag);
+ /* NB: self must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, self);
}
}
@@ -304,7 +306,6 @@
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
struct __btrfs_workqueue *wq;
- void *wtag;
int need_order = 0;
/*
@@ -318,18 +319,24 @@
if (work->ordered_func)
need_order = 1;
wq = work->wq;
- /* Safe for tracepoints in case work gets freed by the callback */
- wtag = work;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
+ /*
+ * Ensures all memory accesses done in the work function are
+ * ordered before setting the WORK_DONE_BIT. Ensuring the thread
+ * which is going to executed the ordered work sees them.
+ * Pairs with the smp_rmb in run_ordered_work.
+ */
+ smp_mb__before_atomic();
set_bit(WORK_DONE_BIT, &work->flags);
run_ordered_work(wq, work);
+ } else {
+ /* NB: work must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, work);
}
- if (!need_order)
- trace_btrfs_all_work_done(wq->fs_info, wtag);
}
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 714ab68..3204daa 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -41,8 +41,8 @@
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
void btrfs_set_work_high_priority(struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7f644a5..baff31a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -13,6 +13,7 @@
#include "transaction.h"
#include "delayed-ref.h"
#include "locking.h"
+#include "misc.h"
/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@@ -391,7 +392,7 @@
struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct prelim_ref *ref = NULL;
- struct prelim_ref target = {0};
+ struct prelim_ref target = {};
int result;
target.parent = bytenr;
@@ -537,29 +538,36 @@
const u64 *extent_item_pos, bool ignore_offset)
{
struct btrfs_root *root;
- struct btrfs_key root_key;
struct extent_buffer *eb;
int ret = 0;
int root_level;
int level = ref->level;
- int index;
struct btrfs_key search_key = ref->key_for_search;
- root_key.objectid = ref->root_id;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- root = btrfs_get_fs_root(fs_info, &root_key, false);
+ /*
+ * If we're search_commit_root we could possibly be holding locks on
+ * other tree nodes. This happens when qgroups does backref walks when
+ * adding new delayed refs. To deal with this we need to look in cache
+ * for the root, and if we don't find it then we need to search the
+ * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
+ * here.
+ */
+ if (path->search_commit_root)
+ root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id);
+ else
+ root = btrfs_get_fs_root(fs_info, ref->root_id, false);
if (IS_ERR(root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(root);
+ goto out_free;
+ }
+
+ if (!path->search_commit_root &&
+ test_bit(BTRFS_ROOT_DELETING, &root->state)) {
+ ret = -ENOENT;
goto out;
}
if (btrfs_is_testing(fs_info)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -ENOENT;
goto out;
}
@@ -571,10 +579,8 @@
else
root_level = btrfs_old_root_level(root, time_seq);
- if (root_level + 1 == level) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ if (root_level + 1 == level)
goto out;
- }
/*
* We can often find data backrefs with an offset that is too large
@@ -604,9 +610,6 @@
else
ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
- /* root node has been locked, we can release @subvol_srcu safely here */
- srcu_read_unlock(&fs_info->subvol_srcu, index);
-
btrfs_debug(fs_info,
"search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
ref->root_id, level, ref->count, ret,
@@ -628,6 +631,8 @@
ret = add_all_parents(root, path, parents, preftrees, ref, level,
time_seq, extent_item_pos, ignore_offset);
out:
+ btrfs_put_root(root);
+out_free:
path->lowest_level = 0;
btrfs_release_path(path);
return ret;
@@ -1208,7 +1213,12 @@
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /* This shouldn't happen, indicates a bug or fs corruption. */
+ ASSERT(ret != 0);
+ ret = -EUCLEAN;
+ goto out;
+ }
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
@@ -1356,10 +1366,18 @@
goto out;
if (!ret && extent_item_pos) {
/*
- * we've recorded that parent, so we must extend
- * its inode list here
+ * We've recorded that parent, so we must extend
+ * its inode list here.
+ *
+ * However if there was corruption we may not
+ * have found an eie, return an error in this
+ * case.
*/
- BUG_ON(!eie);
+ ASSERT(eie);
+ if (!eie) {
+ ret = -EUCLEAN;
+ goto out;
+ }
while (eie->next)
eie = eie->next;
eie->next = ref->inode_list;
@@ -1407,10 +1425,10 @@
*
* returns 0 on success, <0 on error
*/
-static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **leafs,
- const u64 *extent_item_pos, bool ignore_offset)
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos, bool ignore_offset)
{
int ret;
@@ -2298,3 +2316,824 @@
kvfree(ipath->fspath);
kfree(ipath);
}
+
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(
+ struct btrfs_fs_info *fs_info, gfp_t gfp_flag)
+{
+ struct btrfs_backref_iter *ret;
+
+ ret = kzalloc(sizeof(*ret), gfp_flag);
+ if (!ret)
+ return NULL;
+
+ ret->path = btrfs_alloc_path();
+ if (!ret->path) {
+ kfree(ret);
+ return NULL;
+ }
+
+ /* Current backref iterator only supports iteration in commit root */
+ ret->path->search_commit_root = 1;
+ ret->path->skip_locking = 1;
+ ret->fs_info = fs_info;
+
+ return ret;
+}
+
+int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
+{
+ struct btrfs_fs_info *fs_info = iter->fs_info;
+ struct btrfs_path *path = iter->path;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+ iter->bytenr = bytenr;
+
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ ret = -EUCLEAN;
+ goto release;
+ }
+ if (path->slots[0] == 0) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EUCLEAN;
+ goto release;
+ }
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if ((key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) {
+ ret = -ENOENT;
+ goto release;
+ }
+ memcpy(&iter->cur_key, &key, sizeof(key));
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->end_ptr = (u32)(iter->item_ptr +
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]));
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+
+ /*
+ * Only support iteration on tree backref yet.
+ *
+ * This is an extra precaution for non skinny-metadata, where
+ * EXTENT_ITEM is also used for tree blocks, that we can only use
+ * extent flags to determine if it's a tree block.
+ */
+ if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) {
+ ret = -ENOTSUPP;
+ goto release;
+ }
+ iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei));
+
+ /* If there is no inline backref, go search for keyed backref */
+ if (iter->cur_ptr >= iter->end_ptr) {
+ ret = btrfs_next_item(fs_info->extent_root, path);
+
+ /* No inline nor keyed ref */
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto release;
+ }
+ if (ret < 0)
+ goto release;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key,
+ path->slots[0]);
+ if (iter->cur_key.objectid != bytenr ||
+ (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
+ iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) {
+ ret = -ENOENT;
+ goto release;
+ }
+ iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->item_ptr = iter->cur_ptr;
+ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr(
+ path->nodes[0], path->slots[0]));
+ }
+
+ return 0;
+release:
+ btrfs_backref_iter_release(iter);
+ return ret;
+}
+
+/*
+ * Go to the next backref item of current bytenr, can be either inlined or
+ * keyed.
+ *
+ * Caller needs to check whether it's inline ref or not by iter->cur_key.
+ *
+ * Return 0 if we get next backref without problem.
+ * Return >0 if there is no extra backref for this bytenr.
+ * Return <0 if there is something wrong happened.
+ */
+int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
+{
+ struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+ struct btrfs_path *path = iter->path;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+ u32 size;
+
+ if (btrfs_backref_iter_is_inline_ref(iter)) {
+ /* We're still inside the inline refs */
+ ASSERT(iter->cur_ptr < iter->end_ptr);
+
+ if (btrfs_backref_has_tree_block_info(iter)) {
+ /* First tree block info */
+ size = sizeof(struct btrfs_tree_block_info);
+ } else {
+ /* Use inline ref type to determine the size */
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)
+ ((unsigned long)iter->cur_ptr);
+ type = btrfs_extent_inline_ref_type(eb, iref);
+
+ size = btrfs_extent_inline_ref_size(type);
+ }
+ iter->cur_ptr += size;
+ if (iter->cur_ptr < iter->end_ptr)
+ return 0;
+
+ /* All inline items iterated, fall through */
+ }
+
+ /* We're at keyed items, there is no inline item, go to the next one */
+ ret = btrfs_next_item(iter->fs_info->extent_root, iter->path);
+ if (ret)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]);
+ if (iter->cur_key.objectid != iter->bytenr ||
+ (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY &&
+ iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY))
+ return 1;
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->cur_ptr = iter->item_ptr;
+ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ return 0;
+}
+
+void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_backref_cache *cache, int is_reloc)
+{
+ int i;
+
+ cache->rb_root = RB_ROOT;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&cache->pending[i]);
+ INIT_LIST_HEAD(&cache->changed);
+ INIT_LIST_HEAD(&cache->detached);
+ INIT_LIST_HEAD(&cache->leaves);
+ INIT_LIST_HEAD(&cache->pending_edge);
+ INIT_LIST_HEAD(&cache->useless_node);
+ cache->fs_info = fs_info;
+ cache->is_reloc = is_reloc;
+}
+
+struct btrfs_backref_node *btrfs_backref_alloc_node(
+ struct btrfs_backref_cache *cache, u64 bytenr, int level)
+{
+ struct btrfs_backref_node *node;
+
+ ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL);
+ node = kzalloc(sizeof(*node), GFP_NOFS);
+ if (!node)
+ return node;
+
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->upper);
+ INIT_LIST_HEAD(&node->lower);
+ RB_CLEAR_NODE(&node->rb_node);
+ cache->nr_nodes++;
+ node->level = level;
+ node->bytenr = bytenr;
+
+ return node;
+}
+
+struct btrfs_backref_edge *btrfs_backref_alloc_edge(
+ struct btrfs_backref_cache *cache)
+{
+ struct btrfs_backref_edge *edge;
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (edge)
+ cache->nr_edges++;
+ return edge;
+}
+
+/*
+ * Drop the backref node from cache, also cleaning up all its
+ * upper edges and any uncached nodes in the path.
+ *
+ * This cleanup happens bottom up, thus the node should either
+ * be the lowest node in the cache or a detached node.
+ */
+void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_edge *edge;
+
+ if (!node)
+ return;
+
+ BUG_ON(!node->lowest && !node->detached);
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next, struct btrfs_backref_edge,
+ list[LOWER]);
+ upper = edge->node[UPPER];
+ list_del(&edge->list[LOWER]);
+ list_del(&edge->list[UPPER]);
+ btrfs_backref_free_edge(cache, edge);
+
+ /*
+ * Add the node to leaf node list if no other child block
+ * cached.
+ */
+ if (list_empty(&upper->lower)) {
+ list_add_tail(&upper->lower, &cache->leaves);
+ upper->lowest = 1;
+ }
+ }
+
+ btrfs_backref_drop_node(cache, node);
+}
+
+/*
+ * Release all nodes/edges from current cache
+ */
+void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
+{
+ struct btrfs_backref_node *node;
+ int i;
+
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct btrfs_backref_node, list);
+ btrfs_backref_cleanup_node(cache, node);
+ }
+
+ while (!list_empty(&cache->leaves)) {
+ node = list_entry(cache->leaves.next,
+ struct btrfs_backref_node, lower);
+ btrfs_backref_cleanup_node(cache, node);
+ }
+
+ cache->last_trans = 0;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ ASSERT(list_empty(&cache->pending[i]));
+ ASSERT(list_empty(&cache->pending_edge));
+ ASSERT(list_empty(&cache->useless_node));
+ ASSERT(list_empty(&cache->changed));
+ ASSERT(list_empty(&cache->detached));
+ ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
+ ASSERT(!cache->nr_nodes);
+ ASSERT(!cache->nr_edges);
+}
+
+/*
+ * Handle direct tree backref
+ *
+ * Direct tree backref means, the backref item shows its parent bytenr
+ * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
+ *
+ * @ref_key: The converted backref key.
+ * For keyed backref, it's the item key.
+ * For inlined backref, objectid is the bytenr,
+ * type is btrfs_inline_ref_type, offset is
+ * btrfs_inline_ref_offset.
+ */
+static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
+ struct btrfs_key *ref_key,
+ struct btrfs_backref_node *cur)
+{
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *upper;
+ struct rb_node *rb_node;
+
+ ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);
+
+ /* Only reloc root uses backref pointing to itself */
+ if (ref_key->objectid == ref_key->offset) {
+ struct btrfs_root *root;
+
+ cur->is_reloc_root = 1;
+ /* Only reloc backref cache cares about a specific root */
+ if (cache->is_reloc) {
+ root = find_reloc_root(cache->fs_info, cur->bytenr);
+ if (!root)
+ return -ENOENT;
+ cur->root = root;
+ } else {
+ /*
+ * For generic purpose backref cache, reloc root node
+ * is useless.
+ */
+ list_add(&cur->list, &cache->useless_node);
+ }
+ return 0;
+ }
+
+ edge = btrfs_backref_alloc_edge(cache);
+ if (!edge)
+ return -ENOMEM;
+
+ rb_node = rb_simple_search(&cache->rb_root, ref_key->offset);
+ if (!rb_node) {
+ /* Parent node not yet cached */
+ upper = btrfs_backref_alloc_node(cache, ref_key->offset,
+ cur->level + 1);
+ if (!upper) {
+ btrfs_backref_free_edge(cache, edge);
+ return -ENOMEM;
+ }
+
+ /*
+ * Backrefs for the upper level block isn't cached, add the
+ * block to pending list
+ */
+ list_add_tail(&edge->list[UPPER], &cache->pending_edge);
+ } else {
+ /* Parent node already cached */
+ upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
+ ASSERT(upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
+ return 0;
+}
+
+/*
+ * Handle indirect tree backref
+ *
+ * Indirect tree backref means, we only know which tree the node belongs to.
+ * We still need to do a tree search to find out the parents. This is for
+ * TREE_BLOCK_REF backref (keyed or inlined).
+ *
+ * @ref_key: The same as @ref_key in handle_direct_tree_backref()
+ * @tree_key: The first key of this tree block.
+ * @path: A clean (released) path, to avoid allocating path everytime
+ * the function get called.
+ */
+static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
+ struct btrfs_path *path,
+ struct btrfs_key *ref_key,
+ struct btrfs_key *tree_key,
+ struct btrfs_backref_node *cur)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_node *lower;
+ struct btrfs_backref_edge *edge;
+ struct extent_buffer *eb;
+ struct btrfs_root *root;
+ struct rb_node *rb_node;
+ int level;
+ bool need_check = true;
+ int ret;
+
+ root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ cur->cowonly = 1;
+
+ if (btrfs_root_level(&root->root_item) == cur->level) {
+ /* Tree root */
+ ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
+ /*
+ * For reloc backref cache, we may ignore reloc root. But for
+ * general purpose backref cache, we can't rely on
+ * btrfs_should_ignore_reloc_root() as it may conflict with
+ * current running relocation and lead to missing root.
+ *
+ * For general purpose backref cache, reloc root detection is
+ * completely relying on direct backref (key->offset is parent
+ * bytenr), thus only do such check for reloc cache.
+ */
+ if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) {
+ btrfs_put_root(root);
+ list_add(&cur->list, &cache->useless_node);
+ } else {
+ cur->root = root;
+ }
+ return 0;
+ }
+
+ level = cur->level + 1;
+
+ /* Search the tree to find parent blocks referring to the block */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ btrfs_put_root(root);
+ return ret;
+ }
+ if (ret > 0 && path->slots[level] > 0)
+ path->slots[level]--;
+
+ eb = path->nodes[level];
+ if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
+ btrfs_err(fs_info,
+"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
+ cur->bytenr, level - 1, root->root_key.objectid,
+ tree_key->objectid, tree_key->type, tree_key->offset);
+ btrfs_put_root(root);
+ ret = -ENOENT;
+ goto out;
+ }
+ lower = cur;
+
+ /* Add all nodes and edges in the path */
+ for (; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path->nodes[level]) {
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
+ lower->bytenr);
+ /* Same as previous should_ignore_reloc_root() call */
+ if (btrfs_should_ignore_reloc_root(root) &&
+ cache->is_reloc) {
+ btrfs_put_root(root);
+ list_add(&lower->list, &cache->useless_node);
+ } else {
+ lower->root = root;
+ }
+ break;
+ }
+
+ edge = btrfs_backref_alloc_edge(cache);
+ if (!edge) {
+ btrfs_put_root(root);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ eb = path->nodes[level];
+ rb_node = rb_simple_search(&cache->rb_root, eb->start);
+ if (!rb_node) {
+ upper = btrfs_backref_alloc_node(cache, eb->start,
+ lower->level + 1);
+ if (!upper) {
+ btrfs_put_root(root);
+ btrfs_backref_free_edge(cache, edge);
+ ret = -ENOMEM;
+ goto out;
+ }
+ upper->owner = btrfs_header_owner(eb);
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ upper->cowonly = 1;
+
+ /*
+ * If we know the block isn't shared we can avoid
+ * checking its backrefs.
+ */
+ if (btrfs_block_can_be_shared(root, eb))
+ upper->checked = 0;
+ else
+ upper->checked = 1;
+
+ /*
+ * Add the block to pending list if we need to check its
+ * backrefs, we only do this once while walking up a
+ * tree as we will catch anything else later on.
+ */
+ if (!upper->checked && need_check) {
+ need_check = false;
+ list_add_tail(&edge->list[UPPER],
+ &cache->pending_edge);
+ } else {
+ if (upper->checked)
+ need_check = true;
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ } else {
+ upper = rb_entry(rb_node, struct btrfs_backref_node,
+ rb_node);
+ ASSERT(upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ if (!upper->owner)
+ upper->owner = btrfs_header_owner(eb);
+ }
+ btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);
+
+ if (rb_node) {
+ btrfs_put_root(root);
+ break;
+ }
+ lower = upper;
+ upper = NULL;
+ }
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
+ * Add backref node @cur into @cache.
+ *
+ * NOTE: Even if the function returned 0, @cur is not yet cached as its upper
+ * links aren't yet bi-directional. Needs to finish such links.
+ * Use btrfs_backref_finish_upper_links() to finish such linkage.
+ *
+ * @path: Released path for indirect tree backref lookup
+ * @iter: Released backref iter for extent tree search
+ * @node_key: The first key of the tree block
+ */
+int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+ struct btrfs_path *path,
+ struct btrfs_backref_iter *iter,
+ struct btrfs_key *node_key,
+ struct btrfs_backref_node *cur)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *exist;
+ int ret;
+
+ ret = btrfs_backref_iter_start(iter, cur->bytenr);
+ if (ret < 0)
+ return ret;
+ /*
+ * We skip the first btrfs_tree_block_info, as we don't use the key
+ * stored in it, but fetch it from the tree block
+ */
+ if (btrfs_backref_has_tree_block_info(iter)) {
+ ret = btrfs_backref_iter_next(iter);
+ if (ret < 0)
+ goto out;
+ /* No extra backref? This means the tree block is corrupted */
+ if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ WARN_ON(cur->checked);
+ if (!list_empty(&cur->upper)) {
+ /*
+ * The backref was added previously when processing backref of
+ * type BTRFS_TREE_BLOCK_REF_KEY
+ */
+ ASSERT(list_is_singular(&cur->upper));
+ edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
+ list[LOWER]);
+ ASSERT(list_empty(&edge->list[UPPER]));
+ exist = edge->node[UPPER];
+ /*
+ * Add the upper level block to pending list if we need check
+ * its backrefs
+ */
+ if (!exist->checked)
+ list_add_tail(&edge->list[UPPER], &cache->pending_edge);
+ } else {
+ exist = NULL;
+ }
+
+ for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ int type;
+
+ cond_resched();
+ eb = btrfs_backref_get_eb(iter);
+
+ key.objectid = iter->bytenr;
+ if (btrfs_backref_iter_is_inline_ref(iter)) {
+ struct btrfs_extent_inline_ref *iref;
+
+ /* Update key for inline backref */
+ iref = (struct btrfs_extent_inline_ref *)
+ ((unsigned long)iter->cur_ptr);
+ type = btrfs_get_extent_inline_ref_type(eb, iref,
+ BTRFS_REF_TYPE_BLOCK);
+ if (type == BTRFS_REF_TYPE_INVALID) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ key.type = type;
+ key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+ } else {
+ key.type = iter->cur_key.type;
+ key.offset = iter->cur_key.offset;
+ }
+
+ /*
+ * Parent node found and matches current inline ref, no need to
+ * rebuild this node for this inline ref
+ */
+ if (exist &&
+ ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
+ exist->owner == key.offset) ||
+ (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
+ exist->bytenr == key.offset))) {
+ exist = NULL;
+ continue;
+ }
+
+ /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ ret = handle_direct_tree_backref(cache, &key, cur);
+ if (ret < 0)
+ goto out;
+ continue;
+ } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
+ ret = -EINVAL;
+ btrfs_print_v0_err(fs_info);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ goto out;
+ } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
+ continue;
+ }
+
+ /*
+ * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
+ * means the root objectid. We need to search the tree to get
+ * its parent bytenr.
+ */
+ ret = handle_indirect_tree_backref(cache, path, &key, node_key,
+ cur);
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+ cur->checked = 1;
+ WARN_ON(exist);
+out:
+ btrfs_backref_iter_release(iter);
+ return ret;
+}
+
+/*
+ * Finish the upwards linkage created by btrfs_backref_add_tree_node()
+ */
+int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *start)
+{
+ struct list_head *useless_node = &cache->useless_node;
+ struct btrfs_backref_edge *edge;
+ struct rb_node *rb_node;
+ LIST_HEAD(pending_edge);
+
+ ASSERT(start->checked);
+
+ /* Insert this node to cache if it's not COW-only */
+ if (!start->cowonly) {
+ rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
+ &start->rb_node);
+ if (rb_node)
+ btrfs_backref_panic(cache->fs_info, start->bytenr,
+ -EEXIST);
+ list_add_tail(&start->lower, &cache->leaves);
+ }
+
+ /*
+ * Use breadth first search to iterate all related edges.
+ *
+ * The starting points are all the edges of this node
+ */
+ list_for_each_entry(edge, &start->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &pending_edge);
+
+ while (!list_empty(&pending_edge)) {
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_node *lower;
+
+ edge = list_first_entry(&pending_edge,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del_init(&edge->list[UPPER]);
+ upper = edge->node[UPPER];
+ lower = edge->node[LOWER];
+
+ /* Parent is detached, no need to keep any edges */
+ if (upper->detached) {
+ list_del(&edge->list[LOWER]);
+ btrfs_backref_free_edge(cache, edge);
+
+ /* Lower node is orphan, queue for cleanup */
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, useless_node);
+ continue;
+ }
+
+ /*
+ * All new nodes added in current build_backref_tree() haven't
+ * been linked to the cache rb tree.
+ * So if we have upper->rb_node populated, this means a cache
+ * hit. We only need to link the edge, as @upper and all its
+ * parents have already been linked.
+ */
+ if (!RB_EMPTY_NODE(&upper->rb_node)) {
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+ continue;
+ }
+
+ /* Sanity check, we shouldn't have any unchecked nodes */
+ if (!upper->checked) {
+ ASSERT(0);
+ return -EUCLEAN;
+ }
+
+ /* Sanity check, COW-only node has non-COW-only parent */
+ if (start->cowonly != upper->cowonly) {
+ ASSERT(0);
+ return -EUCLEAN;
+ }
+
+ /* Only cache non-COW-only (subvolume trees) tree blocks */
+ if (!upper->cowonly) {
+ rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ if (rb_node) {
+ btrfs_backref_panic(cache->fs_info,
+ upper->bytenr, -EEXIST);
+ return -EUCLEAN;
+ }
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+
+ /*
+ * Also queue all the parent edges of this uncached node
+ * to finish the upper linkage
+ */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &pending_edge);
+ }
+ return 0;
+}
+
+void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ struct btrfs_backref_node *lower;
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_edge *edge;
+
+ while (!list_empty(&cache->useless_node)) {
+ lower = list_first_entry(&cache->useless_node,
+ struct btrfs_backref_node, list);
+ list_del_init(&lower->list);
+ }
+ while (!list_empty(&cache->pending_edge)) {
+ edge = list_first_entry(&cache->pending_edge,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ upper = edge->node[UPPER];
+ btrfs_backref_free_edge(cache, edge);
+
+ /*
+ * Lower is no longer linked to any upper backref nodes and
+ * isn't in the cache, we can free it ourselves.
+ */
+ if (list_empty(&lower->upper) &&
+ RB_EMPTY_NODE(&lower->rb_node))
+ list_add(&lower->list, &cache->useless_node);
+
+ if (!RB_EMPTY_NODE(&upper->rb_node))
+ continue;
+
+ /* Add this guy's upper edges to the list to process */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER],
+ &cache->pending_edge);
+ if (list_empty(&upper->upper))
+ list_add(&upper->list, &cache->useless_node);
+ }
+
+ while (!list_empty(&cache->useless_node)) {
+ lower = list_first_entry(&cache->useless_node,
+ struct btrfs_backref_node, list);
+ list_del_init(&lower->list);
+ if (lower == node)
+ node = NULL;
+ btrfs_backref_drop_node(cache, lower);
+ }
+
+ btrfs_backref_cleanup_node(cache, node);
+ ASSERT(list_empty(&cache->useless_node) &&
+ list_empty(&cache->pending_edge));
+}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 777f61d..17abde7 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -8,6 +8,7 @@
#include <linux/btrfs.h>
#include "ulist.h"
+#include "disk-io.h"
#include "extent_io.h"
struct inode_fs_paths {
@@ -40,6 +41,10 @@
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots, bool ignore_offset);
@@ -74,4 +79,303 @@
u64 wanted_disk_byte;
};
+/*
+ * Iterate backrefs of one extent.
+ *
+ * Now it only supports iteration of tree block in commit root.
+ */
+struct btrfs_backref_iter {
+ u64 bytenr;
+ struct btrfs_path *path;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_key cur_key;
+ u32 item_ptr;
+ u32 cur_ptr;
+ u32 end_ptr;
+};
+
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(
+ struct btrfs_fs_info *fs_info, gfp_t gfp_flag);
+
+static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
+{
+ if (!iter)
+ return;
+ btrfs_free_path(iter->path);
+ kfree(iter);
+}
+
+static inline struct extent_buffer *btrfs_backref_get_eb(
+ struct btrfs_backref_iter *iter)
+{
+ if (!iter)
+ return NULL;
+ return iter->path->nodes[0];
+}
+
+/*
+ * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
+ * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
+ *
+ * This helper determines if that's the case.
+ */
+static inline bool btrfs_backref_has_tree_block_info(
+ struct btrfs_backref_iter *iter)
+{
+ if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ iter->cur_ptr - iter->item_ptr == sizeof(struct btrfs_extent_item))
+ return true;
+ return false;
+}
+
+int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
+
+int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
+
+static inline bool btrfs_backref_iter_is_inline_ref(
+ struct btrfs_backref_iter *iter)
+{
+ if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
+ return true;
+ return false;
+}
+
+static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
+{
+ iter->bytenr = 0;
+ iter->item_ptr = 0;
+ iter->cur_ptr = 0;
+ iter->end_ptr = 0;
+ btrfs_release_path(iter->path);
+ memset(&iter->cur_key, 0, sizeof(iter->cur_key));
+}
+
+/*
+ * Backref cache related structures
+ *
+ * The whole objective of backref_cache is to build a bi-directional map
+ * of tree blocks (represented by backref_node) and all their parents.
+ */
+
+/*
+ * Represent a tree block in the backref cache
+ */
+struct btrfs_backref_node {
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ }; /* Use rb_simple_node for search/insert */
+
+ u64 new_bytenr;
+ /* Objectid of tree block owner, can be not uptodate */
+ u64 owner;
+ /* Link to pending, changed or detached list */
+ struct list_head list;
+
+ /* List of upper level edges, which link this node to its parents */
+ struct list_head upper;
+ /* List of lower level edges, which link this node to its children */
+ struct list_head lower;
+
+ /* NULL if this node is not tree root */
+ struct btrfs_root *root;
+ /* Extent buffer got by COWing the block */
+ struct extent_buffer *eb;
+ /* Level of the tree block */
+ unsigned int level:8;
+ /* Is the block in a non-shareable tree */
+ unsigned int cowonly:1;
+ /* 1 if no child node is in the cache */
+ unsigned int lowest:1;
+ /* Is the extent buffer locked */
+ unsigned int locked:1;
+ /* Has the block been processed */
+ unsigned int processed:1;
+ /* Have backrefs of this block been checked */
+ unsigned int checked:1;
+ /*
+ * 1 if corresponding block has been COWed but some upper level block
+ * pointers may not point to the new location
+ */
+ unsigned int pending:1;
+ /* 1 if the backref node isn't connected to any other backref node */
+ unsigned int detached:1;
+
+ /*
+ * For generic purpose backref cache, where we only care if it's a reloc
+ * root, doesn't care the source subvolid.
+ */
+ unsigned int is_reloc_root:1;
+};
+
+#define LOWER 0
+#define UPPER 1
+
+/*
+ * Represent an edge connecting upper and lower backref nodes.
+ */
+struct btrfs_backref_edge {
+ /*
+ * list[LOWER] is linked to btrfs_backref_node::upper of lower level
+ * node, and list[UPPER] is linked to btrfs_backref_node::lower of
+ * upper level node.
+ *
+ * Also, build_backref_tree() uses list[UPPER] for pending edges, before
+ * linking list[UPPER] to its upper level nodes.
+ */
+ struct list_head list[2];
+
+ /* Two related nodes */
+ struct btrfs_backref_node *node[2];
+};
+
+struct btrfs_backref_cache {
+ /* Red black tree of all backref nodes in the cache */
+ struct rb_root rb_root;
+ /* For passing backref nodes to btrfs_reloc_cow_block */
+ struct btrfs_backref_node *path[BTRFS_MAX_LEVEL];
+ /*
+ * List of blocks that have been COWed but some block pointers in upper
+ * level blocks may not reflect the new location
+ */
+ struct list_head pending[BTRFS_MAX_LEVEL];
+ /* List of backref nodes with no child node */
+ struct list_head leaves;
+ /* List of blocks that have been COWed in current transaction */
+ struct list_head changed;
+ /* List of detached backref node. */
+ struct list_head detached;
+
+ u64 last_trans;
+
+ int nr_nodes;
+ int nr_edges;
+
+ /* List of unchecked backref edges during backref cache build */
+ struct list_head pending_edge;
+
+ /* List of useless backref nodes during backref cache build */
+ struct list_head useless_node;
+
+ struct btrfs_fs_info *fs_info;
+
+ /*
+ * Whether this cache is for relocation
+ *
+ * Reloction backref cache require more info for reloc root compared
+ * to generic backref cache.
+ */
+ unsigned int is_reloc;
+};
+
+void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_backref_cache *cache, int is_reloc);
+struct btrfs_backref_node *btrfs_backref_alloc_node(
+ struct btrfs_backref_cache *cache, u64 bytenr, int level);
+struct btrfs_backref_edge *btrfs_backref_alloc_edge(
+ struct btrfs_backref_cache *cache);
+
+#define LINK_LOWER (1 << 0)
+#define LINK_UPPER (1 << 1)
+static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper,
+ int link_which)
+{
+ ASSERT(upper && lower && upper->level == lower->level + 1);
+ edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
+ if (link_which & LINK_LOWER)
+ list_add_tail(&edge->list[LOWER], &lower->upper);
+ if (link_which & LINK_UPPER)
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+}
+
+static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ if (node) {
+ ASSERT(list_empty(&node->list));
+ ASSERT(list_empty(&node->lower));
+ ASSERT(node->eb == NULL);
+ cache->nr_nodes--;
+ btrfs_put_root(node->root);
+ kfree(node);
+ }
+}
+
+static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_edge *edge)
+{
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
+}
+
+static inline void btrfs_backref_unlock_node_buffer(
+ struct btrfs_backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
+static inline void btrfs_backref_drop_node_buffer(
+ struct btrfs_backref_node *node)
+{
+ if (node->eb) {
+ btrfs_backref_unlock_node_buffer(node);
+ free_extent_buffer(node->eb);
+ node->eb = NULL;
+ }
+}
+
+/*
+ * Drop the backref node from cache without cleaning up its children
+ * edges.
+ *
+ * This can only be called on node without parent edges.
+ * The children edges are still kept as is.
+ */
+static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+ struct btrfs_backref_node *node)
+{
+ ASSERT(list_empty(&node->upper));
+
+ btrfs_backref_drop_node_buffer(node);
+ list_del_init(&node->list);
+ list_del_init(&node->lower);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ btrfs_backref_free_node(tree, node);
+}
+
+void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node);
+
+void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
+
+static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
+ u64 bytenr, int errno)
+{
+ btrfs_panic(fs_info, errno,
+ "Inconsistency in backref cache found at offset %llu",
+ bytenr);
+}
+
+int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+ struct btrfs_path *path,
+ struct btrfs_backref_iter *iter,
+ struct btrfs_key *node_key,
+ struct btrfs_backref_node *cur);
+
+int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *start);
+
+void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node);
+
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index e98d6ea..c99e293 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -7,13 +7,14 @@
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "disk-io.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
+#include "discard.h"
+#include "raid56.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -64,11 +65,8 @@
spin_lock(&fs_info->balance_lock);
target = get_restripe_target(fs_info, flags);
if (target) {
- /* Pick target profile only if it's already available */
- if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
- spin_unlock(&fs_info->balance_lock);
- return extended_to_chunk(target);
- }
+ spin_unlock(&fs_info->balance_lock);
+ return extended_to_chunk(target);
}
spin_unlock(&fs_info->balance_lock);
@@ -95,7 +93,7 @@
return extended_to_chunk(flags | allowed);
}
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
unsigned seq;
u64 flags;
@@ -115,23 +113,27 @@
return btrfs_reduce_alloc_profile(fs_info, flags);
}
-u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+void btrfs_get_block_group(struct btrfs_block_group *cache)
{
- return get_alloc_profile(fs_info, orig_flags);
+ refcount_inc(&cache->refs);
}
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group *cache)
{
- atomic_inc(&cache->count);
-}
-
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
-{
- if (atomic_dec_and_test(&cache->count)) {
+ if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
/*
+ * A block_group shouldn't be on the discard_list anymore.
+ * Remove the block_group from the discard_list to prevent us
+ * from causing a panic due to NULL pointer dereference.
+ */
+ if (WARN_ON(!list_empty(&cache->discard_list)))
+ btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
+ cache);
+
+ /*
* If not empty, someone is still holding mutex of
* full_stripe_lock, which can only be released by caller.
* And it will definitely cause use-after-free when caller
@@ -149,22 +151,23 @@
* This adds the block group to the fs_info rb tree for the block group cache
*/
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct rb_node **p;
struct rb_node *parent = NULL;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
+
+ ASSERT(block_group->length != 0);
spin_lock(&info->block_group_cache_lock);
p = &info->block_group_cache_tree.rb_node;
while (*p) {
parent = *p;
- cache = rb_entry(parent, struct btrfs_block_group_cache,
- cache_node);
- if (block_group->key.objectid < cache->key.objectid) {
+ cache = rb_entry(parent, struct btrfs_block_group, cache_node);
+ if (block_group->start < cache->start) {
p = &(*p)->rb_left;
- } else if (block_group->key.objectid > cache->key.objectid) {
+ } else if (block_group->start > cache->start) {
p = &(*p)->rb_right;
} else {
spin_unlock(&info->block_group_cache_lock);
@@ -176,8 +179,8 @@
rb_insert_color(&block_group->cache_node,
&info->block_group_cache_tree);
- if (info->first_logical_byte > block_group->key.objectid)
- info->first_logical_byte = block_group->key.objectid;
+ if (info->first_logical_byte > block_group->start)
+ info->first_logical_byte = block_group->start;
spin_unlock(&info->block_group_cache_lock);
@@ -188,10 +191,10 @@
* This will return the block group at or after bytenr if contains is 0, else
* it will return the block group that contains the bytenr
*/
-static struct btrfs_block_group_cache *block_group_cache_tree_search(
+static struct btrfs_block_group *block_group_cache_tree_search(
struct btrfs_fs_info *info, u64 bytenr, int contains)
{
- struct btrfs_block_group_cache *cache, *ret = NULL;
+ struct btrfs_block_group *cache, *ret = NULL;
struct rb_node *n;
u64 end, start;
@@ -199,13 +202,12 @@
n = info->block_group_cache_tree.rb_node;
while (n) {
- cache = rb_entry(n, struct btrfs_block_group_cache,
- cache_node);
- end = cache->key.objectid + cache->key.offset - 1;
- start = cache->key.objectid;
+ cache = rb_entry(n, struct btrfs_block_group, cache_node);
+ end = cache->start + cache->length - 1;
+ start = cache->start;
if (bytenr < start) {
- if (!contains && (!ret || start < ret->key.objectid))
+ if (!contains && (!ret || start < ret->start))
ret = cache;
n = n->rb_left;
} else if (bytenr > start) {
@@ -221,8 +223,8 @@
}
if (ret) {
btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
- info->first_logical_byte = ret->key.objectid;
+ if (bytenr == 0 && info->first_logical_byte > ret->start)
+ info->first_logical_byte = ret->start;
}
spin_unlock(&info->block_group_cache_lock);
@@ -232,7 +234,7 @@
/*
* Return the block group that starts at or after bytenr
*/
-struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
+struct btrfs_block_group *btrfs_lookup_first_block_group(
struct btrfs_fs_info *info, u64 bytenr)
{
return block_group_cache_tree_search(info, bytenr, 0);
@@ -241,14 +243,14 @@
/*
* Return the block group that contains the given bytenr
*/
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
+struct btrfs_block_group *btrfs_lookup_block_group(
struct btrfs_fs_info *info, u64 bytenr)
{
return block_group_cache_tree_search(info, bytenr, 1);
}
-struct btrfs_block_group_cache *btrfs_next_block_group(
- struct btrfs_block_group_cache *cache)
+struct btrfs_block_group *btrfs_next_block_group(
+ struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
@@ -257,7 +259,7 @@
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
- const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+ const u64 next_bytenr = cache->start + cache->length;
spin_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
@@ -266,8 +268,7 @@
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
if (node) {
- cache = rb_entry(node, struct btrfs_block_group_cache,
- cache_node);
+ cache = rb_entry(node, struct btrfs_block_group, cache_node);
btrfs_get_block_group(cache);
} else
cache = NULL;
@@ -277,7 +278,7 @@
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
bool ret = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
@@ -300,7 +301,7 @@
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, bytenr);
ASSERT(bg);
@@ -314,7 +315,7 @@
btrfs_put_block_group(bg);
}
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}
@@ -322,7 +323,7 @@
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start)
{
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, start);
ASSERT(bg);
@@ -331,7 +332,7 @@
btrfs_put_block_group(bg);
}
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
struct btrfs_space_info *space_info = bg->space_info;
@@ -357,7 +358,7 @@
}
struct btrfs_caching_control *btrfs_get_caching_control(
- struct btrfs_block_group_cache *cache)
+ struct btrfs_block_group *cache)
{
struct btrfs_caching_control *ctl;
@@ -392,7 +393,7 @@
* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
* any of the information in this block group.
*/
-void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
@@ -401,13 +402,13 @@
if (!caching_ctl)
return;
- wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache) ||
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
(cache->free_space_ctl->free_space >= num_bytes));
btrfs_put_caching_control(caching_ctl);
}
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
int ret = 0;
@@ -416,7 +417,7 @@
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
- wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache));
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
if (cache->cached == BTRFS_CACHE_ERROR)
ret = -EIO;
btrfs_put_caching_control(caching_ctl);
@@ -424,11 +425,11 @@
}
#ifdef CONFIG_BTRFS_DEBUG
-static void fragment_free_space(struct btrfs_block_group_cache *block_group)
+static void fragment_free_space(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- u64 start = block_group->key.objectid;
- u64 len = block_group->key.offset;
+ u64 start = block_group->start;
+ u64 len = block_group->length;
u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
fs_info->nodesize : fs_info->sectorsize;
u64 step = chunk << 1;
@@ -450,15 +451,14 @@
* used yet since their free space will be released as soon as the transaction
* commits.
*/
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
{
struct btrfs_fs_info *info = block_group->fs_info;
u64 extent_start, extent_end, size, total_added = 0;
int ret;
while (start < end) {
- ret = find_first_extent_bit(info->pinned_extents, start,
+ ret = find_first_extent_bit(&info->excluded_extents, start,
&extent_start, &extent_end,
EXTENT_DIRTY | EXTENT_UPTODATE,
NULL);
@@ -470,8 +470,8 @@
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
total_added += size;
- ret = btrfs_add_free_space(block_group, start,
- size);
+ ret = btrfs_add_free_space_async_trimmed(block_group,
+ start, size);
BUG_ON(ret); /* -ENOMEM or logic error */
start = extent_end + 1;
} else {
@@ -482,7 +482,8 @@
if (start < end) {
size = end - start;
total_added += size;
- ret = btrfs_add_free_space(block_group, start, size);
+ ret = btrfs_add_free_space_async_trimmed(block_group, start,
+ size);
BUG_ON(ret); /* -ENOMEM or logic error */
}
@@ -491,7 +492,7 @@
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
- struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
+ struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_path *path;
@@ -507,7 +508,7 @@
if (!path)
return -ENOMEM;
- last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+ last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -587,13 +588,12 @@
goto next;
}
- if (key.objectid < block_group->key.objectid) {
+ if (key.objectid < block_group->start) {
path->slots[0]++;
continue;
}
- if (key.objectid >= block_group->key.objectid +
- block_group->key.offset)
+ if (key.objectid >= block_group->start + block_group->length)
break;
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -617,8 +617,7 @@
ret = 0;
total_found += add_new_free_space(block_group, last,
- block_group->key.objectid +
- block_group->key.offset);
+ block_group->start + block_group->length);
caching_ctl->progress = (u64)-1;
out:
@@ -628,7 +627,7 @@
static noinline void caching_thread(struct btrfs_work *work)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_caching_control *caching_ctl;
int ret;
@@ -664,8 +663,7 @@
spin_lock(&block_group->space_info->lock);
spin_lock(&block_group->lock);
- bytes_used = block_group->key.offset -
- btrfs_block_group_used(&block_group->item);
+ bytes_used = block_group->length - block_group->used;
block_group->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&block_group->lock);
spin_unlock(&block_group->space_info->lock);
@@ -685,8 +683,7 @@
btrfs_put_block_group(block_group);
}
-int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
- int load_cache_only)
+int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
{
DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -701,7 +698,7 @@
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
- caching_ctl->progress = cache->key.objectid;
+ caching_ctl->progress = cache->start;
refcount_set(&caching_ctl->count, 1);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
@@ -770,8 +767,7 @@
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
- bytes_used = cache->key.offset -
- btrfs_block_group_used(&cache->item);
+ bytes_used = cache->length - cache->used;
cache->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -840,37 +836,69 @@
*
* - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
* in the whole filesystem
+ *
+ * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
*/
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ bool found_raid56 = false;
+ bool found_raid1c34 = false;
+
+ if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
+ (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
+ (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *sinfo;
list_for_each_entry_rcu(sinfo, head, list) {
- bool found = false;
-
down_read(&sinfo->groups_sem);
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
- found = true;
+ found_raid56 = true;
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
- found = true;
+ found_raid56 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
+ found_raid1c34 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
+ found_raid1c34 = true;
up_read(&sinfo->groups_sem);
-
- if (found)
- return;
}
- btrfs_clear_fs_incompat(fs_info, RAID56);
+ if (!found_raid56)
+ btrfs_clear_fs_incompat(fs_info, RAID56);
+ if (!found_raid1c34)
+ btrfs_clear_fs_incompat(fs_info, RAID1C34);
}
}
+static int remove_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int ret;
+
+ root = fs_info->extent_root;
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_del_item(trans, root, path);
+ return ret;
+}
+
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
struct btrfs_path *path;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_free_cluster *cluster;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_key key;
@@ -893,10 +921,9 @@
* remove it.
*/
btrfs_free_excluded_extents(block_group);
- btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
- block_group->key.offset);
+ btrfs_free_ref_tree_range(fs_info, block_group->start,
+ block_group->length);
- memcpy(&key, &block_group->key, sizeof(key));
index = btrfs_bg_flags_to_raid_index(block_group->flags);
factor = btrfs_bg_type_to_factor(block_group->flags);
@@ -974,8 +1001,8 @@
}
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
key.type = 0;
+ key.offset = block_group->start;
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0)
@@ -997,7 +1024,7 @@
/* Once for the block groups rbtree */
btrfs_put_block_group(block_group);
- if (fs_info->first_logical_byte == block_group->key.objectid)
+ if (fs_info->first_logical_byte == block_group->start)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
@@ -1058,36 +1085,55 @@
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
WARN_ON(block_group->space_info->total_bytes
- < block_group->key.offset);
+ < block_group->length);
WARN_ON(block_group->space_info->bytes_readonly
- < block_group->key.offset);
+ < block_group->length);
WARN_ON(block_group->space_info->disk_total
- < block_group->key.offset * factor);
+ < block_group->length * factor);
}
- block_group->space_info->total_bytes -= block_group->key.offset;
- block_group->space_info->bytes_readonly -= block_group->key.offset;
- block_group->space_info->disk_total -= block_group->key.offset * factor;
+ block_group->space_info->total_bytes -= block_group->length;
+ block_group->space_info->bytes_readonly -= block_group->length;
+ block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
- memcpy(&key, &block_group->key, sizeof(key));
+ /*
+ * Remove the free space for the block group from the free space tree
+ * and the block group's item from the extent tree before marking the
+ * block group as removed. This is to prevent races with tasks that
+ * freeze and unfreeze a block group, this task and another task
+ * allocating a new block group - the unfreeze task ends up removing
+ * the block group's extent map before the task calling this function
+ * deletes the block group item from the extent tree, allowing for
+ * another task to attempt to create another block group with the same
+ * item key (and failing with -EEXIST and a transaction abort).
+ */
+ ret = remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
- mutex_lock(&fs_info->chunk_mutex);
+ ret = remove_block_group_item(trans, path, block_group);
+ if (ret < 0)
+ goto out;
+
spin_lock(&block_group->lock);
block_group->removed = 1;
/*
- * At this point trimming can't start on this block group, because we
- * removed the block group from the tree fs_info->block_group_cache_tree
- * so no one can't find it anymore and even if someone already got this
- * block group before we removed it from the rbtree, they have already
- * incremented block_group->trimming - if they didn't, they won't find
- * any free space entries because we already removed them all when we
- * called btrfs_remove_free_space_cache().
+ * At this point trimming or scrub can't start on this block group,
+ * because we removed the block group from the rbtree
+ * fs_info->block_group_cache_tree so no one can't find it anymore and
+ * even if someone already got this block group before we removed it
+ * from the rbtree, they have already incremented block_group->frozen -
+ * if they didn't, for the trimming case they won't find any free space
+ * entries because we already removed them all when we called
+ * btrfs_remove_free_space_cache().
*
* And we must not remove the extent map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
- * ranges from being reused for a new block group. This is because our
- * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * ranges from being reused for a new block group. This is needed to
+ * avoid races with trimming and scrub.
+ *
+ * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
* completely transactionless, so while it is trimming a range the
* currently running transaction might finish and a new one start,
* allowing for new block groups to be created that can reuse the same
@@ -1098,25 +1144,9 @@
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
- remove_em = (atomic_read(&block_group->trimming) == 0);
+ remove_em = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- mutex_unlock(&fs_info->chunk_mutex);
-
- ret = remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out;
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret > 0)
- ret = -EIO;
- if (ret < 0)
- goto out;
-
- ret = btrfs_del_item(trans, root, path);
- if (ret)
- goto out;
-
if (remove_em) {
struct extent_map_tree *em_tree;
@@ -1190,36 +1220,28 @@
* data in this block group. That check should be done by relocation routine,
* not this function.
*/
-static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
- u64 min_allocable_bytes;
int ret = -ENOSPC;
- /*
- * We need some metadata space and system metadata space for
- * allocating chunks in some corner cases until we force to set
- * it to be readonly.
- */
- if ((sinfo->flags &
- (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
- !force)
- min_allocable_bytes = SZ_1M;
- else
- min_allocable_bytes = 0;
-
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
+ if (cache->swap_extents) {
+ ret = -ETXTBSY;
+ goto out;
+ }
+
if (cache->ro) {
cache->ro++;
ret = 0;
goto out;
}
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
- cache->bytes_super - btrfs_block_group_used(&cache->item);
+ num_bytes = cache->length - cache->reserved - cache->pinned -
+ cache->bytes_super - cache->used;
/*
* Data never overcommits, even in mixed mode, so do just the straight
@@ -1258,22 +1280,67 @@
spin_unlock(&sinfo->lock);
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
- "unable to make block group %llu ro",
- cache->key.objectid);
+ "unable to make block group %llu ro", cache->start);
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
}
return ret;
}
+static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_transaction *prev_trans = NULL;
+ const u64 start = bg->start;
+ const u64 end = start + bg->length - 1;
+ int ret;
+
+ spin_lock(&fs_info->trans_lock);
+ if (trans->transaction->list.prev != &fs_info->trans_list) {
+ prev_trans = list_last_entry(&trans->transaction->list,
+ struct btrfs_transaction, list);
+ refcount_inc(&prev_trans->use_count);
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N, another
+ * task might be running finish_extent_commit() for the previous
+ * transaction N - 1, and have seen a range belonging to the block
+ * group in pinned_extents before we were able to clear the whole block
+ * group range from pinned_extents. This means that task can lookup for
+ * the block group after we unpinned it from pinned_extents and removed
+ * it, leading to a BUG_ON() at unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ if (prev_trans) {
+ ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
+ EXTENT_DIRTY);
+ if (ret)
+ goto out;
+ }
+
+ ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
+ EXTENT_DIRTY);
+out:
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ if (prev_trans)
+ btrfs_put_transaction(prev_trans);
+
+ return ret == 0;
+}
+
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
+ const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
int ret = 0;
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
@@ -1281,11 +1348,10 @@
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
- u64 start, end;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
bg_list);
list_del_init(&block_group->bg_list);
@@ -1297,14 +1363,31 @@
}
spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+
mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
+
+ /*
+ * Async discard moves the final block group discard to be prior
+ * to the unused_bgs code path. Therefore, if it's not fully
+ * trimmed, punt it back to the async discard lists.
+ */
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
+ !btrfs_is_free_space_trimmed(block_group)) {
+ trace_btrfs_skip_unused_block_group(block_group);
+ up_write(&space_info->groups_sem);
+ /* Requeue if we failed because of async discard */
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ block_group);
+ goto next;
+ }
+
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->pinned ||
- btrfs_block_group_used(&block_group->item) ||
- block_group->ro ||
+ block_group->used || block_group->ro ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
@@ -1332,7 +1415,7 @@
* properly if we fail to join the transaction.
*/
trans = btrfs_start_trans_remove_block_group(fs_info,
- block_group->key.objectid);
+ block_group->start);
if (IS_ERR(trans)) {
btrfs_dec_block_group_ro(block_group);
ret = PTR_ERR(trans);
@@ -1343,35 +1426,27 @@
* We could have pending pinned extents for this block group,
* just delete them, we don't care about them anymore.
*/
- start = block_group->key.objectid;
- end = start + block_group->key.offset - 1;
+ if (!clean_pinned_extents(trans, block_group)) {
+ btrfs_dec_block_group_ro(block_group);
+ goto end_trans;
+ }
+
/*
- * Hold the unused_bg_unpin_mutex lock to avoid racing with
- * btrfs_finish_extent_commit(). If we are at transaction N,
- * another task might be running finish_extent_commit() for the
- * previous transaction N - 1, and have seen a range belonging
- * to the block group in freed_extents[] before we were able to
- * clear the whole block group range from freed_extents[]. This
- * means that task can lookup for the block group after we
- * unpinned it from freed_extents[] and removed it, leading to
- * a BUG_ON() at btrfs_unpin_extent_range().
+ * At this point, the block_group is read only and should fail
+ * new allocations. However, btrfs_finish_extent_commit() can
+ * cause this block_group to be placed back on the discard
+ * lists because now the block_group isn't fully discarded.
+ * Bail here and try again later after discarding everything.
*/
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
- ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ spin_lock(&fs_info->discard_ctl.lock);
+ if (!list_empty(&block_group->discard_list)) {
+ spin_unlock(&fs_info->discard_ctl.lock);
btrfs_dec_block_group_ro(block_group);
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ block_group);
goto end_trans;
}
- ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(block_group);
- goto end_trans;
- }
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ spin_unlock(&fs_info->discard_ctl.lock);
/* Reset pinned so btrfs_put_block_group doesn't complain */
spin_lock(&space_info->lock);
@@ -1380,30 +1455,38 @@
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -block_group->pinned,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned);
block_group->pinned = 0;
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
+ /*
+ * The normal path here is an unused block group is passed here,
+ * then trimming is handled in the transaction commit path.
+ * Async discard interposes before this to do the trimming
+ * before coming down the unused block group path as trimming
+ * will no longer be done later in the transaction commit path.
+ */
+ if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ goto flip_async;
+
/* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD);
+ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
/* Implicit trim during transaction commit. */
if (trimming)
- btrfs_get_block_group_trimming(block_group);
+ btrfs_freeze_block_group(block_group);
/*
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
*/
- ret = btrfs_remove_chunk(trans, block_group->key.objectid);
+ ret = btrfs_remove_chunk(trans, block_group->start);
if (ret) {
if (trimming)
- btrfs_put_block_group_trimming(block_group);
+ btrfs_unfreeze_block_group(block_group);
goto end_trans;
}
@@ -1432,9 +1515,16 @@
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
+ return;
+
+flip_async:
+ btrfs_end_transaction(trans);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ btrfs_put_block_group(block_group);
+ btrfs_discard_punt_unused_bgs_list(fs_info);
}
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
+void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1447,21 +1537,70 @@
spin_unlock(&fs_info->unused_bgs_lock);
}
+static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_block_group_item bg;
+ struct extent_buffer *leaf;
+ int slot;
+ u64 flags;
+ int ret = 0;
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+
+ em_tree = &fs_info->mapping_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ key->objectid, key->offset);
+ return -ENOENT;
+ }
+
+ if (em->start != key->objectid || em->len != key->offset) {
+ btrfs_err(fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ key->objectid, key->offset, em->start, em->len);
+ ret = -EUCLEAN;
+ goto out_free_em;
+ }
+
+ read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_stack_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ key->objectid, key->offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ ret = -EUCLEAN;
+ }
+
+out_free_em:
+ free_extent_map(em);
+ return ret;
+}
+
static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key)
{
struct btrfs_root *root = fs_info->extent_root;
- int ret = 0;
+ int ret;
struct btrfs_key found_key;
struct extent_buffer *leaf;
- struct btrfs_block_group_item bg;
- u64 flags;
int slot;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
while (1) {
slot = path->slots[0];
@@ -1478,49 +1617,10 @@
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- struct extent_map_tree *em_tree;
- struct extent_map *em;
-
- em_tree = &root->fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, found_key.objectid,
- found_key.offset);
- read_unlock(&em_tree->lock);
- if (!em) {
- btrfs_err(fs_info,
- "logical %llu len %llu found bg but no related chunk",
- found_key.objectid, found_key.offset);
- ret = -ENOENT;
- } else if (em->start != found_key.objectid ||
- em->len != found_key.offset) {
- btrfs_err(fs_info,
- "block group %llu len %llu mismatch with chunk %llu len %llu",
- found_key.objectid, found_key.offset,
- em->start, em->len);
- ret = -EUCLEAN;
- } else {
- read_extent_buffer(leaf, &bg,
- btrfs_item_ptr_offset(leaf, slot),
- sizeof(bg));
- flags = btrfs_block_group_flags(&bg) &
- BTRFS_BLOCK_GROUP_TYPE_MASK;
-
- if (flags != (em->map_lookup->type &
- BTRFS_BLOCK_GROUP_TYPE_MASK)) {
- btrfs_err(fs_info,
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
- found_key.objectid,
- found_key.offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK &
- em->map_lookup->type));
- ret = -EUCLEAN;
- } else {
- ret = 0;
- }
- }
- free_extent_map(em);
- goto out;
+ ret = read_bg_from_eb(fs_info, &found_key, path);
+ break;
}
+
path->slots[0]++;
}
out:
@@ -1542,7 +1642,96 @@
write_sequnlock(&fs_info->profiles_lock);
}
-static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
+/**
+ * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * @chunk_start: logical address of block group
+ * @physical: physical address to map to logical addresses
+ * @logical: return array of logical addresses which map to @physical
+ * @naddrs: length of @logical
+ * @stripe_len: size of IO stripe for the given block group
+ *
+ * Maps a particular @physical disk address to a list of @logical addresses.
+ * Used primarily to exclude those portions of a block group that contain super
+ * block copies.
+ */
+EXPORT_FOR_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 *buf;
+ u64 bytenr;
+ u64 data_stripe_length;
+ u64 io_stripe_size;
+ int i, nr = 0;
+ int ret = 0;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(em))
+ return -EIO;
+
+ map = em->map_lookup;
+ data_stripe_length = em->orig_block_len;
+ io_stripe_size = map->stripe_len;
+
+ /* For RAID5/6 adjust to a full IO stripe length */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ io_stripe_size = map->stripe_len * nr_data_stripes(map);
+
+ buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ bool already_inserted = false;
+ u64 stripe_nr;
+ int j;
+
+ if (!in_range(physical, map->stripes[i].physical,
+ data_stripe_length))
+ continue;
+
+ stripe_nr = physical - map->stripes[i].physical;
+ stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ stripe_nr = div_u64(stripe_nr, map->sub_stripes);
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ }
+ /*
+ * The remaining case would be for RAID56, multiply by
+ * nr_data_stripes(). Alternatively, just use rmap_len below
+ * instead of map->stripe_len
+ */
+
+ bytenr = chunk_start + stripe_nr * io_stripe_size;
+
+ /* Ensure we don't add duplicate addresses */
+ for (j = 0; j < nr; j++) {
+ if (buf[j] == bytenr) {
+ already_inserted = true;
+ break;
+ }
+ }
+
+ if (!already_inserted)
+ buf[nr++] = bytenr;
+ }
+
+ *logical = buf;
+ *naddrs = nr;
+ *stripe_len = io_stripe_size;
+out:
+ free_extent_map(em);
+ return ret;
+}
+
+static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 bytenr;
@@ -1550,10 +1739,10 @@
int stripe_len;
int i, nr, ret;
- if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
- stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+ if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
+ stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
cache->bytes_super += stripe_len;
- ret = btrfs_add_excluded_extent(fs_info, cache->key.objectid,
+ ret = btrfs_add_excluded_extent(fs_info, cache->start,
stripe_len);
if (ret)
return ret;
@@ -1561,33 +1750,18 @@
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(fs_info, cache->key.objectid,
+ ret = btrfs_rmap_block(fs_info, cache->start,
bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
while (nr--) {
- u64 start, len;
-
- if (logical[nr] > cache->key.objectid +
- cache->key.offset)
- continue;
-
- if (logical[nr] + stripe_len <= cache->key.objectid)
- continue;
-
- start = logical[nr];
- if (start < cache->key.objectid) {
- start = cache->key.objectid;
- len = (logical[nr] + stripe_len) - start;
- } else {
- len = min_t(u64, stripe_len,
- cache->key.objectid +
- cache->key.offset - start);
- }
+ u64 len = min_t(u64, stripe_len,
+ cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
- ret = btrfs_add_excluded_extent(fs_info, start, len);
+ ret = btrfs_add_excluded_extent(fs_info, logical[nr],
+ len);
if (ret) {
kfree(logical);
return ret;
@@ -1599,26 +1773,20 @@
return 0;
}
-static void link_block_group(struct btrfs_block_group_cache *cache)
+static void link_block_group(struct btrfs_block_group *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
int index = btrfs_bg_flags_to_raid_index(cache->flags);
- bool first = false;
down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index]))
- first = true;
list_add_tail(&cache->list, &space_info->block_groups[index]);
up_write(&space_info->groups_sem);
-
- if (first)
- btrfs_sysfs_add_block_group_type(cache);
}
-static struct btrfs_block_group_cache *btrfs_create_block_group_cache(
- struct btrfs_fs_info *fs_info, u64 start, u64 size)
+static struct btrfs_block_group *btrfs_create_block_group_cache(
+ struct btrfs_fs_info *fs_info, u64 start)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache)
@@ -1631,25 +1799,25 @@
return NULL;
}
- cache->key.objectid = start;
- cache->key.offset = size;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->start = start;
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
- set_free_space_tree_thresholds(cache);
- atomic_set(&cache->count, 1);
+ cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+
+ refcount_set(&cache->refs, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
- atomic_set(&cache->trimming, 0);
+ atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
@@ -1664,7 +1832,7 @@
{
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;
@@ -1689,15 +1857,14 @@
free_extent_map(em);
break;
}
- if (bg->key.objectid != em->start ||
- bg->key.offset != em->len ||
+ if (bg->start != em->start || bg->length != em->len ||
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
(em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
em->start, em->len,
em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
- bg->key.objectid, bg->key.offset,
+ bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
free_extent_map(em);
@@ -1711,22 +1878,133 @@
return ret;
}
+static void read_block_group_item(struct btrfs_block_group *cache,
+ struct btrfs_path *path,
+ const struct btrfs_key *key)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_block_group_item bgi;
+ int slot = path->slots[0];
+
+ cache->length = key->offset;
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+ cache->used = btrfs_stack_block_group_used(&bgi);
+ cache->flags = btrfs_stack_block_group_flags(&bgi);
+}
+
+static int read_one_block_group(struct btrfs_fs_info *info,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ int need_clear)
+{
+ struct btrfs_block_group *cache;
+ struct btrfs_space_info *space_info;
+ const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
+ int ret;
+
+ ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
+
+ cache = btrfs_create_block_group_cache(info, key->objectid);
+ if (!cache)
+ return -ENOMEM;
+
+ read_block_group_item(cache, path, key);
+
+ set_free_space_tree_thresholds(cache);
+
+ if (need_clear) {
+ /*
+ * When we mount with old space cache, we need to
+ * set BTRFS_DC_CLEAR and set dirty flag.
+ *
+ * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+ * truncate the old free space cache inode and
+ * setup a new one.
+ * b) Setting 'dirty flag' makes sure that we flush
+ * the new space cache info onto disk.
+ */
+ if (btrfs_test_opt(info, SPACE_CACHE))
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+ }
+ if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
+ btrfs_err(info,
+"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
+ cache->start);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /*
+ * We need to exclude the super stripes now so that the space info has
+ * super bytes accounted for, otherwise we'll think we have more space
+ * than we actually do.
+ */
+ ret = exclude_super_stripes(cache);
+ if (ret) {
+ /* We may have excluded something, so call this just in case. */
+ btrfs_free_excluded_extents(cache);
+ goto error;
+ }
+
+ /*
+ * Check for two cases, either we are full, and therefore don't need
+ * to bother with the caching work since we won't find any space, or we
+ * are empty, and we can just add all the space in and be done with it.
+ * This saves us _a_lot_ of time, particularly in the full case.
+ */
+ if (cache->length == cache->used) {
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ btrfs_free_excluded_extents(cache);
+ } else if (cache->used == 0) {
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ add_new_free_space(cache, cache->start,
+ cache->start + cache->length);
+ btrfs_free_excluded_extents(cache);
+ }
+
+ ret = btrfs_add_block_group_cache(info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ goto error;
+ }
+ trace_btrfs_add_block_group(info, cache, 0);
+ btrfs_update_space_info(info, cache->flags, cache->length,
+ cache->used, cache->bytes_super, &space_info);
+
+ cache->space_info = space_info;
+
+ link_block_group(cache);
+
+ set_avail_alloc_bits(info, cache->flags);
+ if (btrfs_chunk_readonly(info, cache->start)) {
+ inc_block_group_ro(cache, 1);
+ } else if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ }
+ return 0;
+error:
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_path *path;
int ret;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_space_info *space_info;
struct btrfs_key key;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
int need_clear = 0;
u64 cache_gen;
- u64 feature;
- int mixed;
-
- feature = btrfs_super_incompat_flags(info->super_copy);
- mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
key.objectid = 0;
key.offset = 0;
@@ -1734,7 +2012,6 @@
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
cache_gen = btrfs_super_cache_generation(info->super_copy);
if (btrfs_test_opt(info, SPACE_CACHE) &&
@@ -1750,112 +2027,28 @@
if (ret != 0)
goto error;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- cache = btrfs_create_block_group_cache(info, found_key.objectid,
- found_key.offset);
- if (!cache) {
- ret = -ENOMEM;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ret = read_one_block_group(info, path, &key, need_clear);
+ if (ret < 0)
goto error;
- }
-
- if (need_clear) {
- /*
- * When we mount with old space cache, we need to
- * set BTRFS_DC_CLEAR and set dirty flag.
- *
- * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
- * truncate the old free space cache inode and
- * setup a new one.
- * b) Setting 'dirty flag' makes sure that we flush
- * the new space cache info onto disk.
- */
- if (btrfs_test_opt(info, SPACE_CACHE))
- cache->disk_cache_state = BTRFS_DC_CLEAR;
- }
-
- read_extent_buffer(leaf, &cache->item,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- sizeof(cache->item));
- cache->flags = btrfs_block_group_flags(&cache->item);
- if (!mixed &&
- ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
- (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
- btrfs_err(info,
-"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
- cache->key.objectid);
- btrfs_put_block_group(cache);
- ret = -EINVAL;
- goto error;
- }
-
- key.objectid = found_key.objectid + found_key.offset;
+ key.objectid += key.offset;
+ key.offset = 0;
btrfs_release_path(path);
-
- /*
- * We need to exclude the super stripes now so that the space
- * info has super bytes accounted for, otherwise we'll think
- * we have more space than we actually do.
- */
- ret = exclude_super_stripes(cache);
- if (ret) {
- /*
- * We may have excluded something, so call this just in
- * case.
- */
- btrfs_free_excluded_extents(cache);
- btrfs_put_block_group(cache);
- goto error;
- }
-
- /*
- * Check for two cases, either we are full, and therefore
- * don't need to bother with the caching work since we won't
- * find any space, or we are empty, and we can just add all
- * the space in and be done with it. This saves us _a_lot_ of
- * time, particularly in the full case.
- */
- if (found_key.offset == btrfs_block_group_used(&cache->item)) {
- cache->last_byte_to_unpin = (u64)-1;
- cache->cached = BTRFS_CACHE_FINISHED;
- btrfs_free_excluded_extents(cache);
- } else if (btrfs_block_group_used(&cache->item) == 0) {
- cache->last_byte_to_unpin = (u64)-1;
- cache->cached = BTRFS_CACHE_FINISHED;
- add_new_free_space(cache, found_key.objectid,
- found_key.objectid +
- found_key.offset);
- btrfs_free_excluded_extents(cache);
- }
-
- ret = btrfs_add_block_group_cache(info, cache);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- goto error;
- }
-
- trace_btrfs_add_block_group(info, cache, 0);
- btrfs_update_space_info(info, cache->flags, found_key.offset,
- btrfs_block_group_used(&cache->item),
- cache->bytes_super, &space_info);
-
- cache->space_info = space_info;
-
- link_block_group(cache);
-
- set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->key.objectid)) {
- inc_block_group_ro(cache, 1);
- } else if (btrfs_block_group_used(&cache->item) == 0) {
- ASSERT(list_empty(&cache->bg_list));
- btrfs_mark_bg_unused(cache);
- }
}
+ btrfs_release_path(path);
- rcu_read_lock();
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
+ list_for_each_entry(space_info, &info->space_info, list) {
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (list_empty(&space_info->block_groups[i]))
+ continue;
+ cache = list_first_entry(&space_info->block_groups[i],
+ struct btrfs_block_group,
+ list);
+ btrfs_sysfs_add_block_group_type(cache);
+ }
+
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -1875,7 +2068,6 @@
list)
inc_block_group_ro(cache, 1);
}
- rcu_read_unlock();
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
@@ -1884,38 +2076,66 @@
return ret;
}
+static int insert_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group_item bgi;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+
+ spin_lock(&block_group->lock);
+ btrfs_set_stack_block_group_used(&bgi, block_group->used);
+ btrfs_set_stack_block_group_chunk_objectid(&bgi,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
+ spin_unlock(&block_group->lock);
+
+ root = fs_info->extent_root;
+ return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+}
+
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group;
- struct btrfs_root *extent_root = fs_info->extent_root;
- struct btrfs_block_group_item item;
- struct btrfs_key key;
+ struct btrfs_block_group *block_group;
int ret = 0;
if (!trans->can_flush_pending_bgs)
return;
while (!list_empty(&trans->new_bgs)) {
+ int index;
+
block_group = list_first_entry(&trans->new_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
bg_list);
if (ret)
goto next;
- spin_lock(&block_group->lock);
- memcpy(&item, &block_group->item, sizeof(item));
- memcpy(&key, &block_group->key, sizeof(key));
- spin_unlock(&block_group->lock);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
- ret = btrfs_insert_item(trans, extent_root, &key, &item,
- sizeof(item));
+ ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
- ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
+ ret = btrfs_finish_chunk_alloc(trans, block_group->start,
+ block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, block_group);
+
+ /*
+ * If we restriped during balance, we may have added a new raid
+ * type, so now add the sysfs entries when it is safe to do so.
+ * We don't have to worry about locking here as it's handled in
+ * btrfs_sysfs_add_block_group_type.
+ */
+ if (block_group->space_info->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(block_group);
+
/* Already aborted the transaction if it failed. */
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -1928,20 +2148,18 @@
u64 type, u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
int ret;
btrfs_set_log_full_commit(trans);
- cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
+ cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
if (!cache)
return -ENOMEM;
- btrfs_set_block_group_used(&cache->item, bytes_used);
- btrfs_set_block_group_chunk_objectid(&cache->item,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
- btrfs_set_block_group_flags(&cache->item, type);
-
+ cache->length = size;
+ set_free_space_tree_thresholds(cache);
+ cache->used = bytes_used;
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
@@ -2000,54 +2218,6 @@
return 0;
}
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 num_devices;
- u64 stripped;
-
- /*
- * if restripe for this chunk_type is on pick target profile and
- * return, otherwise do the usual balance
- */
- stripped = get_restripe_target(fs_info, flags);
- if (stripped)
- return extended_to_chunk(stripped);
-
- num_devices = fs_info->fs_devices->rw_devices;
-
- stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
- BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
-
- if (num_devices == 1) {
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* turn raid0 into single device chunks */
- if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return stripped;
-
- /* turn mirroring into duplication */
- if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
- BTRFS_BLOCK_GROUP_RAID10))
- return stripped | BTRFS_BLOCK_GROUP_DUP;
- } else {
- /* they already had raid on here, just return */
- if (flags & stripped)
- return flags;
-
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* switch duplicated blocks with raid1 */
- if (flags & BTRFS_BLOCK_GROUP_DUP)
- return stripped | BTRFS_BLOCK_GROUP_RAID1;
-
- /* this is drive concat, leave it alone */
- }
-
- return flags;
-}
-
/*
* Mark one block group RO, can be called several times for the same block
* group.
@@ -2057,7 +2227,7 @@
* ensure we still have some free space after marking this
* block group RO.
*/
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache,
+int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -2093,7 +2263,7 @@
* If we are changing raid levels, try to allocate a
* corresponding block group with the new raid level.
*/
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
ret = btrfs_chunk_alloc(trans, alloc_flags,
CHUNK_ALLOC_FORCE);
@@ -2108,8 +2278,8 @@
}
}
- ret = inc_block_group_ro(cache, !do_chunk_alloc);
- if (!do_chunk_alloc)
+ ret = inc_block_group_ro(cache, 0);
+ if (!do_chunk_alloc || ret == -ETXTBSY)
goto unlock_out;
if (!ret)
goto out;
@@ -2118,9 +2288,11 @@
if (ret < 0)
goto out;
ret = inc_block_group_ro(cache, 0);
+ if (ret == -ETXTBSY)
+ goto unlock_out;
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
mutex_lock(&fs_info->chunk_mutex);
check_system_chunk(trans, alloc_flags);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2132,7 +2304,7 @@
return ret;
}
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
+void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
@@ -2142,9 +2314,8 @@
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
if (!--cache->ro) {
- num_bytes = cache->key.offset - cache->reserved -
- cache->pinned - cache->bytes_super -
- btrfs_block_group_used(&cache->item);
+ num_bytes = cache->length - cache->reserved -
+ cache->pinned - cache->bytes_super - cache->used;
sinfo->bytes_readonly -= num_bytes;
list_del_init(&cache->ro_list);
}
@@ -2152,17 +2323,23 @@
spin_unlock(&sinfo->lock);
}
-static int write_one_cache_group(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_block_group_cache *cache)
+static int update_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *root = fs_info->extent_root;
unsigned long bi;
struct extent_buffer *leaf;
+ struct btrfs_block_group_item bgi;
+ struct btrfs_key key;
- ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+ key.objectid = cache->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = cache->length;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -2171,7 +2348,11 @@
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
- write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
+ btrfs_set_stack_block_group_used(&bgi, cache->used);
+ btrfs_set_stack_block_group_chunk_objectid(&bgi,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_stack_block_group_flags(&bgi, cache->flags);
+ write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
btrfs_mark_buffer_dirty(leaf);
fail:
btrfs_release_path(path);
@@ -2179,7 +2360,7 @@
}
-static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+static int cache_save_setup(struct btrfs_block_group *block_group,
struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
@@ -2197,7 +2378,7 @@
* If this block group is smaller than 100 megs don't bother caching the
* block group.
*/
- if (block_group->key.offset < (100 * SZ_1M)) {
+ if (block_group->length < (100 * SZ_1M)) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -2298,14 +2479,15 @@
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->key.offset, SZ_256M);
+ num_pages = div_u64(block_group->length, SZ_256M);
if (!num_pages)
num_pages = 1;
num_pages *= 16;
num_pages *= PAGE_SIZE;
- ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
+ ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
+ num_pages);
if (ret)
goto out_put;
@@ -2343,7 +2525,7 @@
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache, *tmp;
+ struct btrfs_block_group *cache, *tmp;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_path *path;
@@ -2381,7 +2563,7 @@
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
@@ -2420,8 +2602,7 @@
while (!list_empty(&dirty)) {
bool drop_reserve = true;
- cache = list_first_entry(&dirty,
- struct btrfs_block_group_cache,
+ cache = list_first_entry(&dirty, struct btrfs_block_group,
dirty_list);
/*
* This can happen if something re-dirties a block group that
@@ -2473,7 +2654,7 @@
}
}
if (!ret) {
- ret = write_one_cache_group(trans, path, cache);
+ ret = update_block_group_item(trans, path, cache);
/*
* Our block group might still be attached to the list
* of new block groups in the transaction handle of some
@@ -2550,7 +2731,7 @@
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
@@ -2580,7 +2761,7 @@
spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
dirty_list);
/*
@@ -2626,7 +2807,7 @@
}
}
if (!ret) {
- ret = write_one_cache_group(trans, path, cache);
+ ret = update_block_group_item(trans, path, cache);
/*
* One of the free space endio workers might have
* created a new block group while updating a free space
@@ -2643,7 +2824,7 @@
if (ret == -ENOENT) {
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
- ret = write_one_cache_group(trans, path, cache);
+ ret = update_block_group_item(trans, path, cache);
}
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -2662,7 +2843,7 @@
* to use it without any locking
*/
while (!list_empty(io)) {
- cache = list_first_entry(io, struct btrfs_block_group_cache,
+ cache = list_first_entry(io, struct btrfs_block_group,
io_list);
list_del_init(&cache->io_list);
btrfs_wait_cache_io(trans, cache, path);
@@ -2677,7 +2858,7 @@
u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
u64 total = num_bytes;
u64 old_val;
u64 byte_in_group;
@@ -2708,11 +2889,11 @@
* is because we need the unpinning stage to actually add the
* space back to the block group, otherwise we will leak space.
*/
- if (!alloc && !btrfs_block_group_cache_done(cache))
+ if (!alloc && !btrfs_block_group_done(cache))
btrfs_cache_block_group(cache, 1);
- byte_in_group = bytenr - cache->key.objectid;
- WARN_ON(byte_in_group > cache->key.offset);
+ byte_in_group = bytenr - cache->start;
+ WARN_ON(byte_in_group > cache->length);
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
@@ -2721,11 +2902,11 @@
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
- old_val = btrfs_block_group_used(&cache->item);
- num_bytes = min(total, cache->key.offset - byte_in_group);
+ old_val = cache->used;
+ num_bytes = min(total, cache->length - byte_in_group);
if (alloc) {
old_val += num_bytes;
- btrfs_set_block_group_used(&cache->item, old_val);
+ cache->used = old_val;
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
cache->space_info->bytes_used += num_bytes;
@@ -2734,7 +2915,7 @@
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
- btrfs_set_block_group_used(&cache->item, old_val);
+ cache->used = old_val;
cache->pinned += num_bytes;
btrfs_space_info_update_bytes_pinned(info,
cache->space_info, num_bytes);
@@ -2743,11 +2924,9 @@
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(
- &cache->space_info->total_bytes_pinned,
- num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- set_extent_dirty(info->pinned_extents,
+ __btrfs_mod_total_bytes_pinned(cache->space_info,
+ num_bytes);
+ set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
}
@@ -2767,8 +2946,10 @@
* dirty list to avoid races between cleaner kthread and space
* cache writeout.
*/
- if (!alloc && old_val == 0)
- btrfs_mark_bg_unused(cache);
+ if (!alloc && old_val == 0) {
+ if (!btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(cache);
+ }
btrfs_put_block_group(cache);
total -= num_bytes;
@@ -2792,7 +2973,7 @@
* reservation and the block group has become read only we cannot make the
* reservation and return -EAGAIN, otherwise this function always succeeds.
*/
-int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
@@ -2811,6 +2992,13 @@
space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
+
+ /*
+ * Compression can use less space than we reserved, so wake
+ * tickets if that happens
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -2828,7 +3016,7 @@
* A and before transaction A commits you free that leaf, you call this with
* reserve set to 0 in order to clear the reservation.
*/
-void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
@@ -2844,6 +3032,8 @@
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
+
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -2852,12 +3042,10 @@
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
found->force_alloc = CHUNK_ALLOC_FORCE;
}
- rcu_read_unlock();
}
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
@@ -3034,9 +3222,7 @@
}
/*
- * If @is_allocation is true, reserve space in the system space info necessary
- * for allocating a chunk, otherwise if it's false, reserve space necessary for
- * removing a chunk.
+ * Reserve space in the system space for allocating or removing a chunk
*/
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
@@ -3093,7 +3279,7 @@
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
u64 last = 0;
while (1) {
@@ -3121,7 +3307,7 @@
spin_unlock(&block_group->lock);
ASSERT(block_group->io_ctl.inode == NULL);
iput(inode);
- last = block_group->key.objectid + block_group->key.offset;
+ last = block_group->start + block_group->length;
btrfs_put_block_group(block_group);
}
}
@@ -3133,7 +3319,7 @@
*/
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
@@ -3150,7 +3336,7 @@
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
block_group = list_first_entry(&info->unused_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
bg_list);
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
@@ -3159,7 +3345,7 @@
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
- block_group = rb_entry(n, struct btrfs_block_group_cache,
+ block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
rb_erase(&block_group->cache_node,
&info->block_group_cache_tree);
@@ -3183,21 +3369,14 @@
ASSERT(list_empty(&block_group->dirty_list));
ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list));
- ASSERT(atomic_read(&block_group->count) == 1);
+ ASSERT(refcount_read(&block_group->refs) == 1);
+ ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
}
spin_unlock(&info->block_group_cache_lock);
- /*
- * Now that all the block groups are freed, go through and free all the
- * space_info structs. This is only called during the final stages of
- * unmount, and so we know nobody is using them. We call
- * synchronize_rcu() once before we start, just to be on the safe side.
- */
- synchronize_rcu();
-
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
@@ -3213,8 +3392,71 @@
space_info->bytes_reserved > 0 ||
space_info->bytes_may_use > 0))
btrfs_dump_space_info(info, space_info, 0, 0);
+ WARN_ON(space_info->reclaim_size > 0);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
}
return 0;
}
+
+void btrfs_freeze_block_group(struct btrfs_block_group *cache)
+{
+ atomic_inc(&cache->frozen);
+}
+
+void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ bool cleanup;
+
+ spin_lock(&block_group->lock);
+ cleanup = (atomic_dec_and_test(&block_group->frozen) &&
+ block_group->removed);
+ spin_unlock(&block_group->lock);
+
+ if (cleanup) {
+ em_tree = &fs_info->mapping_tree;
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, block_group->start,
+ 1);
+ BUG_ON(!em); /* logic error, can't happen */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ /* once for us and once for the tree */
+ free_extent_map(em);
+ free_extent_map(em);
+
+ /*
+ * We may have left one free space entry and other possible
+ * tasks trimming this block group have left 1 entry each one.
+ * Free them if any.
+ */
+ __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ }
+}
+
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
+{
+ bool ret = true;
+
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ ret = false;
+ else
+ bg->swap_extents++;
+ spin_unlock(&bg->lock);
+
+ return ret;
+}
+
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
+{
+ spin_lock(&bg->lock);
+ ASSERT(!bg->ro);
+ ASSERT(bg->swap_extents >= amount);
+ bg->swap_extents -= amount;
+ spin_unlock(&bg->lock);
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 0758e6d..4c76143 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -13,6 +13,19 @@
};
/*
+ * This describes the state of the block_group for async discard. This is due
+ * to the two pass nature of it where extent discarding is prioritized over
+ * bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting
+ * between lists to prevent contention for discard state variables
+ * (eg. discard_cursor).
+ */
+enum btrfs_discard_state {
+ BTRFS_DISCARD_EXTENTS,
+ BTRFS_DISCARD_BITMAPS,
+ BTRFS_DISCARD_RESET_CURSOR,
+};
+
+/*
* Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to
* only allocate a chunk if we really need one.
*
@@ -34,7 +47,7 @@
struct mutex mutex;
wait_queue_head_t wait;
struct btrfs_work work;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
u64 progress;
refcount_t count;
};
@@ -42,14 +55,15 @@
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
-struct btrfs_block_group_cache {
- struct btrfs_key key;
- struct btrfs_block_group_item item;
+struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
spinlock_t lock;
+ u64 start;
+ u64 length;
u64 pinned;
u64 reserved;
+ u64 used;
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
@@ -100,8 +114,7 @@
/* For block groups in the same raid type */
struct list_head list;
- /* Usage count */
- atomic_t count;
+ refcount_t refs;
/*
* List of struct btrfs_free_clusters for this block group.
@@ -115,7 +128,22 @@
/* For read-only block groups */
struct list_head ro_list;
- atomic_t trimming;
+ /*
+ * When non-zero it means the block group's logical address and its
+ * device extents can not be reused for future block group allocations
+ * until the counter goes down to 0. This is to prevent them from being
+ * reused while some task is still using the block group after it was
+ * deleted - we want to make sure they can only be reused for new block
+ * groups after that task is done with the deleted block group.
+ */
+ atomic_t frozen;
+
+ /* For discard operations */
+ struct list_head discard_list;
+ int discard_index;
+ u64 discard_eligible_time;
+ u64 discard_cursor;
+ enum btrfs_discard_state discard_state;
/* For dirty block groups */
struct list_head dirty_list;
@@ -153,13 +181,35 @@
*/
int needs_free_space;
+ /*
+ * Number of extents in this block group used for swap files.
+ * All accesses protected by the spinlock 'lock'.
+ */
+ int swap_extents;
+
/* Record locked full stripes for RAID5/6 block group */
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
};
+static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
+{
+ return (block_group->start + block_group->length);
+}
+
+static inline bool btrfs_is_block_group_data_only(
+ struct btrfs_block_group *block_group)
+{
+ /*
+ * In mixed mode the fragmentation is expected to be high, lowering the
+ * efficiency, so only proper data block groups are considered.
+ */
+ return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
+ !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
static inline int btrfs_should_fragment_free_space(
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -170,29 +220,29 @@
}
#endif
-struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
+struct btrfs_block_group *btrfs_lookup_first_block_group(
struct btrfs_fs_info *info, u64 bytenr);
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
+struct btrfs_block_group *btrfs_lookup_block_group(
struct btrfs_fs_info *info, u64 bytenr);
-struct btrfs_block_group_cache *btrfs_next_block_group(
- struct btrfs_block_group_cache *cache);
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+struct btrfs_block_group *btrfs_next_block_group(
+ struct btrfs_block_group *cache);
+void btrfs_get_block_group(struct btrfs_block_group *cache);
+void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
+void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
-void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache);
-int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
+int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
+int btrfs_cache_block_group(struct btrfs_block_group *cache,
int load_cache_only);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
- struct btrfs_block_group_cache *cache);
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *cache);
+u64 add_new_free_space(struct btrfs_block_group *block_group,
u64 start, u64 end);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
@@ -200,22 +250,22 @@
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg);
+void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache,
+int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
+void btrfs_dec_block_group_ro(struct btrfs_block_group *cache);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int alloc);
-int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc);
-void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
u64 num_bytes, int delalloc);
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
@@ -240,12 +290,22 @@
return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
}
-static inline int btrfs_block_group_cache_done(
- struct btrfs_block_group_cache *cache)
+static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
{
smp_mb();
return cache->cached == BTRFS_CACHE_FINISHED ||
cache->cached == BTRFS_CACHE_ERROR;
}
+void btrfs_freeze_block_group(struct btrfs_block_group *cache);
+void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len);
+#endif
+
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);
+
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 343400d..bc920af 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -7,6 +7,98 @@
#include "transaction.h"
#include "block-group.h"
+/*
+ * HOW DO BLOCK RESERVES WORK
+ *
+ * Think of block_rsv's as buckets for logically grouped metadata
+ * reservations. Each block_rsv has a ->size and a ->reserved. ->size is
+ * how large we want our block rsv to be, ->reserved is how much space is
+ * currently reserved for this block reserve.
+ *
+ * ->failfast exists for the truncate case, and is described below.
+ *
+ * NORMAL OPERATION
+ *
+ * -> Reserve
+ * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
+ *
+ * We call into btrfs_reserve_metadata_bytes() with our bytes, which is
+ * accounted for in space_info->bytes_may_use, and then add the bytes to
+ * ->reserved, and ->size in the case of btrfs_block_rsv_add.
+ *
+ * ->size is an over-estimation of how much we may use for a particular
+ * operation.
+ *
+ * -> Use
+ * Entrance: btrfs_use_block_rsv
+ *
+ * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
+ * to determine the appropriate block_rsv to use, and then verify that
+ * ->reserved has enough space for our tree block allocation. Once
+ * successful we subtract fs_info->nodesize from ->reserved.
+ *
+ * -> Finish
+ * Entrance: btrfs_block_rsv_release
+ *
+ * We are finished with our operation, subtract our individual reservation
+ * from ->size, and then subtract ->size from ->reserved and free up the
+ * excess if there is any.
+ *
+ * There is some logic here to refill the delayed refs rsv or the global rsv
+ * as needed, otherwise the excess is subtracted from
+ * space_info->bytes_may_use.
+ *
+ * TYPES OF BLOCK RESERVES
+ *
+ * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
+ * These behave normally, as described above, just within the confines of the
+ * lifetime of their particular operation (transaction for the whole trans
+ * handle lifetime, for example).
+ *
+ * BLOCK_RSV_GLOBAL
+ * It is impossible to properly account for all the space that may be required
+ * to make our extent tree updates. This block reserve acts as an overflow
+ * buffer in case our delayed refs reserve does not reserve enough space to
+ * update the extent tree.
+ *
+ * We can steal from this in some cases as well, notably on evict() or
+ * truncate() in order to help users recover from ENOSPC conditions.
+ *
+ * BLOCK_RSV_DELALLOC
+ * The individual item sizes are determined by the per-inode size
+ * calculations, which are described with the delalloc code. This is pretty
+ * straightforward, it's just the calculation of ->size encodes a lot of
+ * different items, and thus it gets used when updating inodes, inserting file
+ * extents, and inserting checksums.
+ *
+ * BLOCK_RSV_DELREFS
+ * We keep a running tally of how many delayed refs we have on the system.
+ * We assume each one of these delayed refs are going to use a full
+ * reservation. We use the transaction items and pre-reserve space for every
+ * operation, and use this reservation to refill any gap between ->size and
+ * ->reserved that may exist.
+ *
+ * From there it's straightforward, removing a delayed ref means we remove its
+ * count from ->size and free up reservations as necessary. Since this is
+ * the most dynamic block reserve in the system, we will try to refill this
+ * block reserve first with any excess returned by any other block reserve.
+ *
+ * BLOCK_RSV_EMPTY
+ * This is the fallback block reserve to make us try to reserve space if we
+ * don't have a specific bucket for this allocation. It is mostly used for
+ * updating the device tree and such, since that is a separate pool we're
+ * content to just reserve space from the space_info on demand.
+ *
+ * BLOCK_RSV_TEMP
+ * This is used by things like truncate and iput. We will temporarily
+ * allocate a block reserve, set it to some size, and then truncate bytes
+ * until we have no space left. With ->failfast set we'll simply return
+ * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
+ * to make a new reservation. This is because these operations are
+ * unbounded, so we want to do as much work as we can, and then back off and
+ * re-reserve.
+ */
+
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes,
@@ -112,7 +204,7 @@
{
if (!rsv)
return;
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
kfree(rsv);
}
@@ -179,9 +271,9 @@
return ret;
}
-u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, u64 *qgroup_to_release)
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ u64 *qgroup_to_release)
{
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
@@ -298,9 +390,9 @@
if (block_rsv->reserved < block_rsv->size) {
num_bytes = block_rsv->size - block_rsv->reserved;
- block_rsv->reserved += num_bytes;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
num_bytes);
+ block_rsv->reserved = block_rsv->size;
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
@@ -347,7 +439,8 @@
void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
- btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
+ NULL);
WARN_ON(fs_info->trans_block_rsv.size > 0);
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -365,7 +458,7 @@
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv = NULL;
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
(root == fs_info->csum_root && trans->adding_csums) ||
(root == fs_info->uuid_root))
block_rsv = trans->block_rsv;
@@ -418,7 +511,8 @@
/*DEFAULT_RATELIMIT_BURST*/ 1);
if (__ratelimit(&_rs))
WARN(1, KERN_DEBUG
- "BTRFS: block rsv returned %d\n", ret);
+ "BTRFS: block rsv %d returned %d\n",
+ block_rsv->type, ret);
}
try_reserve:
ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d1428bb..0b6ae53 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -73,7 +73,7 @@
int min_factor);
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes, bool update_size);
-u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes, u64 *qgroup_to_release);
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
@@ -82,20 +82,12 @@
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize);
-
-static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
-}
-
static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u32 blocksize)
{
btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
- btrfs_block_rsv_release(fs_info, block_rsv, 0);
+ btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
}
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f3ff57b..5a43f8e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -7,6 +7,7 @@
#define BTRFS_INODE_H
#include <linux/hash.h>
+#include <linux/refcount.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
@@ -20,16 +21,36 @@
* new data the application may have written before commit.
*/
enum {
- BTRFS_INODE_ORDERED_DATA_CLOSE,
+ BTRFS_INODE_FLUSH_ON_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
+ /*
+ * Always set under the VFS' inode lock, otherwise it can cause races
+ * during fsync (we start as a fast fsync and then end up in a full
+ * fsync racing with ordered extent completion).
+ */
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
- BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
+ /*
+ * Set and used when logging an inode and it serves to signal that an
+ * inode does not have xattrs, so subsequent fsyncs can avoid searching
+ * for xattrs to log. This bit must be cleared whenever a xattr is added
+ * to an inode.
+ */
+ BTRFS_INODE_NO_XATTRS,
+ /*
+ * Set when we are in a context where we need to start a transaction and
+ * have dirty pages with the respective file range locked. This is to
+ * ensure that when reserving space for the transaction, if we are low
+ * on available space and need to flush delalloc, we will not flush
+ * delalloc for this inode, because that could result in a deadlock (on
+ * the file range, inode's io_tree).
+ */
+ BTRFS_INODE_NO_DELALLOC_FLUSH,
};
/* in memory btrfs inode */
@@ -60,12 +81,15 @@
*/
struct extent_io_tree io_failure_tree;
+ /*
+ * Keep track of where the inode has extent items mapped in order to
+ * make sure the i_size adjustments are accurate
+ */
+ struct extent_io_tree file_extent_tree;
+
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
- /* held while doing delalloc reservations */
- struct mutex delalloc_mutex;
-
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
@@ -148,6 +172,17 @@
u64 last_unlink_trans;
/*
+ * The id/generation of the last transaction where this inode was
+ * either the source or the destination of a clone/dedupe operation.
+ * Used when logging an inode to know if there are shared extents that
+ * need special care when logging checksum items, to avoid duplicate
+ * checksum items in a log (which can lead to a corruption where we end
+ * up with missing checksum ranges after log replay).
+ * Protected by the vfs inode lock.
+ */
+ u64 last_reflink_trans;
+
+ /*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
@@ -197,6 +232,11 @@
struct inode vfs_inode;
};
+static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
+{
+ return inode->root->fs_info->sectorsize;
+}
+
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -305,53 +345,25 @@
return ret;
}
-#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
-
struct btrfs_dio_private {
struct inode *inode;
- unsigned long flags;
u64 logical_offset;
u64 disk_bytenr;
u64 bytes;
- void *private;
- /* number of bios pending for this dio */
- atomic_t pending_bios;
-
- /* IO errors */
- int errors;
-
- /* orig_bio is our btrfs_io_bio */
- struct bio *orig_bio;
+ /*
+ * References to this structure. There is one reference per in-flight
+ * bio plus one while we're still setting up.
+ */
+ refcount_t refs;
/* dio_bio came from fs/direct-io.c */
struct bio *dio_bio;
- /*
- * The original bio may be split to several sub-bios, this is
- * done during endio of sub-bios
- */
- blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *,
- blk_status_t);
+ /* Array of checksums */
+ u8 csums[];
};
-/*
- * Disable DIO read nolock optimization, so new dio readers will be forced
- * to grab i_mutex. It is used to avoid the endless truncate due to
- * nonlocked dio read.
- */
-static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
-{
- set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
- smp_mb();
-}
-
-static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
-{
- smp_mb__before_atomic();
- clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
-}
-
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 72c70f5..81a8c87 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -77,7 +77,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include <linux/mutex.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
@@ -152,11 +151,8 @@
struct list_head ref_to_list; /* list */
struct list_head ref_from_list; /* list */
struct btrfsic_block *next_in_same_bio;
- void *orig_bio_bh_private;
- union {
- bio_end_io_t *bio;
- bh_end_io_t *bh;
- } orig_bio_bh_end_io;
+ void *orig_bio_private;
+ bio_end_io_t *orig_bio_end_io;
int submit_bio_bh_rw;
u64 flush_gen; /* only valid if !never_written */
};
@@ -325,14 +321,12 @@
u64 dev_bytenr, char **mapped_datav,
unsigned int num_pages,
struct bio *bio, int *bio_is_patched,
- struct buffer_head *bh,
int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
struct btrfs_super_block *const super_hdr);
static void btrfsic_bio_end_io(struct bio *bp);
-static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
const struct btrfsic_block *block,
int recursion_level);
@@ -399,8 +393,8 @@
b->never_written = 0;
b->mirror_num = 0;
b->next_in_same_bio = NULL;
- b->orig_bio_bh_private = NULL;
- b->orig_bio_bh_end_io.bio = NULL;
+ b->orig_bio_private = NULL;
+ b->orig_bio_end_io = NULL;
INIT_LIST_HEAD(&b->collision_resolving_node);
INIT_LIST_HEAD(&b->all_blocks_node);
INIT_LIST_HEAD(&b->ref_to_list);
@@ -636,12 +630,9 @@
int ret = 0;
int pass;
- BUG_ON(NULL == state);
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
- if (NULL == selected_super) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!selected_super)
return -ENOMEM;
- }
list_for_each_entry(device, dev_head, dev_list) {
int i;
@@ -768,29 +759,31 @@
struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *super_tmp;
u64 dev_bytenr;
- struct buffer_head *bh;
struct btrfsic_block *superblock_tmp;
int pass;
struct block_device *const superblock_bdev = device->bdev;
+ struct page *page;
+ struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
+ int ret = 0;
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
return -1;
- bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE,
- BTRFS_SUPER_INFO_SIZE);
- if (NULL == bh)
+
+ page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page))
return -1;
- super_tmp = (struct btrfs_super_block *)
- (bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1)));
+
+ super_tmp = page_address(page);
if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
- brelse(bh);
- return 0;
+ ret = 0;
+ goto out;
}
superblock_tmp =
@@ -800,9 +793,8 @@
if (NULL == superblock_tmp) {
superblock_tmp = btrfsic_block_alloc();
if (NULL == superblock_tmp) {
- pr_info("btrfsic: error, kmalloc failed!\n");
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
/* for superblock, only the dev_bytenr makes sense */
superblock_tmp->dev_bytenr = dev_bytenr;
@@ -886,8 +878,8 @@
mirror_num)) {
pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
next_bytenr, mirror_num);
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
next_block = btrfsic_block_lookup_or_add(
@@ -896,8 +888,8 @@
mirror_num, NULL);
if (NULL == next_block) {
btrfsic_release_block_ctx(&tmp_next_block_ctx);
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
next_block->disk_key = tmp_disk_key;
@@ -908,16 +900,17 @@
BTRFSIC_GENERATION_UNKNOWN);
btrfsic_release_block_ctx(&tmp_next_block_ctx);
if (NULL == l) {
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
}
}
if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
btrfsic_dump_tree_sub(state, superblock_tmp, 0);
- brelse(bh);
- return 0;
+out:
+ put_page(page);
+ return ret;
}
static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
@@ -925,9 +918,7 @@
struct btrfsic_stack_frame *sf;
sf = kzalloc(sizeof(*sf), GFP_NOFS);
- if (NULL == sf)
- pr_info("btrfsic: alloc memory failed!\n");
- else
+ if (sf)
sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
return sf;
}
@@ -1317,7 +1308,6 @@
if (NULL == l) {
l = btrfsic_block_link_alloc();
if (NULL == l) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(next_block_ctx);
*next_blockp = NULL;
return -1;
@@ -1474,7 +1464,6 @@
mirror_num,
&block_was_created);
if (NULL == next_block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&next_block_ctx);
return -1;
}
@@ -1744,7 +1733,6 @@
u64 dev_bytenr, char **mapped_datav,
unsigned int num_pages,
struct bio *bio, int *bio_is_patched,
- struct buffer_head *bh,
int submit_bio_bh_rw)
{
int is_metadata;
@@ -1903,9 +1891,9 @@
block->is_iodone = 0;
BUG_ON(NULL == bio_is_patched);
if (!*bio_is_patched) {
- block->orig_bio_bh_private =
+ block->orig_bio_private =
bio->bi_private;
- block->orig_bio_bh_end_io.bio =
+ block->orig_bio_end_io =
bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
@@ -1917,25 +1905,17 @@
bio->bi_private;
BUG_ON(NULL == chained_block);
- block->orig_bio_bh_private =
- chained_block->orig_bio_bh_private;
- block->orig_bio_bh_end_io.bio =
- chained_block->orig_bio_bh_end_io.
- bio;
+ block->orig_bio_private =
+ chained_block->orig_bio_private;
+ block->orig_bio_end_io =
+ chained_block->orig_bio_end_io;
block->next_in_same_bio = chained_block;
bio->bi_private = block;
}
- } else if (NULL != bh) {
- block->is_iodone = 0;
- block->orig_bio_bh_private = bh->b_private;
- block->orig_bio_bh_end_io.bh = bh->b_end_io;
- block->next_in_same_bio = NULL;
- bh->b_private = block;
- bh->b_end_io = btrfsic_bh_end_io;
} else {
block->is_iodone = 1;
- block->orig_bio_bh_private = NULL;
- block->orig_bio_bh_end_io.bio = NULL;
+ block->orig_bio_private = NULL;
+ block->orig_bio_end_io = NULL;
block->next_in_same_bio = NULL;
}
}
@@ -2026,7 +2006,6 @@
block = btrfsic_block_alloc();
if (NULL == block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&block_ctx);
goto continue_loop;
}
@@ -2043,8 +2022,8 @@
block->is_iodone = 0;
BUG_ON(NULL == bio_is_patched);
if (!*bio_is_patched) {
- block->orig_bio_bh_private = bio->bi_private;
- block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;
@@ -2055,24 +2034,17 @@
bio->bi_private;
BUG_ON(NULL == chained_block);
- block->orig_bio_bh_private =
- chained_block->orig_bio_bh_private;
- block->orig_bio_bh_end_io.bio =
- chained_block->orig_bio_bh_end_io.bio;
+ block->orig_bio_private =
+ chained_block->orig_bio_private;
+ block->orig_bio_end_io =
+ chained_block->orig_bio_end_io;
block->next_in_same_bio = chained_block;
bio->bi_private = block;
}
- } else if (NULL != bh) {
- block->is_iodone = 0;
- block->orig_bio_bh_private = bh->b_private;
- block->orig_bio_bh_end_io.bh = bh->b_end_io;
- block->next_in_same_bio = NULL;
- bh->b_private = block;
- bh->b_end_io = btrfsic_bh_end_io;
} else {
block->is_iodone = 1;
- block->orig_bio_bh_private = NULL;
- block->orig_bio_bh_end_io.bio = NULL;
+ block->orig_bio_private = NULL;
+ block->orig_bio_end_io = NULL;
block->next_in_same_bio = NULL;
}
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
@@ -2113,8 +2085,8 @@
iodone_w_error = 1;
BUG_ON(NULL == block);
- bp->bi_private = block->orig_bio_bh_private;
- bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+ bp->bi_private = block->orig_bio_private;
+ bp->bi_end_io = block->orig_bio_end_io;
do {
struct btrfsic_block *next_block;
@@ -2147,38 +2119,6 @@
bp->bi_end_io(bp);
}
-static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
-{
- struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
- int iodone_w_error = !uptodate;
- struct btrfsic_dev_state *dev_state;
-
- BUG_ON(NULL == block);
- dev_state = block->dev_state;
- if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
- iodone_w_error,
- btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, block->dev_state->name,
- block->dev_bytenr, block->mirror_num);
-
- block->iodone_w_error = iodone_w_error;
- if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
- dev_state->last_flush_gen++;
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bh_end_io() new %s flush_gen=%llu\n",
- dev_state->name, dev_state->last_flush_gen);
- }
- if (block->submit_bio_bh_rw & REQ_FUA)
- block->flush_gen = 0; /* FUA completed means block is on disk */
-
- bh->b_private = block->orig_bio_bh_private;
- bh->b_end_io = block->orig_bio_bh_end_io.bh;
- block->is_iodone = 1; /* for FLUSH, this releases the block */
- bh->b_end_io(bh, uptodate);
-}
-
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const superblock,
@@ -2286,7 +2226,6 @@
mirror_num,
&was_created);
if (NULL == next_block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&tmp_next_block_ctx);
return -1;
}
@@ -2594,10 +2533,8 @@
&state->block_link_hashtable);
if (NULL == l) {
l = btrfsic_block_link_alloc();
- if (NULL == l) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!l)
return NULL;
- }
l->block_ref_to = next_block;
l->block_ref_from = from_block;
@@ -2641,10 +2578,9 @@
struct btrfsic_dev_state *dev_state;
block = btrfsic_block_alloc();
- if (NULL == block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!block)
return NULL;
- }
+
dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
if (NULL == dev_state) {
pr_info("btrfsic: error, lookup dev_state failed!\n");
@@ -2731,63 +2667,6 @@
&btrfsic_dev_state_hashtable);
}
-int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh)
-{
- struct btrfsic_dev_state *dev_state;
-
- if (!btrfsic_is_initialized)
- return submit_bh(op, op_flags, bh);
-
- mutex_lock(&btrfsic_mutex);
- /* since btrfsic_submit_bh() might also be called before
- * btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev);
-
- /* Only called to write the superblock (incl. FLUSH/FUA) */
- if (NULL != dev_state &&
- (op == REQ_OP_WRITE) && bh->b_size > 0) {
- u64 dev_bytenr;
-
- dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr;
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n",
- op, op_flags, (unsigned long long)bh->b_blocknr,
- dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
- btrfsic_process_written_block(dev_state, dev_bytenr,
- &bh->b_data, 1, NULL,
- NULL, bh, op_flags);
- } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) {
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n",
- op, op_flags, bh->b_bdev);
- if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info("btrfsic_submit_bh(%s) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->name);
- } else {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
-
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = op_flags;
- block->orig_bio_bh_private = bh->b_private;
- block->orig_bio_bh_end_io.bh = bh->b_end_io;
- block->next_in_same_bio = NULL;
- bh->b_private = block;
- bh->b_end_io = btrfsic_bh_end_io;
- }
- }
- mutex_unlock(&btrfsic_mutex);
- return submit_bh(op, op_flags, bh);
-}
-
static void __btrfsic_submit_bio(struct bio *bio)
{
struct btrfsic_dev_state *dev_state;
@@ -2839,7 +2718,7 @@
btrfsic_process_written_block(dev_state, dev_bytenr,
mapped_datav, segs,
bio, &bio_is_patched,
- NULL, bio->bi_opf);
+ bio->bi_opf);
bio_for_each_segment(bvec, bio, iter)
kunmap(bvec.bv_page);
kfree(mapped_datav);
@@ -2863,8 +2742,8 @@
block->iodone_w_error = 0;
block->flush_gen = dev_state->last_flush_gen + 1;
block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_bh_private = bio->bi_private;
- block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;
@@ -2906,10 +2785,8 @@
return -1;
}
state = kvzalloc(sizeof(*state), GFP_KERNEL);
- if (!state) {
- pr_info("btrfs check-integrity: allocation failed!\n");
+ if (!state)
return -ENOMEM;
- }
if (!btrfsic_is_initialized) {
mutex_init(&btrfsic_mutex);
@@ -2938,7 +2815,6 @@
ds = btrfsic_dev_state_alloc();
if (NULL == ds) {
- pr_info("btrfs check-integrity: kmalloc() failed!\n");
mutex_unlock(&btrfsic_mutex);
return -ENOMEM;
}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 9bf4359..bcc730a 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -7,11 +7,9 @@
#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh);
void btrfsic_submit_bio(struct bio *bio);
int btrfsic_submit_bio_wait(struct bio *bio);
#else
-#define btrfsic_submit_bh submit_bh
#define btrfsic_submit_bio submit_bio
#define btrfsic_submit_bio_wait submit_bio_wait
#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 28f78e4..646416b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -39,6 +39,8 @@
case BTRFS_COMPRESS_ZSTD:
case BTRFS_COMPRESS_NONE:
return btrfs_compress_types[type];
+ default:
+ break;
}
return NULL;
@@ -60,6 +62,75 @@
return false;
}
+static int compression_compress_pages(int type, struct list_head *ws,
+ struct address_space *mapping, u64 start, struct page **pages,
+ unsigned long *out_pages, unsigned long *total_in,
+ unsigned long *total_out)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ return zlib_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_LZO:
+ return lzo_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_ZSTD:
+ return zstd_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can happen when compression races with remount setting
+ * it to 'no compress', while caller doesn't call
+ * inode_need_compress() to check if we really need to
+ * compress.
+ *
+ * Not a big deal, just need to inform caller that we
+ * haven't allocated any pages yet.
+ */
+ *out_pages = 0;
+ return -E2BIG;
+ }
+}
+
+static int compression_decompress_bio(int type, struct list_head *ws,
+ struct compressed_bio *cb)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static int compression_decompress(int type, struct list_head *ws,
+ unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
@@ -71,18 +142,17 @@
(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
}
-static int check_compressed_csum(struct btrfs_inode *inode,
- struct compressed_bio *cb,
+static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
u64 disk_start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int ret;
struct page *page;
unsigned long i;
char *kaddr;
u8 csum[BTRFS_CSUM_SIZE];
+ struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
if (inode->flags & BTRFS_INODE_NODATASUM)
@@ -93,24 +163,22 @@
for (i = 0; i < cb->nr_pages; i++) {
page = cb->compressed_pages[i];
- crypto_shash_init(shash);
kaddr = kmap_atomic(page);
- crypto_shash_update(shash, kaddr, PAGE_SIZE);
+ crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
kunmap_atomic(kaddr);
- crypto_shash_final(shash, (u8 *)&csum);
if (memcmp(&csum, cb_sum, csum_size)) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
- ret = -EIO;
- goto fail;
+ if (btrfs_io_bio(bio)->device)
+ btrfs_dev_stat_inc_and_print(
+ btrfs_io_bio(bio)->device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ return -EIO;
}
cb_sum += csum_size;
-
}
- ret = 0;
-fail:
- return ret;
+ return 0;
}
/* when we finish reading compressed pages from the disk, we
@@ -145,7 +213,6 @@
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
- ASSERT(btrfs_io_bio(cb->orig_bio));
btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
@@ -157,7 +224,7 @@
goto csum_failed;
inode = cb->inode;
- ret = check_compressed_csum(BTRFS_I(inode), cb,
+ ret = check_compressed_csum(BTRFS_I(inode), bio,
(u64)bio->bi_iter.bi_sector << 9);
if (ret)
goto csum_failed;
@@ -306,23 +373,23 @@
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
-blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
unsigned long nr_pages,
- unsigned int write_flags)
+ unsigned int write_flags,
+ struct cgroup_subsys_state *blkcg_css)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
unsigned long bytes_left;
int pg_index = 0;
struct page *page;
u64 first_byte = disk_start;
- struct block_device *bdev;
blk_status_t ret;
- int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
@@ -330,7 +397,7 @@
return BLK_STS_RESOURCE;
refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
- cb->inode = inode;
+ cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
@@ -339,13 +406,15 @@
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
- bdev = fs_info->fs_devices->latest_bdev;
-
bio = btrfs_bio_alloc(first_byte);
- bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+
+ if (blkcg_css) {
+ bio->bi_opf |= REQ_CGROUP_PUNT;
+ kthread_associate_blkcg(blkcg_css);
+ }
refcount_set(&cb->pending_bios, 1);
/* create and submit bios for the compressed pages */
@@ -354,7 +423,7 @@
int submit = 0;
page = compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
+ page->mapping = inode->vfs_inode.i_mapping;
if (bio->bi_iter.bi_size)
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
@@ -378,17 +447,18 @@
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(fs_info, bio, 0, 1);
+ ret = btrfs_map_bio(fs_info, bio, 0);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
}
bio = btrfs_bio_alloc(first_byte);
- bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+ if (blkcg_css)
+ bio->bi_opf |= REQ_CGROUP_PUNT;
bio_add_page(bio, page, PAGE_SIZE, 0);
}
if (bytes_left < PAGE_SIZE) {
@@ -409,12 +479,15 @@
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(fs_info, bio, 0, 1);
+ ret = btrfs_map_bio(fs_info, bio, 0);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
}
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
+
return 0;
}
@@ -553,7 +626,6 @@
unsigned long nr_pages;
unsigned long pg_index;
struct page *page;
- struct block_device *bdev;
struct bio *comp_bio;
u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
u64 em_len;
@@ -604,8 +676,6 @@
if (!cb->compressed_pages)
goto fail1;
- bdev = fs_info->fs_devices->latest_bdev;
-
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
__GFP_HIGHMEM);
@@ -624,7 +694,6 @@
cb->len = bio->bi_iter.bi_size;
comp_bio = btrfs_bio_alloc(cur_disk_byte);
- bio_set_dev(comp_bio, bdev);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -660,7 +729,7 @@
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(inode, comp_bio,
- sums);
+ (u64)-1, sums);
BUG_ON(ret); /* -ENOMEM */
}
@@ -668,14 +737,13 @@
fs_info->sectorsize);
sums += csum_size * nr_sectors;
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
comp_bio->bi_status = ret;
bio_endio(comp_bio);
}
comp_bio = btrfs_bio_alloc(cur_disk_byte);
- bio_set_dev(comp_bio, bdev);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -689,11 +757,11 @@
BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
comp_bio->bi_status = ret;
bio_endio(comp_bio);
@@ -764,26 +832,6 @@
static struct workspace_manager heuristic_wsm;
-static void heuristic_init_workspace_manager(void)
-{
- btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
-}
-
-static void heuristic_cleanup_workspace_manager(void)
-{
- btrfs_cleanup_workspace_manager(&heuristic_wsm);
-}
-
-static struct list_head *heuristic_get_workspace(unsigned int level)
-{
- return btrfs_get_workspace(&heuristic_wsm, level);
-}
-
-static void heuristic_put_workspace(struct list_head *ws)
-{
- btrfs_put_workspace(&heuristic_wsm, ws);
-}
-
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -824,12 +872,7 @@
}
const struct btrfs_compress_op btrfs_heuristic_compress = {
- .init_workspace_manager = heuristic_init_workspace_manager,
- .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
- .get_workspace = heuristic_get_workspace,
- .put_workspace = heuristic_put_workspace,
- .alloc_workspace = alloc_heuristic_ws,
- .free_workspace = free_heuristic_ws,
+ .workspace_manager = &heuristic_wsm,
};
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
@@ -840,13 +883,44 @@
&btrfs_zstd_compress,
};
-void btrfs_init_workspace_manager(struct workspace_manager *wsm,
- const struct btrfs_compress_op *ops)
+static struct list_head *alloc_workspace(int type, unsigned int level)
{
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static void free_workspace(int type, struct list_head *ws)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
+ case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
+ case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static void btrfs_init_workspace_manager(int type)
+{
+ struct workspace_manager *wsm;
struct list_head *workspace;
- wsm->ops = ops;
-
+ wsm = btrfs_compress_op[type]->workspace_manager;
INIT_LIST_HEAD(&wsm->idle_ws);
spin_lock_init(&wsm->ws_lock);
atomic_set(&wsm->total_ws, 0);
@@ -856,7 +930,7 @@
* Preallocate one workspace for each compression type so we can
* guarantee forward progress in the worst case
*/
- workspace = wsm->ops->alloc_workspace(0);
+ workspace = alloc_workspace(type, 0);
if (IS_ERR(workspace)) {
pr_warn(
"BTRFS: cannot preallocate compression workspace, will try later\n");
@@ -867,14 +941,16 @@
}
}
-void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+static void btrfs_cleanup_workspace_manager(int type)
{
+ struct workspace_manager *wsman;
struct list_head *ws;
+ wsman = btrfs_compress_op[type]->workspace_manager;
while (!list_empty(&wsman->idle_ws)) {
ws = wsman->idle_ws.next;
list_del(ws);
- wsman->ops->free_workspace(ws);
+ free_workspace(type, ws);
atomic_dec(&wsman->total_ws);
}
}
@@ -885,9 +961,9 @@
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
- unsigned int level)
+struct list_head *btrfs_get_workspace(int type, unsigned int level)
{
+ struct workspace_manager *wsm;
struct list_head *workspace;
int cpus = num_online_cpus();
unsigned nofs_flag;
@@ -897,6 +973,7 @@
wait_queue_head_t *ws_wait;
int *free_ws;
+ wsm = btrfs_compress_op[type]->workspace_manager;
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -932,7 +1009,7 @@
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- workspace = wsm->ops->alloc_workspace(level);
+ workspace = alloc_workspace(type, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -965,21 +1042,34 @@
static struct list_head *get_workspace(int type, int level)
{
- return btrfs_compress_op[type]->get_workspace(level);
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
+ case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
+void btrfs_put_workspace(int type, struct list_head *ws)
{
+ struct workspace_manager *wsm;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
+ wsm = btrfs_compress_op[type]->workspace_manager;
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -995,7 +1085,7 @@
}
spin_unlock(ws_lock);
- wsm->ops->free_workspace(ws);
+ free_workspace(type, ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
@@ -1003,7 +1093,34 @@
static void put_workspace(int type, struct list_head *ws)
{
- return btrfs_compress_op[type]->put_workspace(ws);
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+/*
+ * Adjust @level according to the limits of the compression algorithm or
+ * fallback to default
+ */
+static unsigned int btrfs_compress_set_level(int type, unsigned level)
+{
+ const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+
+ if (level == 0)
+ level = ops->default_level;
+ else
+ level = min(level, ops->max_level);
+
+ return level;
}
/*
@@ -1042,10 +1159,8 @@
level = btrfs_compress_set_level(type, level);
workspace = get_workspace(type, level);
- ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
- start, pages,
- out_pages,
- total_in, total_out);
+ ret = compression_compress_pages(type, workspace, mapping, start, pages,
+ out_pages, total_in, total_out);
put_workspace(type, workspace);
return ret;
}
@@ -1071,7 +1186,7 @@
int type = cb->compress_type;
workspace = get_workspace(type, 0);
- ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+ ret = compression_decompress_bio(type, workspace, cb);
put_workspace(type, workspace);
return ret;
@@ -1089,9 +1204,8 @@
int ret;
workspace = get_workspace(type, 0);
- ret = btrfs_compress_op[type]->decompress(workspace, data_in,
- dest_page, start_byte,
- srclen, destlen);
+ ret = compression_decompress(type, workspace, data_in, dest_page,
+ start_byte, srclen, destlen);
put_workspace(type, workspace);
return ret;
@@ -1099,18 +1213,18 @@
void __init btrfs_init_compress(void)
{
- int i;
-
- for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
- btrfs_compress_op[i]->init_workspace_manager();
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
+ zstd_init_workspace_manager();
}
void __cold btrfs_exit_compress(void)
{
- int i;
-
- for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
- btrfs_compress_op[i]->cleanup_workspace_manager();
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
+ zstd_cleanup_workspace_manager();
}
/*
@@ -1158,7 +1272,7 @@
/* copy bytes from the working buffer into the pages */
while (working_bytes > 0) {
bytes = min_t(unsigned long, bvec.bv_len,
- PAGE_SIZE - buf_offset);
+ PAGE_SIZE - (buf_offset % PAGE_SIZE));
bytes = min(bytes, working_bytes);
kaddr = kmap_atomic(bvec.bv_page);
@@ -1616,19 +1730,3 @@
return level;
}
-
-/*
- * Adjust @level according to the limits of the compression algorithm or
- * fallback to default
- */
-unsigned int btrfs_compress_set_level(int type, unsigned level)
-{
- const struct btrfs_compress_op *ops = btrfs_compress_op[type];
-
- if (level == 0)
- level = ops->default_level;
- else
- level = min(level, ops->max_level);
-
- return level;
-}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 4cb8be9..8001b70 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -8,6 +8,8 @@
#include <linux/sizes.h>
+struct btrfs_inode;
+
/*
* We want to make sure that amount of RAM required to uncompress an extent is
* reasonable, so we limit the total size in ram of a compressed extent to
@@ -88,12 +90,13 @@
unsigned long total_out, u64 disk_start,
struct bio *bio);
-blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
unsigned long nr_pages,
- unsigned int write_flags);
+ unsigned int write_flags,
+ struct cgroup_subsys_state *blkcg_css);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
@@ -104,11 +107,10 @@
BTRFS_COMPRESS_ZLIB = 1,
BTRFS_COMPRESS_LZO = 2,
BTRFS_COMPRESS_ZSTD = 3,
- BTRFS_COMPRESS_TYPES = 3,
+ BTRFS_NR_COMPRESS_TYPES = 4,
};
struct workspace_manager {
- const struct btrfs_compress_op *ops;
struct list_head idle_ws;
spinlock_t ws_lock;
/* Number of free workspaces */
@@ -119,50 +121,18 @@
wait_queue_head_t ws_wait;
};
-void btrfs_init_workspace_manager(struct workspace_manager *wsm,
- const struct btrfs_compress_op *ops);
-struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
- unsigned int level);
-void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
-void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
+struct list_head *btrfs_get_workspace(int type, unsigned int level);
+void btrfs_put_workspace(int type, struct list_head *ws);
struct btrfs_compress_op {
- void (*init_workspace_manager)(void);
-
- void (*cleanup_workspace_manager)(void);
-
- struct list_head *(*get_workspace)(unsigned int level);
-
- void (*put_workspace)(struct list_head *ws);
-
- struct list_head *(*alloc_workspace)(unsigned int level);
-
- void (*free_workspace)(struct list_head *workspace);
-
- int (*compress_pages)(struct list_head *workspace,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out);
-
- int (*decompress_bio)(struct list_head *workspace,
- struct compressed_bio *cb);
-
- int (*decompress)(struct list_head *workspace,
- unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen);
-
+ struct workspace_manager *workspace_manager;
/* Maximum level supported by the compression algorithm */
unsigned int max_level;
unsigned int default_level;
};
/* The heuristic workspaces are managed via the 0th workspace manager */
-#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1)
+#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES
extern const struct btrfs_compress_op btrfs_heuristic_compress;
extern const struct btrfs_compress_op btrfs_zlib_compress;
@@ -172,8 +142,41 @@
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
-unsigned int btrfs_compress_set_level(int type, unsigned level);
-
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dac30b0..5addd1e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -31,9 +31,14 @@
static const struct btrfs_csums {
u16 size;
- const char *name;
+ const char name[10];
+ const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
+ [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
+ [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
+ [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
+ .driver = "blake2b-256" },
};
int btrfs_super_csum_size(const struct btrfs_super_block *s)
@@ -51,36 +56,28 @@
return btrfs_csums[csum_type].name;
}
+/*
+ * Return driver name if defined, otherwise the name that's also a valid driver
+ * name
+ */
+const char *btrfs_super_csum_driver(u16 csum_type)
+{
+ /* csum type is validated at mount time */
+ return btrfs_csums[csum_type].driver[0] ?
+ btrfs_csums[csum_type].driver :
+ btrfs_csums[csum_type].name;
+}
+
+size_t __attribute_const__ btrfs_get_num_csums(void)
+{
+ return ARRAY_SIZE(btrfs_csums);
+}
+
struct btrfs_path *btrfs_alloc_path(void)
{
return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
-/*
- * set all locked nodes in the path to blocking locks. This should
- * be done before scheduling
- */
-noinline void btrfs_set_path_blocking(struct btrfs_path *p)
-{
- int i;
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- if (!p->nodes[i] || !p->locks[i])
- continue;
- /*
- * If we currently have a spinning reader or writer lock this
- * will bump the count of blocking holders and drop the
- * spinlock.
- */
- if (p->locks[i] == BTRFS_READ_LOCK) {
- btrfs_set_lock_blocking_read(p->nodes[i]);
- p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
- btrfs_set_lock_blocking_write(p->nodes[i]);
- p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
- }
- }
-}
-
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
@@ -147,47 +144,10 @@
return eb;
}
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
-/* cowonly root (everything not a reference counted cow subvolume), just get
- * put onto a simple dirty list. transaction.c walks this to make sure they
- * get properly updated on disk.
+/*
+ * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
+ * just get put onto a simple dirty list. Transaction walks this list to make
+ * sure they get properly updated on disk.
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
@@ -226,9 +186,9 @@
int level;
struct btrfs_disk_key disk_key;
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -238,7 +198,8 @@
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
- &disk_key, level, buf->start, 0);
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -348,7 +309,6 @@
struct rb_root *tm_root;
struct rb_node *node;
struct rb_node *next;
- struct seq_list *cur_elem;
struct tree_mod_elem *tm;
u64 min_seq = (u64)-1;
u64 seq_putting = elem->seq;
@@ -360,18 +320,20 @@
list_del(&elem->list);
elem->seq = 0;
- list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
- if (cur_elem->seq < min_seq) {
- if (seq_putting > cur_elem->seq) {
- /*
- * blocker with lower sequence number exists, we
- * cannot remove anything from the log
- */
- write_unlock(&fs_info->tree_mod_log_lock);
- return;
- }
- min_seq = cur_elem->seq;
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *first;
+
+ first = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ if (seq_putting > first->seq) {
+ /*
+ * Blocker with lower sequence number exists, we
+ * cannot remove anything from the log.
+ */
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return;
}
+ min_seq = first->seq;
}
/*
@@ -869,12 +831,11 @@
struct extent_buffer *buf)
{
/*
- * Tree blocks not in reference counted trees and tree roots
- * are never shared. If a block was allocated after the last
- * snapshot and the block was not allocated by tree relocation,
- * we know the block is not shared.
+ * Tree blocks not in shareable trees and tree roots are never shared.
+ * If a block was allocated after the last snapshot and the block was
+ * not allocated by tree relocation, we know the block is not shared.
*/
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
buf != root->node && buf != root->commit_root &&
(btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item) ||
@@ -969,9 +930,7 @@
if (new_flags != 0) {
int level = btrfs_header_level(buf);
- ret = btrfs_set_disk_extent_flags(trans,
- buf->start,
- buf->len,
+ ret = btrfs_set_disk_extent_flags(trans, buf,
new_flags, level, 0);
if (ret)
return ret;
@@ -1002,7 +961,8 @@
const struct btrfs_disk_key *disk_key,
int level,
u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *ret;
@@ -1031,7 +991,7 @@
ret = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, disk_key, level,
- hint, empty_size);
+ hint, empty_size, nest);
trans->can_flush_pending_bgs = true;
return ret;
@@ -1054,7 +1014,8 @@
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size)
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -1069,9 +1030,9 @@
btrfs_assert_tree_locked(buf);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -1085,7 +1046,7 @@
parent_start = parent->start;
cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size);
+ level, search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1112,7 +1073,7 @@
return ret;
}
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
btrfs_tree_unlock(cow);
@@ -1128,7 +1089,7 @@
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
parent_start = buf->start;
- extent_buffer_get(cow);
+ atomic_inc(&cow->refs);
ret = tree_mod_log_insert_root(root->node, cow, 1);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
@@ -1247,7 +1208,7 @@
switch (tm->op) {
case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
- /* Fallthrough */
+ fallthrough;
case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case MOD_LOG_KEY_REMOVE:
btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1519,7 +1480,8 @@
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret)
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
@@ -1558,7 +1520,7 @@
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0);
+ parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -1578,6 +1540,22 @@
return 0;
}
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is same as CPU order and
+ * we can avoid the conversion.
+ */
+static int comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
/*
* compare two keys in a memcmp fashion
*/
@@ -1590,11 +1568,12 @@
return btrfs_comp_cpu_keys(&k1, k2);
}
+#endif
/*
* same as comp_keys only with two btrfs_key's
*/
-int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
if (k1->objectid > k2->objectid)
return 1;
@@ -1713,7 +1692,8 @@
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize));
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -1745,15 +1725,8 @@
{
int low = 0;
int high = max;
- int mid;
int ret;
- struct btrfs_disk_key *tmp = NULL;
- struct btrfs_disk_key unaligned;
- unsigned long offset;
- char *kaddr = NULL;
- unsigned long map_start = 0;
- unsigned long map_len = 0;
- int err;
+ const int key_size = sizeof(struct btrfs_disk_key);
if (low > high) {
btrfs_err(eb->fs_info,
@@ -1764,32 +1737,26 @@
}
while (low < high) {
+ unsigned long oip;
+ unsigned long offset;
+ struct btrfs_disk_key *tmp;
+ struct btrfs_disk_key unaligned;
+ int mid;
+
mid = (low + high) / 2;
offset = p + mid * item_size;
+ oip = offset_in_page(offset);
- if (!kaddr || offset < map_start ||
- (offset + sizeof(struct btrfs_disk_key)) >
- map_start + map_len) {
+ if (oip + key_size <= PAGE_SIZE) {
+ const unsigned long idx = offset >> PAGE_SHIFT;
+ char *kaddr = page_address(eb->pages[idx]);
- err = map_private_extent_buffer(eb, offset,
- sizeof(struct btrfs_disk_key),
- &kaddr, &map_start, &map_len);
-
- if (!err) {
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
- map_start);
- } else if (err == 1) {
- read_extent_buffer(eb, &unaligned,
- offset, sizeof(unaligned));
- tmp = &unaligned;
- } else {
- return err;
- }
-
+ tmp = (struct btrfs_disk_key *)(kaddr + oip);
} else {
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
- map_start);
+ read_extent_buffer(eb, &unaligned, offset, key_size);
+ tmp = &unaligned;
}
+
ret = comp_keys(tmp, key);
if (ret < 0)
@@ -1810,9 +1777,9 @@
* leaves vs nodes
*/
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot)
+ int *slot)
{
- if (level == 0)
+ if (btrfs_header_level(eb) == 0)
return generic_bin_search(eb,
offsetof(struct btrfs_leaf, items),
sizeof(struct btrfs_item),
@@ -1924,7 +1891,8 @@
btrfs_tree_lock(child);
btrfs_set_lock_blocking_write(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
@@ -1960,10 +1928,11 @@
left = NULL;
if (left) {
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left);
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1975,10 +1944,11 @@
right = NULL;
if (right) {
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right);
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -2067,7 +2037,7 @@
/* update the path */
if (left) {
if (btrfs_header_nritems(left) > orig_slot) {
- extent_buffer_get(left);
+ atomic_inc(&left->refs);
/* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
@@ -2138,7 +2108,7 @@
if (left) {
u32 left_nr;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
@@ -2146,7 +2116,8 @@
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left);
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
@@ -2192,7 +2163,7 @@
if (right) {
u32 right_nr;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
@@ -2201,7 +2172,7 @@
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right);
+ &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
@@ -2410,32 +2381,6 @@
}
/*
- * This releases any locks held in the path starting at level and
- * going all the way up to the root.
- *
- * btrfs_search_slot will keep the lock held on higher nodes in a few
- * corner cases, such as COW of the block at slot zero in the node. This
- * ignores those rules, and it should only be called when there are no
- * more updates to be done higher up in the tree.
- */
-noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
-{
- int i;
-
- if (path->keep_locks)
- return;
-
- for (i = level; i < BTRFS_MAX_LEVEL; i++) {
- if (!path->nodes[i])
- continue;
- if (!path->locks[i])
- continue;
- btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
- path->locks[i] = 0;
- }
-}
-
-/*
* helper function for btrfs_search_slot. The goal is to find a block
* in cache without setting the path to blocking. If we find the block
* we return zero and the path is unchanged.
@@ -2451,16 +2396,15 @@
struct btrfs_fs_info *fs_info = root->fs_info;
u64 blocknr;
u64 gen;
- struct extent_buffer *b = *eb_ret;
struct extent_buffer *tmp;
struct btrfs_key first_key;
int ret;
int parent_level;
- blocknr = btrfs_node_blockptr(b, slot);
- gen = btrfs_node_ptr_generation(b, slot);
- parent_level = btrfs_header_level(b);
- btrfs_node_key_to_cpu(b, &first_key, slot);
+ blocknr = btrfs_node_blockptr(*eb_ret, slot);
+ gen = btrfs_node_ptr_generation(*eb_ret, slot);
+ parent_level = btrfs_header_level(*eb_ret);
+ btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
@@ -2604,19 +2548,6 @@
return ret;
}
-static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
- int level, int *prev_cmp, int *slot)
-{
- if (*prev_cmp != 0) {
- *prev_cmp = btrfs_bin_search(b, key, level, slot);
- return *prev_cmp;
- }
-
- *slot = 0;
-
- return 0;
-}
-
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
u64 iobjectid, u64 ioff, u8 key_type,
struct btrfs_key *found_key)
@@ -2658,12 +2589,9 @@
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
/*
* The commit roots are read only so we always do read locks,
@@ -2683,7 +2611,7 @@
} else {
b = root->commit_root;
- extent_buffer_get(b);
+ atomic_inc(&b->refs);
}
level = btrfs_header_level(b);
/*
@@ -2701,6 +2629,9 @@
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -2710,7 +2641,7 @@
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ b = __btrfs_read_lock_root_node(root, p->recurse);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2727,6 +2658,17 @@
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
@@ -2816,12 +2758,10 @@
}
while (b) {
+ int dec = 0;
+
level = btrfs_header_level(b);
- /*
- * setup the path here so we can release it under lock
- * contention with the cow code
- */
if (cow) {
bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
@@ -2851,11 +2791,13 @@
btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b);
+ &b,
+ BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1], &b);
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
@@ -2888,77 +2830,25 @@
}
}
- ret = key_search(b, key, level, &prev_cmp, &slot);
- if (ret < 0)
- goto done;
-
- if (level != 0) {
- int dec = 0;
- if (ret && slot > 0) {
- dec = 1;
- slot -= 1;
- }
- p->slots[level] = slot;
- err = setup_nodes_for_search(trans, root, p, b, level,
- ins_len, &write_lock_level);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
- b = p->nodes[level];
- slot = p->slots[level];
-
- /*
- * slot 0 is special, if we change the key
- * we have to update the parent pointer
- * which means we must have a write lock
- * on the parent
- */
- if (slot == 0 && ins_len &&
- write_lock_level < level + 1) {
- write_lock_level = level + 1;
- btrfs_release_path(p);
- goto again;
- }
-
- unlock_up(p, level, lowest_unlock,
- min_write_lock_level, &write_lock_level);
-
- if (level == lowest_level) {
- if (dec)
- p->slots[level]++;
- goto done;
- }
-
- err = read_block_for_search(root, p, &b, level,
- slot, key);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
-
- if (!p->skip_locking) {
- level = btrfs_header_level(b);
- if (level <= write_lock_level) {
- if (!btrfs_try_tree_write_lock(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- }
- p->locks[level] = BTRFS_WRITE_LOCK;
- } else {
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- p->locks[level] = BTRFS_READ_LOCK;
- }
- p->nodes[level] = b;
- }
+ /*
+ * If btrfs_bin_search returns an exact match (prev_cmp == 0)
+ * we can safely assume the target key will always be in slot 0
+ * on lower levels due to the invariants BTRFS' btree provides,
+ * namely that a btrfs_key_ptr entry always points to the
+ * lowest key in the child node, thus we can skip searching
+ * lower levels
+ */
+ if (prev_cmp == 0) {
+ slot = 0;
+ ret = 0;
} else {
+ ret = btrfs_bin_search(b, key, &slot);
+ prev_cmp = ret;
+ if (ret < 0)
+ goto done;
+ }
+
+ if (level == 0) {
p->slots[level] = slot;
if (ins_len > 0 &&
btrfs_leaf_free_space(b) < ins_len) {
@@ -2983,6 +2873,68 @@
min_write_lock_level, NULL);
goto done;
}
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
+ &write_lock_level);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ b = p->nodes[level];
+ slot = p->slots[level];
+
+ /*
+ * Slot 0 is special, if we change the key we have to update
+ * the parent pointer which means we must have a write lock on
+ * the parent
+ */
+ if (slot == 0 && ins_len && write_lock_level < level + 1) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ unlock_up(p, level, lowest_unlock, min_write_lock_level,
+ &write_lock_level);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ if (!p->skip_locking) {
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ if (!btrfs_try_tree_write_lock(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_lock(b);
+ }
+ p->locks[level] = BTRFS_WRITE_LOCK;
+ } else {
+ if (!btrfs_tree_read_lock_atomic(b)) {
+ btrfs_set_path_blocking(p);
+ __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
+ p->recurse);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ }
+ p->nodes[level] = b;
+ }
}
ret = 1;
done:
@@ -3019,7 +2971,6 @@
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
- int prev_cmp = -1;
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
@@ -3039,6 +2990,8 @@
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
+ int dec = 0;
+
level = btrfs_header_level(b);
p->nodes[level] = b;
@@ -3050,56 +3003,49 @@
*/
btrfs_unlock_up_safe(p, level + 1);
- /*
- * Since we can unwind ebs we want to do a real search every
- * time.
- */
- prev_cmp = -1;
- ret = key_search(b, key, level, &prev_cmp, &slot);
+ ret = btrfs_bin_search(b, key, &slot);
if (ret < 0)
goto done;
- if (level != 0) {
- int dec = 0;
- if (ret && slot > 0) {
- dec = 1;
- slot -= 1;
- }
- p->slots[level] = slot;
- unlock_up(p, level, lowest_unlock, 0, NULL);
-
- if (level == lowest_level) {
- if (dec)
- p->slots[level]++;
- goto done;
- }
-
- err = read_block_for_search(root, p, &b, level,
- slot, key);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
-
- level = btrfs_header_level(b);
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- b = tree_mod_log_rewind(fs_info, p, b, time_seq);
- if (!b) {
- ret = -ENOMEM;
- goto done;
- }
- p->locks[level] = BTRFS_READ_LOCK;
- p->nodes[level] = b;
- } else {
+ if (level == 0) {
p->slots[level] = slot;
unlock_up(p, level, lowest_unlock, 0, NULL);
goto done;
}
+
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ if (!btrfs_tree_read_lock_atomic(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ }
+ b = tree_mod_log_rewind(fs_info, p, b, time_seq);
+ if (!b) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
}
ret = 1;
done:
@@ -3272,6 +3218,58 @@
}
/*
+ * Check key order of two sibling extent buffers.
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption can not be detected by tree-checker:
+ *
+ * Leaf @left | Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
+ *
+ * Key f6 in leaf @left itself is valid, but not valid when the next
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since tree checker has ensured all key order in each tree block
+ * is correct, we only need to bother the last key of @left and the first
+ * key of @right.
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* No key to check in one of the tree blocks */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
* try to push data from one node into the next node left in the
* tree.
*
@@ -3315,6 +3313,12 @@
} else
push_items = min(src_nritems - 8, push_items);
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3383,6 +3387,12 @@
if (max_push < push_items)
push_items = max_push;
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
@@ -3439,7 +3449,8 @@
btrfs_node_key(lower, &lower_key, 0);
c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0);
+ root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3464,7 +3475,7 @@
free_extent_buffer(old);
add_root_to_dirty_list(root);
- extent_buffer_get(c);
+ atomic_inc(&c->refs);
path->nodes[level] = c;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
path->slots[level] = 0;
@@ -3569,7 +3580,7 @@
btrfs_node_key(c, &disk_key, mid);
split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0);
+ c->start, 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3617,19 +3628,17 @@
{
struct btrfs_item *start_item;
struct btrfs_item *end_item;
- struct btrfs_map_token token;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- btrfs_init_map_token(&token, l);
start_item = btrfs_item_nr(start);
end_item = btrfs_item_nr(end);
- data_len = btrfs_token_item_offset(l, start_item, &token) +
- btrfs_token_item_size(l, start_item, &token);
- data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
+ data_len = btrfs_item_offset(l, start_item) +
+ btrfs_item_size(l, start_item);
+ data_len = data_len - btrfs_item_offset(l, end_item);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -3760,8 +3769,8 @@
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(i);
- push_space -= btrfs_token_item_size(right, item, &token);
- btrfs_set_token_item_offset(right, item, push_space, &token);
+ push_space -= btrfs_token_item_size(&token, item);
+ btrfs_set_token_item_offset(&token, item, push_space);
}
left_nritems -= push_items;
@@ -3840,7 +3849,7 @@
if (IS_ERR(right))
return 1;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(right);
@@ -3849,7 +3858,7 @@
/* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right);
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
@@ -3861,6 +3870,12 @@
if (left_nritems == 0)
goto out_unlock;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
@@ -3969,10 +3984,9 @@
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(left, item, &token);
- btrfs_set_token_item_offset(left, item,
- ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
- &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item,
+ ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -4002,9 +4016,8 @@
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(i);
- push_space = push_space - btrfs_token_item_size(right,
- item, &token);
- btrfs_set_token_item_offset(right, item, push_space, &token);
+ push_space = push_space - btrfs_token_item_size(&token, item);
+ btrfs_set_token_item_offset(&token, item, push_space);
}
btrfs_mark_buffer_dirty(left);
@@ -4075,7 +4088,7 @@
if (IS_ERR(left))
return 1;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(left);
@@ -4086,7 +4099,8 @@
/* cow and double check */
ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left);
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
@@ -4100,6 +4114,10 @@
goto out;
}
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
@@ -4146,9 +4164,8 @@
struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
- ioff = btrfs_token_item_offset(right, item, &token);
- btrfs_set_token_item_offset(right, item,
- ioff + rt_data_off, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -4349,8 +4366,18 @@
else
btrfs_item_key(l, &disk_key, mid);
+ /*
+ * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0);
+ l->start, 0, num_doubles ?
+ BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4595,9 +4622,7 @@
return ret;
path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, new_key, &item_size, 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4651,9 +4676,8 @@
u32 ioff;
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff + size_diff, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item, ioff + size_diff);
}
/* shift the data */
@@ -4750,9 +4774,8 @@
u32 ioff;
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - data_size, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item, ioff - data_size);
}
/* shift the data */
@@ -4772,14 +4795,20 @@
}
}
-/*
- * this is a helper for btrfs_insert_empty_items, the main goal here is
- * to save stack depth by doing the bulk of the work in a function
- * that doesn't call btrfs_search_slot
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @cpu_key: array of keys for items to be inserted
+ * @data_size: size of the body of each item we are going to insert
+ * @nr: size of @cpu_key/@data_size arrays
*/
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+ int nr)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -4790,6 +4819,12 @@
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
+ u32 total_size;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
@@ -4816,7 +4851,8 @@
if (old_data < data_end) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
@@ -4828,9 +4864,9 @@
u32 ioff;
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - total_data, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item,
+ ioff - total_data);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -4849,10 +4885,9 @@
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- btrfs_set_token_item_offset(leaf, item,
- data_end - data_size[i], &token);
data_end -= data_size[i];
- btrfs_set_token_item_size(leaf, item, data_size[i], &token);
+ btrfs_set_token_item_offset(&token, item, data_end);
+ btrfs_set_token_item_size(&token, item, data_size[i]);
}
btrfs_set_header_nritems(leaf, nritems + nr);
@@ -4893,8 +4928,7 @@
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size,
- total_data, total_size, nr);
+ setup_items_for_insert(root, path, cpu_key, data_size, nr);
return 0;
}
@@ -4997,7 +5031,7 @@
root_sub_used(root, leaf->len);
- extent_buffer_get(leaf);
+ atomic_inc(&leaf->refs);
btrfs_free_tree_block(trans, root, leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
@@ -5040,9 +5074,8 @@
u32 ioff;
item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff + dsize, &token);
+ ioff = btrfs_token_item_offset(&token, item);
+ btrfs_set_token_item_offset(&token, item, ioff + dsize);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -5078,7 +5111,7 @@
* for possible call to del_ptr below
*/
slot = path->slots[1];
- extent_buffer_get(leaf);
+ atomic_inc(&leaf->refs);
btrfs_set_path_blocking(path);
wret = push_leaf_left(trans, root, path, 1, 1,
@@ -5213,7 +5246,7 @@
while (1) {
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
- sret = btrfs_bin_search(cur, min_key, level, &slot);
+ sret = btrfs_bin_search(cur, min_key, &slot);
if (sret < 0) {
ret = sret;
goto out;
@@ -5496,7 +5529,9 @@
}
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5531,7 +5566,9 @@
ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index cd77c06..bcc6848 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,17 +28,19 @@
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
#include <linux/crc32c.h>
+#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
+#include "locking.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_pending_snapshot;
struct btrfs_delayed_ref_root;
struct btrfs_space_info;
-struct btrfs_block_group_cache;
+struct btrfs_block_group;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
@@ -56,9 +58,9 @@
* filesystem data as well that can be used to read data in order to repair
* read errors on other disks.
*
- * Current value is derived from RAID1 with 2 copies.
+ * Current value is derived from RAID1C4 with 4 copies.
*/
-#define BTRFS_MAX_MIRRORS (2 + 1)
+#define BTRFS_MAX_MIRRORS (4 + 1)
#define BTRFS_MAX_LEVEL 8
@@ -100,6 +102,14 @@
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+/*
+ * Deltas are an effective way to populate global statistics. Give macro names
+ * to make it clear what we're doing. An example is discard_extents in
+ * btrfs_free_space_ctl.
+ */
+#define BTRFS_STAT_NR_ENTRIES 2
+#define BTRFS_STAT_CURR 0
+#define BTRFS_STAT_PREV 1
/*
* Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
@@ -291,7 +301,8 @@
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
- BTRFS_FEATURE_INCOMPAT_METADATA_UUID)
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34)
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -363,6 +374,7 @@
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
+ unsigned int recurse:1;
};
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
@@ -413,7 +425,7 @@
/* We did a full search and couldn't create a cluster */
bool fragmented;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
/*
* when a cluster is allocated from a block group, we put the
* cluster onto a list in the block group so that it can
@@ -438,6 +450,36 @@
struct mutex lock;
};
+/* Discard control. */
+/*
+ * Async discard uses multiple lists to differentiate the discard filter
+ * parameters. Index 0 is for completely free block groups where we need to
+ * ensure the entire block group is trimmed without being lossy. Indices
+ * afterwards represent monotonically decreasing discard filter sizes to
+ * prioritize what should be discarded next.
+ */
+#define BTRFS_NR_DISCARD_LISTS 3
+#define BTRFS_DISCARD_INDEX_UNUSED 0
+#define BTRFS_DISCARD_INDEX_START 1
+
+struct btrfs_discard_ctl {
+ struct workqueue_struct *discard_workers;
+ struct delayed_work work;
+ spinlock_t lock;
+ struct btrfs_block_group *block_group;
+ struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
+ u64 prev_discard;
+ atomic_t discardable_extents;
+ atomic64_t discardable_bytes;
+ u64 max_discard_size;
+ unsigned long delay;
+ u32 iops_limit;
+ u32 kbps_limit;
+ u64 discard_extent_bytes;
+ u64 discard_bitmap_bytes;
+ atomic64_t discard_bytes_saved;
+};
+
/* delayed seq elem */
struct seq_list {
struct list_head list;
@@ -453,7 +495,7 @@
ORPHAN_CLEANUP_DONE = 2,
};
-void btrfs_init_async_reclaim_work(struct work_struct *work);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
/* fs_info */
struct reloc_control;
@@ -476,10 +518,15 @@
void *ptr;
struct inode *inode;
/*
- * If true, ptr points to a struct btrfs_block_group_cache. Otherwise,
- * ptr points to a struct btrfs_device.
+ * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
+ * points to a struct btrfs_device.
*/
bool is_block_group;
+ /*
+ * Only used when 'is_block_group' is true and it is the number of
+ * extents used by a swapfile for this block group ('ptr' field).
+ */
+ int bg_extent_count;
};
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
@@ -500,11 +547,6 @@
/* Used to record internally whether fs has been frozen */
BTRFS_FS_FROZEN,
/*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
- */
- BTRFS_FS_EXCL_OP,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -520,10 +562,26 @@
*/
BTRFS_FS_CSUM_IMPL_FAST,
+ /* Indicate that the discard workqueue can service discards. */
+ BTRFS_FS_DISCARD_RUNNING,
+
/* Indicate that we can't trust the free space tree for caching yet */
BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
};
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -536,6 +594,7 @@
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
struct btrfs_root *free_space_root;
+ struct btrfs_root *data_reloc_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -551,8 +610,8 @@
/* keep track of unallocated space */
atomic64_t free_chunk_space;
- struct extent_io_tree freed_extents[2];
- struct extent_io_tree *pinned_extents;
+ /* Track ranges which are used by log trees blocks/logged data extents */
+ struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
struct extent_map_tree mapping_tree;
@@ -651,7 +710,6 @@
struct rw_semaphore cleanup_work_sem;
struct rw_semaphore subvol_sem;
- struct srcu_struct subvol_srcu;
spinlock_t trans_lock;
/*
@@ -713,12 +771,10 @@
struct btrfs_workqueue *endio_workers;
struct btrfs_workqueue *endio_meta_workers;
struct btrfs_workqueue *endio_raid56_workers;
- struct btrfs_workqueue *endio_repair_workers;
struct btrfs_workqueue *rmw_workers;
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
- struct btrfs_workqueue *submit_workers;
struct btrfs_workqueue *caching_workers;
struct btrfs_workqueue *readahead_workers;
@@ -735,6 +791,7 @@
u32 thread_pool_size;
struct kobject *space_info_kobj;
+ struct kobject *qgroups_kobj;
u64 total_pinned;
@@ -811,6 +868,8 @@
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
+ struct btrfs_discard_ctl discard_ctl;
+
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
#endif
@@ -873,6 +932,7 @@
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
@@ -896,10 +956,22 @@
*/
int send_in_progress;
+ /* Type of exclusive operation running */
+ unsigned long exclusive_operation;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct kobject *debug_kobj;
+ struct kobject *discard_debug_kobj;
+ struct list_head allocated_roots;
+
+ spinlock_t eb_leak_lock;
+ struct list_head allocated_ebs;
+#endif
};
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
@@ -907,11 +979,6 @@
return sb->s_fs_info;
}
-struct btrfs_subvolume_writers {
- struct percpu_counter counter;
- wait_queue_head_t wait;
-};
-
/*
* The state of btrfs root
*/
@@ -923,7 +990,28 @@
* is used to tell us when more checks are required
*/
BTRFS_ROOT_IN_TRANS_SETUP,
- BTRFS_ROOT_REF_COWS,
+
+ /*
+ * Set if tree blocks of this root can be shared by other roots.
+ * Only subvolume trees and their reloc trees have this bit set.
+ * Conflicts with TRACK_DIRTY bit.
+ *
+ * This affects two things:
+ *
+ * - How balance works
+ * For shareable roots, we need to use reloc tree and do path
+ * replacement for balance, and need various pre/post hooks for
+ * snapshot creation to handle them.
+ *
+ * While for non-shareable trees, we just simply do a tree search
+ * with COW.
+ *
+ * - How dirty roots are tracked
+ * For shareable roots, btrfs_record_root_in_trans() is needed to
+ * track them, while non-subvolume roots have TRACK_DIRTY bit, they
+ * don't need to set this manually.
+ */
+ BTRFS_ROOT_SHAREABLE,
BTRFS_ROOT_TRACK_DIRTY,
BTRFS_ROOT_IN_RADIX,
BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
@@ -1011,11 +1099,10 @@
u64 highest_objectid;
- u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
- /* the dirty list is only used by non-reference counted roots */
+ /* The dirty list is only used by non-shareable roots */
struct list_head dirty_list;
struct list_head root_list;
@@ -1089,8 +1176,9 @@
* root_item_lock.
*/
int dedupe_in_progress;
- struct btrfs_subvolume_writers *subv_writers;
- atomic_t will_be_snapshotted;
+ /* For exclusion of snapshot creation and nocow writes */
+ struct btrfs_drew_lock snapshot_lock;
+
atomic_t snapshot_force_cow;
/* For qgroup metadata reserved space */
@@ -1105,29 +1193,52 @@
/* Record pairs of swapped blocks for qgroup */
struct btrfs_qgroup_swapped_blocks swapped_blocks;
+ /* Used only by log trees, when logging csum items */
+ struct extent_io_tree log_csum_range;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
};
-struct btrfs_clone_extent_info {
+/*
+ * Structure that conveys information about an extent that is going to replace
+ * all the extents in a file range.
+ */
+struct btrfs_replace_extent_info {
u64 disk_offset;
u64 disk_len;
u64 data_offset;
u64 data_len;
u64 file_offset;
+ /* Pointer to a file extent item of type regular or prealloc. */
char *extent_buf;
- u32 item_size;
+ /*
+ * Set to true when attempting to replace a file range with a new extent
+ * described by this structure, set to false when attempting to clone an
+ * existing extent into a file range.
+ */
+ bool is_new_extent;
+ /* Meaningful only if is_new_extent is true. */
+ int qgroup_reserved;
+ /*
+ * Meaningful only if is_new_extent is true.
+ * Used to track how many extent items we have already inserted in a
+ * subvolume tree that refer to the extent described by this structure,
+ * so that we know when to create a new delayed ref or update an existing
+ * one.
+ */
+ int insertions;
};
struct btrfs_file_private {
void *filldir_buf;
};
-static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
-{
- return btrfs_sb(inode->i_sb)->sectorsize;
-}
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
@@ -1175,7 +1286,7 @@
#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
#define BTRFS_MOUNT_NOSSD (1 << 9)
-#define BTRFS_MOUNT_DISCARD (1 << 10)
+#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10)
#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
@@ -1194,6 +1305,7 @@
#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
+#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -1205,18 +1317,18 @@
BTRFS_MOUNT_##opt)
#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-{ \
+do { \
if (!btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_set_opt(fs_info->mount_opt, opt); \
-}
+} while (0)
#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-{ \
+do { \
if (btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_clear_opt(fs_info->mount_opt, opt); \
-}
+} while (0)
/*
* Requests for changes that need to be done during transaction commit.
@@ -1295,7 +1407,7 @@
BTRFS_INODE_ROOT_ITEM_INIT)
struct btrfs_map_token {
- const struct extent_buffer *eb;
+ struct extent_buffer *eb;
char *kaddr;
unsigned long offset;
};
@@ -1307,7 +1419,8 @@
struct extent_buffer *eb)
{
token->eb = eb;
- token->kaddr = NULL;
+ token->kaddr = page_address(eb->pages[0]);
+ token->offset = 0;
}
/* some macros to generate set/get functions for the struct fields. This
@@ -1318,6 +1431,16 @@
#define cpu_to_le8(v) (v)
#define __le8 u8
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
#define read_eb_member(eb, ptr, type, member, result) (\
read_extent_buffer(eb, (char *)(result), \
((unsigned long)(ptr)) + \
@@ -1331,15 +1454,14 @@
sizeof(((type *)0)->member)))
#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
- const void *ptr, unsigned long off, \
- struct btrfs_map_token *token); \
-void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \
- unsigned long off, u##bits val, \
- struct btrfs_map_token *token); \
+u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off); \
+void btrfs_set_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off, \
+ u##bits val); \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off); \
-void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val);
DECLARE_BTRFS_SETGET_BITS(8)
@@ -1354,53 +1476,49 @@
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
return btrfs_get_##bits(eb, s, offsetof(type, member)); \
} \
-static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
u##bits val) \
{ \
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
} \
-static inline u##bits btrfs_token_##name(const struct extent_buffer *eb,\
- const type *s, \
- struct btrfs_map_token *token) \
+static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
+ const type *s) \
{ \
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
- return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
+ return btrfs_get_token_##bits(token, s, offsetof(type, member));\
} \
-static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
- type *s, u##bits val, \
- struct btrfs_map_token *token) \
+static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
+ type *s, u##bits val) \
{ \
BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
- btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
+ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
const type *p = page_address(eb->pages[0]); \
- u##bits res = le##bits##_to_cpu(p->member); \
- return res; \
+ return get_unaligned_le##bits(&p->member); \
} \
-static inline void btrfs_set_##name(struct extent_buffer *eb, \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
type *p = page_address(eb->pages[0]); \
- p->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &p->member); \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const type *s) \
{ \
- return le##bits##_to_cpu(s->member); \
+ return get_unaligned_le##bits(&s->member); \
} \
static inline void btrfs_set_##name(type *s, u##bits val) \
{ \
- s->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &s->member); \
}
-
-static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb,
+static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
BUILD_BUG_ON(sizeof(u64) !=
@@ -1408,7 +1526,7 @@
return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
total_bytes));
}
-static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb,
+static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s,
u64 val)
{
@@ -1512,31 +1630,31 @@
return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
}
-static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb,
struct btrfs_chunk *c, int nr)
{
return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
}
-static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb,
struct btrfs_chunk *c, int nr)
{
return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
}
/* struct btrfs_block_group_item */
-BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item,
used, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
+BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item,
used, 64);
-BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid,
struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
+BTRFS_SETGET_FUNCS(block_group_chunk_objectid,
struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_flags,
+BTRFS_SETGET_FUNCS(block_group_flags,
struct btrfs_block_group_item, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
struct btrfs_block_group_item, flags, 64);
/* struct btrfs_free_space_info */
@@ -1598,31 +1716,21 @@
BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
chunk_offset, 64);
BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
-
-static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
-{
- unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
- return (unsigned long)dev + ptr;
-}
-
BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
generation, 64);
BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
-BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
-
-
BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
-static inline void btrfs_tree_block_key(struct extent_buffer *eb,
+static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
struct btrfs_tree_block_info *item,
struct btrfs_disk_key *key)
{
read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
}
-static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
+static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
struct btrfs_tree_block_info *item,
struct btrfs_disk_key *key)
{
@@ -1660,12 +1768,6 @@
return 0;
}
-BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
-BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
- generation, 64);
-BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
-BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
-
/* struct btrfs_node */
BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
@@ -1674,7 +1776,7 @@
BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
generation, 64);
-static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
+static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr)
{
unsigned long ptr;
ptr = offsetof(struct btrfs_node, ptrs) +
@@ -1682,7 +1784,7 @@
return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
}
-static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
+static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb,
int nr, u64 val)
{
unsigned long ptr;
@@ -1691,7 +1793,7 @@
btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
}
-static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
+static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr)
{
unsigned long ptr;
ptr = offsetof(struct btrfs_node, ptrs) +
@@ -1699,7 +1801,7 @@
return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
}
-static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
+static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb,
int nr, u64 val)
{
unsigned long ptr;
@@ -1717,7 +1819,7 @@
void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr);
-static inline void btrfs_set_node_key(struct extent_buffer *eb,
+static inline void btrfs_set_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr;
@@ -1841,6 +1943,52 @@
BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Optimized helpers for little-endian architectures where CPU and on-disk
+ * structures have the same endianness and we can skip conversions.
+ */
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
+ const struct btrfs_disk_key *disk_key)
+{
+ memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *cpu_key)
+{
+ memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_node_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_item_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *cpu_key)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_dir_item_key(eb, item, disk_key);
+}
+
+#else
+
static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
const struct btrfs_disk_key *disk)
{
@@ -1882,6 +2030,8 @@
btrfs_disk_key_to_cpu(key, &disk_key);
}
+#endif
+
/* struct btrfs_header */
BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
@@ -1929,16 +2079,6 @@
btrfs_set_header_flags(eb, flags);
}
-static inline unsigned long btrfs_header_fsid(void)
-{
- return offsetof(struct btrfs_header, fsid);
-}
-
-static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb)
-{
- return offsetof(struct btrfs_header, chunk_tree_uuid);
-}
-
static inline int btrfs_is_leaf(const struct extent_buffer *eb)
{
return btrfs_header_level(eb) == 0;
@@ -2169,6 +2309,9 @@
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
+const char *btrfs_super_csum_driver(u16 csum_type);
+size_t __attribute_const__ btrfs_get_num_csums(void);
+
/*
* The leaf data grows from end-to-front in the node.
@@ -2185,7 +2328,8 @@
}
/* struct btrfs_file_extent_item */
-BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
+ type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
struct btrfs_file_extent_item, disk_bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
@@ -2194,6 +2338,8 @@
struct btrfs_file_extent_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
+ struct btrfs_file_extent_item, ram_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
struct btrfs_file_extent_item, disk_num_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
@@ -2210,6 +2356,7 @@
return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
}
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
disk_bytenr, 64);
BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -2403,7 +2550,7 @@
int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes);
-void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache);
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
unsigned long count);
void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
@@ -2413,9 +2560,9 @@
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags);
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
+ int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
@@ -2425,7 +2572,8 @@
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size);
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2445,34 +2593,63 @@
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 flags,
+ struct extent_buffer *eb, u64 flags,
int level, int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
-int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len);
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
+ u64 len);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref);
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
-void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+/*
+ * Different levels for to flush space when doing space reservations.
+ *
+ * The higher the level, the more methods we try to reclaim space.
+ */
enum btrfs_reserve_flush_enum {
/* If we are in the transaction, we can't flush anything.*/
BTRFS_RESERVE_NO_FLUSH,
+
/*
- * Flushing delalloc may cause deadlock somewhere, in this
- * case, use FLUSH LIMIT
+ * Flush space by:
+ * - Running delayed inode items
+ * - Allocating a new chunk
*/
BTRFS_RESERVE_FLUSH_LIMIT,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Running delayed refs
+ * - Running delalloc and waiting for ordered extents
+ * - Allocating a new chunk
+ */
BTRFS_RESERVE_FLUSH_EVICT,
+
+ /*
+ * Flush space by above mentioned methods and by:
+ * - Running delayed iputs
+ * - Commiting transaction
+ *
+ * Can be interruped by fatal signal.
+ */
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
BTRFS_RESERVE_FLUSH_ALL,
+
+ /*
+ * Pretty much the same as FLUSH_ALL, but can also steal space from
+ * global rsv.
+ *
+ * Can be interruped by fatal signal.
+ */
BTRFS_RESERVE_FLUSH_ALL_STEAL,
};
@@ -2492,7 +2669,7 @@
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
@@ -2513,8 +2690,8 @@
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot);
-int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
+ int *slot);
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
@@ -2524,8 +2701,6 @@
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
@@ -2538,7 +2713,8 @@
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret);
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2574,8 +2750,6 @@
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
-void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr);
@@ -2588,7 +2762,7 @@
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+ int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2623,9 +2797,8 @@
return btrfs_next_old_item(root, p, 0);
}
int btrfs_leaf_free_space(struct extent_buffer *leaf);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- int update_ref, int for_reloc);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+ int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2653,23 +2826,6 @@
return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info);
}
-static inline void free_fs_info(struct btrfs_fs_info *fs_info)
-{
- kfree(fs_info->balance_ctl);
- kfree(fs_info->delayed_root);
- kfree(fs_info->extent_root);
- kfree(fs_info->tree_root);
- kfree(fs_info->chunk_root);
- kfree(fs_info->dev_root);
- kfree(fs_info->csum_root);
- kfree(fs_info->quota_root);
- kfree(fs_info->uuid_root);
- kfree(fs_info->free_space_root);
- kfree(fs_info->super_copy);
- kfree(fs_info->super_for_commit);
- kvfree(fs_info);
-}
-
/* tree mod log functions from ctree.c */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
@@ -2708,9 +2864,7 @@
u64 subid);
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
-int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
- int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
- u64));
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
/* dir-item.c */
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -2794,9 +2948,7 @@
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u8 *dst);
-blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
- u64 logical_offset);
+ u64 offset, u8 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2810,8 +2962,8 @@
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
- u64 file_start, int contig);
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
+ u64 file_start, int contig);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
@@ -2819,8 +2971,18 @@
struct btrfs_file_extent_item *fi,
const bool new_inline,
struct extent_map *em);
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
+u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -2847,8 +3009,9 @@
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ bool in_reclaim_context);
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2876,14 +3039,12 @@
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new,
- struct btrfs_path *path);
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *was_new);
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
+ struct btrfs_root *root, struct btrfs_path *path);
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
- u64 start, u64 end, int create);
+ u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
@@ -2903,20 +3064,21 @@
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int btrfs_is_empty_uuid(u8 *uuid);
+int __pure btrfs_is_empty_uuid(u8 *uuid);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
@@ -2924,6 +3086,9 @@
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -2937,7 +3102,7 @@
int skip_pinned);
extern const struct file_operations btrfs_file_operations;
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
+ struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path, u64 start, u64 end,
u64 *drop_end, int drop_cache,
int replace_extent,
@@ -2946,22 +3111,20 @@
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode, u64 start,
u64 end, int drop_cache);
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags);
-int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes);
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes);
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3158,7 +3321,7 @@
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
-const char *btrfs_decode_error(int errno);
+const char * __attribute_const__ btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
@@ -3358,7 +3521,7 @@
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_root *root);
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
@@ -3366,6 +3529,10 @@
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
+struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
/* scrub.c */
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3429,9 +3596,7 @@
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index f4f531c..bacee09 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -9,136 +9,127 @@
#include "qgroup.h"
#include "block-group.h"
+/*
+ * HOW DOES THIS WORK
+ *
+ * There are two stages to data reservations, one for data and one for metadata
+ * to handle the new extents and checksums generated by writing data.
+ *
+ *
+ * DATA RESERVATION
+ * The general flow of the data reservation is as follows
+ *
+ * -> Reserve
+ * We call into btrfs_reserve_data_bytes() for the user request bytes that
+ * they wish to write. We make this reservation and add it to
+ * space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree
+ * for the range and carry on if this is buffered, or follow up trying to
+ * make a real allocation if we are pre-allocating or doing O_DIRECT.
+ *
+ * -> Use
+ * At writepages()/prealloc/O_DIRECT time we will call into
+ * btrfs_reserve_extent() for some part or all of this range of bytes. We
+ * will make the allocation and subtract space_info->bytes_may_use by the
+ * original requested length and increase the space_info->bytes_reserved by
+ * the allocated length. This distinction is important because compression
+ * may allocate a smaller on disk extent than we previously reserved.
+ *
+ * -> Allocation
+ * finish_ordered_io() will insert the new file extent item for this range,
+ * and then add a delayed ref update for the extent tree. Once that delayed
+ * ref is written the extent size is subtracted from
+ * space_info->bytes_reserved and added to space_info->bytes_used.
+ *
+ * Error handling
+ *
+ * -> By the reservation maker
+ * This is the simplest case, we haven't completed our operation and we know
+ * how much we reserved, we can simply call
+ * btrfs_free_reserved_data_space*() and it will be removed from
+ * space_info->bytes_may_use.
+ *
+ * -> After the reservation has been made, but before cow_file_range()
+ * This is specifically for the delalloc case. You must clear
+ * EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
+ * be subtracted from space_info->bytes_may_use.
+ *
+ * METADATA RESERVATION
+ * The general metadata reservation lifetimes are discussed elsewhere, this
+ * will just focus on how it is used for delalloc space.
+ *
+ * We keep track of two things on a per inode bases
+ *
+ * ->outstanding_extents
+ * This is the number of file extent items we'll need to handle all of the
+ * outstanding DELALLOC space we have in this inode. We limit the maximum
+ * size of an extent, so a large contiguous dirty area may require more than
+ * one outstanding_extent, which is why count_max_extents() is used to
+ * determine how many outstanding_extents get added.
+ *
+ * ->csum_bytes
+ * This is essentially how many dirty bytes we have for this inode, so we
+ * can calculate the number of checksum items we would have to add in order
+ * to checksum our outstanding data.
+ *
+ * We keep a per-inode block_rsv in order to make it easier to keep track of
+ * our reservation. We use btrfs_calculate_inode_block_rsv_size() to
+ * calculate the current theoretical maximum reservation we would need for the
+ * metadata for this inode. We call this and then adjust our reservation as
+ * necessary, either by attempting to reserve more space, or freeing up excess
+ * space.
+ *
+ * OUTSTANDING_EXTENTS HANDLING
+ *
+ * ->outstanding_extents is used for keeping track of how many extents we will
+ * need to use for this inode, and it will fluctuate depending on where you are
+ * in the life cycle of the dirty data. Consider the following normal case for
+ * a completely clean inode, with a num_bytes < our maximum allowed extent size
+ *
+ * -> reserve
+ * ->outstanding_extents += 1 (current value is 1)
+ *
+ * -> set_delalloc
+ * ->outstanding_extents += 1 (currrent value is 2)
+ *
+ * -> btrfs_delalloc_release_extents()
+ * ->outstanding_extents -= 1 (current value is 1)
+ *
+ * We must call this once we are done, as we hold our reservation for the
+ * duration of our operation, and then assume set_delalloc will update the
+ * counter appropriately.
+ *
+ * -> add ordered extent
+ * ->outstanding_extents += 1 (current value is 2)
+ *
+ * -> btrfs_clear_delalloc_extent
+ * ->outstanding_extents -= 1 (current value is 1)
+ *
+ * -> finish_ordered_io/btrfs_remove_ordered_extent
+ * ->outstanding_extents -= 1 (current value is 0)
+ *
+ * Each stage is responsible for their own accounting of the extent, thus
+ * making error handling and cleanup easier.
+ */
+
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- u64 used;
- int ret = 0;
- int need_commit = 2;
- int have_pinned_space;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
/* Make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, fs_info->sectorsize);
- if (btrfs_is_free_space_inode(inode)) {
- need_commit = 0;
- ASSERT(current->journal_info);
- }
+ if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
-again:
- /* Make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- used = btrfs_space_info_used(data_sinfo, true);
-
- if (used + bytes > data_sinfo->total_bytes) {
- struct btrfs_trans_handle *trans;
-
- /*
- * If we don't have enough free bytes in this space then we need
- * to alloc a new chunk.
- */
- if (!data_sinfo->full) {
- u64 alloc_target;
-
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
- spin_unlock(&data_sinfo->lock);
-
- alloc_target = btrfs_data_alloc_profile(fs_info);
- /*
- * It is ugly that we don't call nolock join
- * transaction for the free space inode case here.
- * But it is safe because we only do the data space
- * reservation for the free space cache in the
- * transaction context, the common join transaction
- * just increase the counter of the current transaction
- * handler, doesn't try to acquire the trans_lock of
- * the fs.
- */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_chunk_alloc(trans, alloc_target,
- CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans);
- if (ret < 0) {
- if (ret != -ENOSPC)
- return ret;
- else {
- have_pinned_space = 1;
- goto commit_trans;
- }
- }
-
- goto again;
- }
-
- /*
- * If we don't have enough pinned space to deal with this
- * allocation, and no removed chunk in current transaction,
- * don't bother committing the transaction.
- */
- have_pinned_space = __percpu_counter_compare(
- &data_sinfo->total_bytes_pinned,
- used + bytes - data_sinfo->total_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- spin_unlock(&data_sinfo->lock);
-
- /* Commit the current transaction and try again */
-commit_trans:
- if (need_commit) {
- need_commit--;
-
- if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, -1);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
- (u64)-1);
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- if (have_pinned_space >= 0 ||
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
- &trans->transaction->flags) ||
- need_commit > 0) {
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- /*
- * The cleaner kthread might still be doing iput
- * operations. Wait for it to finish so that
- * more space is released. We don't need to
- * explicitly run the delayed iputs here because
- * the commit_transaction would have woken up
- * the cleaner.
- */
- ret = btrfs_wait_on_delayed_iputs(fs_info);
- if (ret)
- return ret;
- goto again;
- } else {
- btrfs_end_transaction(trans);
- }
- }
-
- trace_btrfs_space_reservation(fs_info,
- "space_info:enospc",
- data_sinfo->flags, bytes, 1);
- return -ENOSPC;
- }
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
- spin_unlock(&data_sinfo->lock);
-
- return 0;
+ return btrfs_reserve_data_bytes(fs_info, bytes, flush);
}
-int btrfs_check_data_free_space(struct inode *inode,
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
/* align the range */
@@ -146,14 +137,14 @@
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
+ ret = btrfs_alloc_data_chunk_ondemand(inode, len);
if (ret < 0)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
- ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), reserved, start, len);
+ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0)
- btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
else
ret = 0;
return ret;
@@ -167,21 +158,15 @@
* which we can't sleep and is sure it won't affect qgroup reserved space.
* Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_space_info *data_sinfo;
- /* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, fs_info->sectorsize) -
- round_down(start, fs_info->sectorsize);
- start = round_down(start, fs_info->sectorsize);
+ ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
- spin_lock(&data_sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
- spin_unlock(&data_sinfo->lock);
+ btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
}
/*
@@ -191,17 +176,17 @@
* This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
-void btrfs_free_reserved_data_space(struct inode *inode,
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
/* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, root->fs_info->sectorsize) -
- round_down(start, root->fs_info->sectorsize);
- start = round_down(start, root->fs_info->sectorsize);
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
- btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
btrfs_qgroup_free_data(inode, reserved, start, len);
}
@@ -228,8 +213,8 @@
* are releasing 0 bytes, and then we'll just get the reservation over
* the size free'd.
*/
- released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
- &qgroup_to_release);
+ released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
+ &qgroup_to_release);
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
@@ -307,7 +292,6 @@
unsigned nr_extents;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
- bool delalloc_lock = true;
/*
* If we are a free space inode we need to not flush since we will be in
@@ -320,7 +304,6 @@
*/
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
- delalloc_lock = false;
} else {
if (current->journal_info)
flush = BTRFS_RESERVE_FLUSH_LIMIT;
@@ -329,9 +312,6 @@
schedule_timeout(1);
}
- if (delalloc_lock)
- mutex_lock(&inode->delalloc_mutex);
-
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
/*
@@ -348,10 +328,12 @@
&qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
if (ret)
- goto out_fail;
+ return ret;
ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
- if (ret)
- goto out_qgroup;
+ if (ret) {
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
+ return ret;
+ }
/*
* Now we need to update our outstanding extents and csum bytes _first_
@@ -375,15 +357,7 @@
block_rsv->qgroup_rsv_reserved += qgroup_reserve;
spin_unlock(&block_rsv->lock);
- if (delalloc_lock)
- mutex_unlock(&inode->delalloc_mutex);
return 0;
-out_qgroup:
- btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
-out_fail:
- if (delalloc_lock)
- mutex_unlock(&inode->delalloc_mutex);
- return ret;
}
/**
@@ -466,7 +440,7 @@
* Return 0 for success
* Return <0 for error(-ENOSPC or -EQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode,
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
int ret;
@@ -474,7 +448,7 @@
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+ ret = btrfs_delalloc_reserve_metadata(inode, len);
if (ret < 0)
btrfs_free_reserved_data_space(inode, *reserved, start, len);
return ret;
@@ -492,10 +466,10 @@
* list if there are no delalloc bytes left.
* Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode,
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free)
{
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
+ btrfs_delalloc_release_metadata(inode, len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 54466fb..28bf5c3 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -6,18 +6,18 @@
struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
-int btrfs_check_data_free_space(struct inode *inode,
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_free_reserved_data_space(struct inode *inode,
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode,
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
-int btrfs_delalloc_reserve_space(struct inode *inode,
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e968904..04422d9 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -13,6 +13,7 @@
#include "transaction.h"
#include "ctree.h"
#include "qgroup.h"
+#include "locking.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -595,8 +596,7 @@
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid, item->bytes_reserved,
0);
- btrfs_block_rsv_release(fs_info, rsv,
- item->bytes_reserved);
+ btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}
static int btrfs_delayed_inode_reserve_metadata(
@@ -677,8 +677,7 @@
rsv = &fs_info->delayed_block_rsv;
trace_btrfs_space_reservation(fs_info, "delayed_inode",
node->inode_id, node->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, rsv,
- node->bytes_reserved);
+ btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
if (qgroup_free)
btrfs_qgroup_free_meta_prealloc(node->root,
node->bytes_reserved);
@@ -770,8 +769,7 @@
}
/* insert the keys of the items */
- setup_items_for_insert(root, path, keys, data_size,
- total_data_size, total_size, nitems);
+ setup_items_for_insert(root, path, keys, data_size, nitems);
/* insert the dir index items */
slot = path->slots[0];
@@ -1778,6 +1776,7 @@
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
@@ -1797,6 +1796,8 @@
i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
+ btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+ round_up(i_size_read(inode), fs_info->sectorsize));
inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 74ae226..ca96ef0 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -70,7 +70,7 @@
refcount_t refs;
int ins_or_del;
u32 data_len;
- char data[0];
+ char data[];
};
static inline void btrfs_init_delayed_root(
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index dfdb7d4..30883b9 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -82,8 +82,7 @@
u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
u64 released = 0;
- released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes,
- NULL);
+ released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
0, released, 0);
@@ -649,12 +648,12 @@
*/
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *existing,
- struct btrfs_delayed_ref_head *update,
- int *old_ref_mod_ret)
+ struct btrfs_delayed_ref_head *update)
{
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 flags = btrfs_ref_head_to_space_flags(existing);
int old_ref_mod;
BUG_ON(existing->is_data != update->is_data);
@@ -702,8 +701,6 @@
* currently, for refs we just added we know we're a-ok.
*/
old_ref_mod = existing->total_ref_mod;
- if (old_ref_mod_ret)
- *old_ref_mod_ret = old_ref_mod;
existing->ref_mod += update->ref_mod;
existing->total_ref_mod += update->ref_mod;
@@ -725,6 +722,27 @@
trans->delayed_ref_updates += csum_leaves;
}
}
+
+ /*
+ * This handles the following conditions:
+ *
+ * 1. We had a ref mod of 0 or more and went negative, indicating that
+ * we may be freeing space, so add our space to the
+ * total_bytes_pinned counter.
+ * 2. We were negative and went to 0 or positive, so no longer can say
+ * that the space would be pinned, decrement our counter from the
+ * total_bytes_pinned counter.
+ * 3. We are now at 0 and have ->must_insert_reserved set, which means
+ * this was a new allocation and then we dropped it, and thus must
+ * add our space to the total_bytes_pinned counter.
+ */
+ if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
+ btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
+ else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
+ else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
+ btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
+
spin_unlock(&existing->lock);
}
@@ -799,8 +817,7 @@
add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
- int action, int *qrecord_inserted_ret,
- int *old_ref_mod, int *new_ref_mod)
+ int action, int *qrecord_inserted_ret)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -822,8 +839,7 @@
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- update_existing_head_ref(trans, existing, head_ref,
- old_ref_mod);
+ update_existing_head_ref(trans, existing, head_ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
@@ -831,14 +847,17 @@
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
- if (old_ref_mod)
- *old_ref_mod = 0;
+ u64 flags = btrfs_ref_head_to_space_flags(head_ref);
+
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
+ if (head_ref->ref_mod < 0)
+ btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
+ head_ref->num_bytes);
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
@@ -846,8 +865,6 @@
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
- if (new_ref_mod)
- *new_ref_mod = head_ref->total_ref_mod;
return head_ref;
}
@@ -910,8 +927,7 @@
*/
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op,
- int *old_ref_mod, int *new_ref_mod)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_tree_ref *ref;
@@ -978,8 +994,7 @@
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted,
- old_ref_mod, new_ref_mod);
+ action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
@@ -1007,8 +1022,7 @@
*/
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- u64 reserved, int *old_ref_mod,
- int *new_ref_mod)
+ u64 reserved)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_data_ref *ref;
@@ -1074,8 +1088,7 @@
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted,
- old_ref_mod, new_ref_mod);
+ action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
@@ -1118,7 +1131,7 @@
spin_lock(&delayed_refs->lock);
add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
- NULL, NULL, NULL);
+ NULL);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 1c977e6..3ba1404 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -326,6 +326,16 @@
}
}
+static inline u64 btrfs_ref_head_to_space_flags(
+ struct btrfs_delayed_ref_head *head_ref)
+{
+ if (head_ref->is_data)
+ return BTRFS_BLOCK_GROUP_DATA;
+ else if (head_ref->is_system)
+ return BTRFS_BLOCK_GROUP_SYSTEM;
+ return BTRFS_BLOCK_GROUP_METADATA;
+}
+
static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
{
if (refcount_dec_and_test(&head->refs))
@@ -334,12 +344,10 @@
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op,
- int *old_ref_mod, int *new_ref_mod);
+ struct btrfs_delayed_extent_op *extent_op);
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- u64 reserved, int *old_ref_mod,
- int *new_ref_mod);
+ u64 reserved);
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1cb7f5d..d297804 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -22,12 +22,48 @@
#include "dev-replace.h"
#include "sysfs.h"
+/*
+ * Device replace overview
+ *
+ * [Objective]
+ * To copy all extents (both new and on-disk) from source device to target
+ * device, while still keeping the filesystem read-write.
+ *
+ * [Method]
+ * There are two main methods involved:
+ *
+ * - Write duplication
+ *
+ * All new writes will be written to both target and source devices, so even
+ * if replace gets canceled, sources device still contans up-to-date data.
+ *
+ * Location: handle_ops_on_dev_replace() from __btrfs_map_block()
+ * Start: btrfs_dev_replace_start()
+ * End: btrfs_dev_replace_finishing()
+ * Content: Latest data/metadata
+ *
+ * - Copy existing extents
+ *
+ * This happens by re-using scrub facility, as scrub also iterates through
+ * existing extents from commit root.
+ *
+ * Location: scrub_write_block_to_dev_replace() from
+ * scrub_block_complete()
+ * Content: Data/meta from commit root.
+ *
+ * Due to the content difference, we need to avoid nocow write when dev-replace
+ * is happening. This is done by marking the block group read-only and waiting
+ * for NOCOW writes.
+ *
+ * After replace is done, the finishing part is done by swapping the target and
+ * source devices.
+ *
+ * Location: btrfs_dev_replace_update_device_in_mapping_tree() from
+ * btrfs_dev_replace_finishing()
+ */
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
@@ -206,7 +242,6 @@
{
struct btrfs_device *device;
struct block_device *bdev;
- struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -226,8 +261,7 @@
sync_blockdev(bdev);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -494,7 +528,7 @@
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
- ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+ ret = btrfs_sysfs_add_device(tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
@@ -522,11 +556,8 @@
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
- if (ret == -EINPROGRESS) {
+ if (ret == -EINPROGRESS)
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
- } else if (ret != -ECANCELED) {
- WARN_ON(ret);
- }
return ret;
@@ -615,6 +646,32 @@
return ret;
}
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -646,7 +703,7 @@
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -771,8 +828,11 @@
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* replace the sysfs entry */
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
- btrfs_rm_dev_replace_free_srcdev(src_device);
+ btrfs_sysfs_remove_device(src_device);
+ btrfs_sysfs_update_devid(tgt_device);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
+ btrfs_scratch_superblocks(fs_info, src_device->bdev,
+ src_device->name->str);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -781,35 +841,11 @@
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_rm_dev_replace_free_srcdev(src_device);
+
return 0;
}
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev)
-{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
- u64 start = 0;
- int i;
-
- write_lock(&em_tree->lock);
- do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
- break;
- map = em->map_lookup;
- for (i = 0; i < map->num_stripes; i++)
- if (srcdev == map->stripes[i].dev)
- map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
- } while (start);
- write_unlock(&em_tree->lock);
-}
-
/*
* Read progress of device replace status according to the state and last
* stored position. The value format is the same as for
@@ -1010,7 +1046,7 @@
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
@@ -1047,11 +1083,11 @@
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret && ret != -ECANCELED);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return 0;
}
-int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
if (!dev_replace->is_valid)
return 0;
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 78c5d8f..60b70da 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -17,6 +17,6 @@
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
-int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
#endif
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
new file mode 100644
index 0000000..9e1a061
--- /dev/null
+++ b/fs/btrfs/discard.c
@@ -0,0 +1,709 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/math64.h>
+#include <linux/sizes.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "block-group.h"
+#include "discard.h"
+#include "free-space-cache.h"
+
+/*
+ * This contains the logic to handle async discard.
+ *
+ * Async discard manages trimming of free space outside of transaction commit.
+ * Discarding is done by managing the block_groups on a LRU list based on free
+ * space recency. Two passes are used to first prioritize discarding extents
+ * and then allow for trimming in the bitmap the best opportunity to coalesce.
+ * The block_groups are maintained on multiple lists to allow for multiple
+ * passes with different discard filter requirements. A delayed work item is
+ * used to manage discarding with timeout determined by a max of the delay
+ * incurred by the iops rate limit, the byte rate limit, and the max delay of
+ * BTRFS_DISCARD_MAX_DELAY.
+ *
+ * Note, this only keeps track of block_groups that are explicitly for data.
+ * Mixed block_groups are not supported.
+ *
+ * The first list is special to manage discarding of fully free block groups.
+ * This is necessary because we issue a final trim for a full free block group
+ * after forgetting it. When a block group becomes unused, instead of directly
+ * being added to the unused_bgs list, we add it to this first list. Then
+ * from there, if it becomes fully discarded, we place it onto the unused_bgs
+ * list.
+ *
+ * The in-memory free space cache serves as the backing state for discard.
+ * Consequently this means there is no persistence. We opt to load all the
+ * block groups in as not discarded, so the mount case degenerates to the
+ * crashing case.
+ *
+ * As the free space cache uses bitmaps, there exists a tradeoff between
+ * ease/efficiency for find_free_extent() and the accuracy of discard state.
+ * Here we opt to let untrimmed regions merge with everything while only letting
+ * trimmed regions merge with other trimmed regions. This can cause
+ * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
+ * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
+ * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
+ * this resets the state and we will retry trimming the whole bitmap. This is a
+ * tradeoff between discard state accuracy and the cost of accounting.
+ */
+
+/* This is an initial delay to give some chance for block reuse */
+#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
+#define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)
+
+/* Target completion latency of discarding all discardable extents */
+#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC)
+#define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
+#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
+#define BTRFS_DISCARD_MAX_IOPS (10U)
+
+/* Montonically decreasing minimum length filters after index 0 */
+static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
+ 0,
+ BTRFS_ASYNC_DISCARD_MAX_FILTER,
+ BTRFS_ASYNC_DISCARD_MIN_FILTER
+};
+
+static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ return &discard_ctl->discard_list[block_group->discard_index];
+}
+
+static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+
+ if (list_empty(&block_group->discard_list) ||
+ block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
+ if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
+ block_group->discard_index = BTRFS_DISCARD_INDEX_START;
+ block_group->discard_eligible_time = (ktime_get_ns() +
+ BTRFS_DISCARD_DELAY);
+ block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
+ }
+
+ list_move_tail(&block_group->discard_list,
+ get_discard_list(discard_ctl, block_group));
+}
+
+static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (!btrfs_is_block_group_data_only(block_group))
+ return;
+
+ spin_lock(&discard_ctl->lock);
+ __add_to_discard_list(discard_ctl, block_group);
+ spin_unlock(&discard_ctl->lock);
+}
+
+static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ spin_lock(&discard_ctl->lock);
+
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_unlock(&discard_ctl->lock);
+ return;
+ }
+
+ list_del_init(&block_group->discard_list);
+
+ block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+ block_group->discard_eligible_time = (ktime_get_ns() +
+ BTRFS_DISCARD_UNUSED_DELAY);
+ block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
+ list_add_tail(&block_group->discard_list,
+ &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
+
+ spin_unlock(&discard_ctl->lock);
+}
+
+static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ bool running = false;
+
+ spin_lock(&discard_ctl->lock);
+
+ if (block_group == discard_ctl->block_group) {
+ running = true;
+ discard_ctl->block_group = NULL;
+ }
+
+ block_group->discard_eligible_time = 0;
+ list_del_init(&block_group->discard_list);
+
+ spin_unlock(&discard_ctl->lock);
+
+ return running;
+}
+
+/**
+ * find_next_block_group - find block_group that's up next for discarding
+ * @discard_ctl: discard control
+ * @now: current time
+ *
+ * Iterate over the discard lists to find the next block_group up for
+ * discarding checking the discard_eligible_time of block_group.
+ */
+static struct btrfs_block_group *find_next_block_group(
+ struct btrfs_discard_ctl *discard_ctl,
+ u64 now)
+{
+ struct btrfs_block_group *ret_block_group = NULL, *block_group;
+ int i;
+
+ for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
+ struct list_head *discard_list = &discard_ctl->discard_list[i];
+
+ if (!list_empty(discard_list)) {
+ block_group = list_first_entry(discard_list,
+ struct btrfs_block_group,
+ discard_list);
+
+ if (!ret_block_group)
+ ret_block_group = block_group;
+
+ if (ret_block_group->discard_eligible_time < now)
+ break;
+
+ if (ret_block_group->discard_eligible_time >
+ block_group->discard_eligible_time)
+ ret_block_group = block_group;
+ }
+ }
+
+ return ret_block_group;
+}
+
+/**
+ * peek_discard_list - wrap find_next_block_group()
+ * @discard_ctl: discard control
+ * @discard_state: the discard_state of the block_group after state management
+ * @discard_index: the discard_index of the block_group after state management
+ *
+ * This wraps find_next_block_group() and sets the block_group to be in use.
+ * discard_state's control flow is managed here. Variables related to
+ * discard_state are reset here as needed (eg discard_cursor). @discard_state
+ * and @discard_index are remembered as it may change while we're discarding,
+ * but we want the discard to execute in the context determined here.
+ */
+static struct btrfs_block_group *peek_discard_list(
+ struct btrfs_discard_ctl *discard_ctl,
+ enum btrfs_discard_state *discard_state,
+ int *discard_index, u64 now)
+{
+ struct btrfs_block_group *block_group;
+
+ spin_lock(&discard_ctl->lock);
+again:
+ block_group = find_next_block_group(discard_ctl, now);
+
+ if (block_group && now >= block_group->discard_eligible_time) {
+ if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
+ block_group->used != 0) {
+ if (btrfs_is_block_group_data_only(block_group))
+ __add_to_discard_list(discard_ctl, block_group);
+ else
+ list_del_init(&block_group->discard_list);
+ goto again;
+ }
+ if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
+ block_group->discard_cursor = block_group->start;
+ block_group->discard_state = BTRFS_DISCARD_EXTENTS;
+ }
+ discard_ctl->block_group = block_group;
+ }
+ if (block_group) {
+ *discard_state = block_group->discard_state;
+ *discard_index = block_group->discard_index;
+ }
+ spin_unlock(&discard_ctl->lock);
+
+ return block_group;
+}
+
+/**
+ * btrfs_discard_check_filter - updates a block groups filters
+ * @block_group: block group of interest
+ * @bytes: recently freed region size after coalescing
+ *
+ * Async discard maintains multiple lists with progressively smaller filters
+ * to prioritize discarding based on size. Should a free space that matches
+ * a larger filter be returned to the free_space_cache, prioritize that discard
+ * by moving @block_group to the proper filter.
+ */
+void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
+ u64 bytes)
+{
+ struct btrfs_discard_ctl *discard_ctl;
+
+ if (!block_group ||
+ !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
+ return;
+
+ discard_ctl = &block_group->fs_info->discard_ctl;
+
+ if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
+ bytes >= discard_minlen[block_group->discard_index - 1]) {
+ int i;
+
+ remove_from_discard_list(discard_ctl, block_group);
+
+ for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
+ i++) {
+ if (bytes >= discard_minlen[i]) {
+ block_group->discard_index = i;
+ add_to_discard_list(discard_ctl, block_group);
+ break;
+ }
+ }
+ }
+}
+
+/**
+ * btrfs_update_discard_index - moves a block group along the discard lists
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * Increment @block_group's discard_index. If it falls of the list, let it be.
+ * Otherwise add it back to the appropriate list.
+ */
+static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ block_group->discard_index++;
+ if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
+ block_group->discard_index = 1;
+ return;
+ }
+
+ add_to_discard_list(discard_ctl, block_group);
+}
+
+/**
+ * btrfs_discard_cancel_work - remove a block_group from the discard lists
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * This removes @block_group from the discard lists. If necessary, it waits on
+ * the current work and then reschedules the delayed work.
+ */
+void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (remove_from_discard_list(discard_ctl, block_group)) {
+ cancel_delayed_work_sync(&discard_ctl->work);
+ btrfs_discard_schedule_work(discard_ctl, true);
+ }
+}
+
+/**
+ * btrfs_discard_queue_work - handles queuing the block_groups
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * This maintains the LRU order of the discard lists.
+ */
+void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
+ return;
+
+ if (block_group->used == 0)
+ add_to_discard_unused_list(discard_ctl, block_group);
+ else
+ add_to_discard_list(discard_ctl, block_group);
+
+ if (!delayed_work_pending(&discard_ctl->work))
+ btrfs_discard_schedule_work(discard_ctl, false);
+}
+
+static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ u64 now, bool override)
+{
+ struct btrfs_block_group *block_group;
+
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+ if (!override && delayed_work_pending(&discard_ctl->work))
+ return;
+
+ block_group = find_next_block_group(discard_ctl, now);
+ if (block_group) {
+ unsigned long delay = discard_ctl->delay;
+ u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
+
+ /*
+ * A single delayed workqueue item is responsible for
+ * discarding, so we can manage the bytes rate limit by keeping
+ * track of the previous discard.
+ */
+ if (kbps_limit && discard_ctl->prev_discard) {
+ u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
+ u64 bps_delay = div64_u64(discard_ctl->prev_discard *
+ MSEC_PER_SEC, bps_limit);
+
+ delay = max(delay, msecs_to_jiffies(bps_delay));
+ }
+
+ /*
+ * This timeout is to hopefully prevent immediate discarding
+ * in a recently allocated block group.
+ */
+ if (now < block_group->discard_eligible_time) {
+ u64 bg_timeout = block_group->discard_eligible_time - now;
+
+ delay = max(delay, nsecs_to_jiffies(bg_timeout));
+ }
+
+ mod_delayed_work(discard_ctl->discard_workers,
+ &discard_ctl->work, delay);
+ }
+}
+
+/*
+ * btrfs_discard_schedule_work - responsible for scheduling the discard work
+ * @discard_ctl: discard control
+ * @override: override the current timer
+ *
+ * Discards are issued by a delayed workqueue item. @override is used to
+ * update the current delay as the baseline delay interval is reevaluated on
+ * transaction commit. This is also maxed with any other rate limit.
+ */
+void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ bool override)
+{
+ const u64 now = ktime_get_ns();
+
+ spin_lock(&discard_ctl->lock);
+ __btrfs_discard_schedule_work(discard_ctl, now, override);
+ spin_unlock(&discard_ctl->lock);
+}
+
+/**
+ * btrfs_finish_discard_pass - determine next step of a block_group
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * This determines the next step for a block group after it's finished going
+ * through a pass on a discard list. If it is unused and fully trimmed, we can
+ * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto
+ * the appropriate filter list or let it fall off.
+ */
+static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ remove_from_discard_list(discard_ctl, block_group);
+
+ if (block_group->used == 0) {
+ if (btrfs_is_free_space_trimmed(block_group))
+ btrfs_mark_bg_unused(block_group);
+ else
+ add_to_discard_unused_list(discard_ctl, block_group);
+ } else {
+ btrfs_update_discard_index(discard_ctl, block_group);
+ }
+}
+
+/**
+ * btrfs_discard_workfn - discard work function
+ * @work: work
+ *
+ * This finds the next block_group to start discarding and then discards a
+ * single region. It does this in a two-pass fashion: first extents and second
+ * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
+ */
+static void btrfs_discard_workfn(struct work_struct *work)
+{
+ struct btrfs_discard_ctl *discard_ctl;
+ struct btrfs_block_group *block_group;
+ enum btrfs_discard_state discard_state;
+ int discard_index = 0;
+ u64 trimmed = 0;
+ u64 minlen = 0;
+ u64 now = ktime_get_ns();
+
+ discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
+
+ block_group = peek_discard_list(discard_ctl, &discard_state,
+ &discard_index, now);
+ if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ return;
+ if (now < block_group->discard_eligible_time) {
+ btrfs_discard_schedule_work(discard_ctl, false);
+ return;
+ }
+
+ /* Perform discarding */
+ minlen = discard_minlen[discard_index];
+
+ if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ u64 maxlen = 0;
+
+ /*
+ * Use the previous levels minimum discard length as the max
+ * length filter. In the case something is added to make a
+ * region go beyond the max filter, the entire bitmap is set
+ * back to BTRFS_TRIM_STATE_UNTRIMMED.
+ */
+ if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
+ maxlen = discard_minlen[discard_index - 1];
+
+ btrfs_trim_block_group_bitmaps(block_group, &trimmed,
+ block_group->discard_cursor,
+ btrfs_block_group_end(block_group),
+ minlen, maxlen, true);
+ discard_ctl->discard_bitmap_bytes += trimmed;
+ } else {
+ btrfs_trim_block_group_extents(block_group, &trimmed,
+ block_group->discard_cursor,
+ btrfs_block_group_end(block_group),
+ minlen, true);
+ discard_ctl->discard_extent_bytes += trimmed;
+ }
+
+ discard_ctl->prev_discard = trimmed;
+
+ /* Determine next steps for a block_group */
+ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
+ if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ btrfs_finish_discard_pass(discard_ctl, block_group);
+ } else {
+ block_group->discard_cursor = block_group->start;
+ spin_lock(&discard_ctl->lock);
+ if (block_group->discard_state !=
+ BTRFS_DISCARD_RESET_CURSOR)
+ block_group->discard_state =
+ BTRFS_DISCARD_BITMAPS;
+ spin_unlock(&discard_ctl->lock);
+ }
+ }
+
+ spin_lock(&discard_ctl->lock);
+ discard_ctl->block_group = NULL;
+ __btrfs_discard_schedule_work(discard_ctl, now, false);
+ spin_unlock(&discard_ctl->lock);
+}
+
+/**
+ * btrfs_run_discard_work - determines if async discard should be running
+ * @discard_ctl: discard control
+ *
+ * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
+ */
+bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
+{
+ struct btrfs_fs_info *fs_info = container_of(discard_ctl,
+ struct btrfs_fs_info,
+ discard_ctl);
+
+ return (!(fs_info->sb->s_flags & SB_RDONLY) &&
+ test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
+}
+
+/**
+ * btrfs_discard_calc_delay - recalculate the base delay
+ * @discard_ctl: discard control
+ *
+ * Recalculate the base delay which is based off the total number of
+ * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
+ * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
+ */
+void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
+{
+ s32 discardable_extents;
+ s64 discardable_bytes;
+ u32 iops_limit;
+ unsigned long delay;
+ unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC;
+
+ discardable_extents = atomic_read(&discard_ctl->discardable_extents);
+ if (!discardable_extents)
+ return;
+
+ spin_lock(&discard_ctl->lock);
+
+ /*
+ * The following is to fix a potential -1 discrepenancy that we're not
+ * sure how to reproduce. But given that this is the only place that
+ * utilizes these numbers and this is only called by from
+ * btrfs_finish_extent_commit() which is synchronized, we can correct
+ * here.
+ */
+ if (discardable_extents < 0)
+ atomic_add(-discardable_extents,
+ &discard_ctl->discardable_extents);
+
+ discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
+ if (discardable_bytes < 0)
+ atomic64_add(-discardable_bytes,
+ &discard_ctl->discardable_bytes);
+
+ if (discardable_extents <= 0) {
+ spin_unlock(&discard_ctl->lock);
+ return;
+ }
+
+ iops_limit = READ_ONCE(discard_ctl->iops_limit);
+ if (iops_limit)
+ lower_limit = max_t(unsigned long, lower_limit,
+ MSEC_PER_SEC / iops_limit);
+
+ delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
+ delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC);
+ discard_ctl->delay = msecs_to_jiffies(delay);
+
+ spin_unlock(&discard_ctl->lock);
+}
+
+/**
+ * btrfs_discard_update_discardable - propagate discard counters
+ * @block_group: block_group of interest
+ * @ctl: free_space_ctl of @block_group
+ *
+ * This propagates deltas of counters up to the discard_ctl. It maintains a
+ * current counter and a previous counter passing the delta up to the global
+ * stat. Then the current counter value becomes the previous counter value.
+ */
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_discard_ctl *discard_ctl;
+ s32 extents_delta;
+ s64 bytes_delta;
+
+ if (!block_group ||
+ !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
+ !btrfs_is_block_group_data_only(block_group))
+ return;
+
+ discard_ctl = &block_group->fs_info->discard_ctl;
+
+ extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
+ ctl->discardable_extents[BTRFS_STAT_PREV];
+ if (extents_delta) {
+ atomic_add(extents_delta, &discard_ctl->discardable_extents);
+ ctl->discardable_extents[BTRFS_STAT_PREV] =
+ ctl->discardable_extents[BTRFS_STAT_CURR];
+ }
+
+ bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
+ ctl->discardable_bytes[BTRFS_STAT_PREV];
+ if (bytes_delta) {
+ atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
+ ctl->discardable_bytes[BTRFS_STAT_PREV] =
+ ctl->discardable_bytes[BTRFS_STAT_CURR];
+ }
+}
+
+/**
+ * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
+ * @fs_info: fs_info of interest
+ *
+ * The unused_bgs list needs to be punted to the discard lists because the
+ * order of operations is changed. In the normal sychronous discard path, the
+ * block groups are trimmed via a single large trim in transaction commit. This
+ * is ultimately what we are trying to avoid with asynchronous discard. Thus,
+ * it must be done before going down the unused_bgs path.
+ */
+void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *block_group, *next;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ /* We enabled async discard, so punt all to the queue */
+ list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
+ bg_list) {
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+/**
+ * btrfs_discard_purge_list - purge discard lists
+ * @discard_ctl: discard control
+ *
+ * If we are disabling async discard, we may have intercepted block groups that
+ * are completely free and ready for the unused_bgs path. As discarding will
+ * now happen in transaction commit or not at all, we can safely mark the
+ * corresponding block groups as unused and they will be sent on their merry
+ * way to the unused_bgs list.
+ */
+static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
+{
+ struct btrfs_block_group *block_group, *next;
+ int i;
+
+ spin_lock(&discard_ctl->lock);
+ for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
+ list_for_each_entry_safe(block_group, next,
+ &discard_ctl->discard_list[i],
+ discard_list) {
+ list_del_init(&block_group->discard_list);
+ spin_unlock(&discard_ctl->lock);
+ if (block_group->used == 0)
+ btrfs_mark_bg_unused(block_group);
+ spin_lock(&discard_ctl->lock);
+ }
+ }
+ spin_unlock(&discard_ctl->lock);
+}
+
+void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
+ btrfs_discard_cleanup(fs_info);
+ return;
+ }
+
+ btrfs_discard_punt_unused_bgs_list(fs_info);
+
+ set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
+}
+
+void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
+}
+
+void btrfs_discard_init(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ int i;
+
+ spin_lock_init(&discard_ctl->lock);
+ INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
+
+ for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
+ INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
+
+ discard_ctl->prev_discard = 0;
+ atomic_set(&discard_ctl->discardable_extents, 0);
+ atomic64_set(&discard_ctl->discardable_bytes, 0);
+ discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
+ discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC;
+ discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
+ discard_ctl->kbps_limit = 0;
+ discard_ctl->discard_extent_bytes = 0;
+ discard_ctl->discard_bitmap_bytes = 0;
+ atomic64_set(&discard_ctl->discard_bytes_saved, 0);
+}
+
+void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
+{
+ btrfs_discard_stop(fs_info);
+ cancel_delayed_work_sync(&fs_info->discard_ctl.work);
+ btrfs_discard_purge_list(&fs_info->discard_ctl);
+}
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
new file mode 100644
index 0000000..353228d
--- /dev/null
+++ b/fs/btrfs/discard.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DISCARD_H
+#define BTRFS_DISCARD_H
+
+#include <linux/sizes.h>
+
+struct btrfs_fs_info;
+struct btrfs_discard_ctl;
+struct btrfs_block_group;
+
+/* Discard size limits */
+#define BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE (SZ_64M)
+#define BTRFS_ASYNC_DISCARD_MAX_FILTER (SZ_1M)
+#define BTRFS_ASYNC_DISCARD_MIN_FILTER (SZ_32K)
+
+/* List operations */
+void btrfs_discard_check_filter(struct btrfs_block_group *block_group, u64 bytes);
+
+/* Work operations */
+void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group);
+void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group);
+void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ bool override);
+bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl);
+
+/* Update operations */
+void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl);
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl);
+
+/* Setup/cleanup operations */
+void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
+void btrfs_discard_resume(struct btrfs_fs_info *fs_info);
+void btrfs_discard_stop(struct btrfs_fs_info *fs_info);
+void btrfs_discard_init(struct btrfs_fs_info *fs_info);
+void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 946ae19..a5bcad0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -7,7 +7,6 @@
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
-#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
@@ -41,6 +40,8 @@
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
+#include "discard.h"
+#include "space-info.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -49,7 +50,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -97,6 +97,12 @@
kmem_cache_destroy(btrfs_end_io_wq_cache);
}
+static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->csum_shash)
+ crypto_free_shash(fs_info->csum_shash);
+}
+
/*
* async submit bios are used to offload expensive checksumming
* onto the worker threads. They checksum file and metadata bios
@@ -198,98 +204,28 @@
#endif
/*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
- */
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset, u64 start, u64 len,
- int create)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- int ret;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- em->bdev = fs_info->fs_devices->latest_bdev;
- read_unlock(&em_tree->lock);
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- em = alloc_extent_map();
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
- em->start = 0;
- em->len = (u64)-1;
- em->block_len = (u64)-1;
- em->block_start = 0;
- em->bdev = fs_info->fs_devices->latest_bdev;
-
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- if (ret == -EEXIST) {
- free_extent_map(em);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em)
- em = ERR_PTR(-EIO);
- } else if (ret) {
- free_extent_map(em);
- em = ERR_PTR(ret);
- }
- write_unlock(&em_tree->lock);
-
-out:
- return em;
-}
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
- *
- * Returns error if the extent buffer cannot be mapped.
*/
-static int csum_tree_block(struct extent_buffer *buf, u8 *result)
+static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
+ const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- unsigned long len;
- unsigned long cur_len;
- unsigned long offset = BTRFS_CSUM_SIZE;
char *kaddr;
- unsigned long map_start;
- unsigned long map_len;
- int err;
+ int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
+ kaddr = page_address(buf->pages[0]);
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+ PAGE_SIZE - BTRFS_CSUM_SIZE);
- len = buf->len - offset;
-
- while (len > 0) {
- /*
- * Note: we don't need to check for the err == 1 case here, as
- * with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
- * and 'min_len = 32' and the currently implemented mapping
- * algorithm we cannot cross a page boundary.
- */
- err = map_private_extent_buffer(buf, offset, 32,
- &kaddr, &map_start, &map_len);
- if (WARN_ON(err))
- return err;
- cur_len = min(len, map_len - (offset - map_start));
- crypto_shash_update(shash, kaddr + offset - map_start, cur_len);
- len -= cur_len;
- offset += cur_len;
+ for (i = 1; i < num_pages; i++) {
+ kaddr = page_address(buf->pages[i]);
+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
-
crypto_shash_final(shash, result);
-
- return 0;
}
/*
@@ -352,6 +288,9 @@
{
switch (csum_type) {
case BTRFS_CSUM_TYPE_CRC32:
+ case BTRFS_CSUM_TYPE_XXHASH:
+ case BTRFS_CSUM_TYPE_SHA256:
+ case BTRFS_CSUM_TYPE_BLAKE2:
return true;
default:
return false;
@@ -371,16 +310,14 @@
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
/*
* The super_block structure does not span the whole
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
* filled with zeros and is included in the checksum.
*/
- crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- crypto_shash_final(shash, result);
+ crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
return 1;
@@ -534,10 +471,10 @@
return -EUCLEAN;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
- btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
+ offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE) == 0);
- if (csum_tree_block(eb, result))
- return -EINVAL;
+ csum_tree_block(eb, result);
if (btrfs_header_level(eb))
ret = btrfs_check_node(eb);
@@ -545,9 +482,11 @@
ret = btrfs_check_leaf_full(eb);
if (ret < 0) {
+ btrfs_print_tree(eb, 0);
btrfs_err(fs_info,
"block=%llu write time tree block corruption detected",
eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return ret;
}
write_extent_buffer(eb, result, 0, csum_size);
@@ -558,44 +497,41 @@
static int check_tree_block_fsid(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
u8 fsid[BTRFS_FSID_SIZE];
- int ret = 1;
+ u8 *metadata_uuid;
- read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
- while (fs_devices) {
- u8 *metadata_uuid;
+ read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE);
+ /*
+ * Checking the incompat flag is only valid for the current fs. For
+ * seed devices it's forbidden to have their uuid changed so reading
+ * ->fsid in this case is fine
+ */
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
- /*
- * Checking the incompat flag is only valid for the current
- * fs. For seed devices it's forbidden to have their uuid
- * changed so reading ->fsid in this case is fine
- */
- if (fs_devices == fs_info->fs_devices &&
- btrfs_fs_incompat(fs_info, METADATA_UUID))
- metadata_uuid = fs_devices->metadata_uuid;
- else
- metadata_uuid = fs_devices->fsid;
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+ return 0;
- if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
- ret = 0;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- return ret;
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
+ return 0;
+
+ return 1;
}
-static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror)
{
u64 found_start;
int found_level;
struct extent_buffer *eb;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ struct btrfs_fs_info *fs_info;
+ u16 csum_size;
int ret = 0;
u8 result[BTRFS_CSUM_SIZE];
int reads_done;
@@ -604,11 +540,13 @@
goto out;
eb = (struct extent_buffer *)page->private;
+ fs_info = eb->fs_info;
+ csum_size = btrfs_super_csum_size(fs_info->super_copy);
/* the pending IO might have been the only thing that kept this buffer
* in memory. Make sure we have a ref for all this other checks
*/
- extent_buffer_get(eb);
+ atomic_inc(&eb->refs);
reads_done = atomic_dec_and_test(&eb->io_pages);
if (!reads_done)
@@ -644,9 +582,7 @@
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
eb, found_level);
- ret = csum_tree_block(eb, result);
- if (ret)
- goto err;
+ csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
u8 val[BTRFS_CSUM_SIZE] = { 0 };
@@ -719,9 +655,7 @@
else
wq = fs_info->endio_write_workers;
} else {
- if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
- wq = fs_info->endio_repair_workers;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
wq = fs_info->endio_raid56_workers;
else if (end_io_wq->metadata)
wq = fs_info->endio_meta_workers;
@@ -790,8 +724,13 @@
return;
}
- ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio,
- async->mirror_num, 1);
+ /*
+ * All of the bios that pass through here are from async helpers.
+ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
+ * This changes nothing when cgroups aren't in use.
+ */
+ async->bio->bi_opf |= REQ_CGROUP_PUNT;
+ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
if (ret) {
async->bio->bi_status = ret;
bio_endio(async->bio);
@@ -874,9 +813,8 @@
return 1;
}
-static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(fs_info, BTRFS_I(inode));
@@ -891,12 +829,12 @@
BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else if (!async) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else {
/*
* kthread helpers are used to submit writes so that
@@ -961,13 +899,6 @@
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_readpage(struct file *file, struct page *page)
-{
- struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btree_get_extent, 0);
-}
-
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
@@ -987,9 +918,7 @@
btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
"page private not zero on page %llu",
(unsigned long long)page_offset(page));
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
+ detach_page_private(page);
}
}
@@ -1009,7 +938,6 @@
}
static const struct address_space_operations btree_aops = {
- .readpage = btree_readpage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
@@ -1091,36 +1019,11 @@
}
}
-static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
-{
- struct btrfs_subvolume_writers *writers;
- int ret;
-
- writers = kmalloc(sizeof(*writers), GFP_NOFS);
- if (!writers)
- return ERR_PTR(-ENOMEM);
-
- ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
- if (ret < 0) {
- kfree(writers);
- return ERR_PTR(ret);
- }
-
- init_waitqueue_head(&writers->wait);
- return writers;
-}
-
-static void
-btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
-{
- percpu_counter_destroy(&writers->counter);
- kfree(writers);
-}
-
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+ root->fs_info = fs_info;
root->node = NULL;
root->commit_root = NULL;
root->state = 0;
@@ -1165,36 +1068,40 @@
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
refcount_set(&root->refs, 1);
- atomic_set(&root->will_be_snapshotted, 0);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
- if (!dummy)
+ if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
+ extent_io_tree_init(fs_info, &root->log_csum_range,
+ IO_TREE_LOG_CSUM_RANGE, NULL);
+ }
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- if (!dummy)
- root->defrag_trans_start = fs_info->generation;
- else
- root->defrag_trans_start = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
spin_lock_init(&root->root_item_lock);
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
+#ifdef CONFIG_BTRFS_DEBUG
+ INIT_LIST_HEAD(&root->leak_list);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ list_add_tail(&root->leak_list, &fs_info->allocated_roots);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+#endif
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
- gfp_t flags)
+ u64 objectid, gfp_t flags)
{
struct btrfs_root *root = kzalloc(sizeof(*root), flags);
if (root)
- root->fs_info = fs_info;
+ __setup_root(root, fs_info, objectid);
return root;
}
@@ -1207,12 +1114,11 @@
if (!fs_info)
return ERR_PTR(-EINVAL);
- root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
/* We don't use the stripesize in selftest, set it as sectorsize */
- __setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
root->alloc_bytenr = 0;
return root;
@@ -1229,24 +1135,23 @@
struct btrfs_key key;
unsigned int nofs_flag;
int ret = 0;
- uuid_le uuid = NULL_UUID_LE;
/*
* We're holding a transaction handle, so use a NOFS memory allocation
* context to avoid deadlock if reclaim happens.
*/
nofs_flag = memalloc_nofs_save();
- root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(root, fs_info, objectid);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -1269,8 +1174,9 @@
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
if (is_fstree(objectid))
- uuid_le_gen(&uuid);
- memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(root->root_item.uuid);
+ else
+ export_guid(root->root_item.uuid, &guid_null);
root->root_item.drop_level = 0;
key.objectid = objectid;
@@ -1285,12 +1191,9 @@
return root;
fail:
- if (leaf) {
+ if (leaf)
btrfs_tree_unlock(leaf);
- free_extent_buffer(root->commit_root);
- free_extent_buffer(leaf);
- }
- kfree(root);
+ btrfs_put_root(root);
return ERR_PTR(ret);
}
@@ -1301,29 +1204,28 @@
struct btrfs_root *root;
struct extent_buffer *leaf;
- root = btrfs_alloc_root(fs_info, GFP_NOFS);
+ root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
/*
- * DON'T set REF_COWS for log trees
+ * DON'T set SHAREABLE bit for log trees.
*
- * log trees do not get reference counted because they go away
- * before a real commit is actually done. They do store pointers
- * to file data extents, and those reference counts still get
- * updated (along with back refs to the log tree).
+ * Log trees are not exposed to user space thus can't be snapshotted,
+ * and they go away before a real commit is actually done.
+ *
+ * They do store pointers to file data extents, and those reference
+ * counts still get updated (along with back refs to the log tree).
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0);
+ NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
- kfree(root);
+ btrfs_put_root(root);
return ERR_CAST(leaf);
}
@@ -1379,34 +1281,26 @@
return 0;
}
-static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key)
+static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
- struct btrfs_path *path;
u64 generation;
int ret;
int level;
- path = btrfs_alloc_path();
- if (!path)
+ root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
+ if (!root)
return ERR_PTR(-ENOMEM);
- root = btrfs_alloc_root(fs_info, GFP_NOFS);
- if (!root) {
- ret = -ENOMEM;
- goto alloc_fail;
- }
-
- __setup_root(root, fs_info, key->objectid);
-
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto find_fail;
+ goto fail;
}
generation = btrfs_root_generation(&root->root_item);
@@ -1416,45 +1310,43 @@
generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
- goto find_fail;
+ root->node = NULL;
+ goto fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- free_extent_buffer(root->node);
- goto find_fail;
+ goto fail;
}
root->commit_root = btrfs_root_node(root);
-out:
- btrfs_free_path(path);
return root;
-
-find_fail:
- kfree(root);
-alloc_fail:
- root = ERR_PTR(ret);
- goto out;
+fail:
+ btrfs_put_root(root);
+ return ERR_PTR(ret);
}
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
- struct btrfs_key *location)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
+ struct btrfs_path *path;
- root = btrfs_read_tree_root(tree_root, location);
- if (IS_ERR(root))
- return root;
-
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- set_bit(BTRFS_ROOT_REF_COWS, &root->state);
- btrfs_check_and_init_root_item(&root->root_item);
- }
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ root = read_tree_root_path(tree_root, path, key);
+ btrfs_free_path(path);
return root;
}
-int btrfs_init_fs_root(struct btrfs_root *root)
+/*
+ * Initialize subvolume root in-memory structure
+ *
+ * @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ */
+static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
int ret;
- struct btrfs_subvolume_writers *writers;
+ unsigned int nofs_flag;
root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1464,12 +1356,21 @@
goto fail;
}
- writers = btrfs_alloc_subvolume_writers();
- if (IS_ERR(writers)) {
- ret = PTR_ERR(writers);
+ /*
+ * We might be called under a transaction (e.g. indirect backref
+ * resolution) which could deadlock if it triggers memory reclaim
+ */
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_drew_lock_init(&root->snapshot_lock);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret)
goto fail;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
+ btrfs_check_and_init_root_item(&root->root_item);
}
- root->subv_writers = writers;
btrfs_init_free_ino_ctl(root);
spin_lock_init(&root->ino_cache_lock);
@@ -1481,9 +1382,13 @@
*/
if (is_fstree(root->root_key.objectid) &&
btrfs_root_refs(&root->root_item) > 0) {
- ret = get_anon_bdev(&root->anon_dev);
- if (ret)
- goto fail;
+ if (!anon_dev) {
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ } else {
+ root->anon_dev = anon_dev;
+ }
}
mutex_lock(&root->objectid_mutex);
@@ -1504,18 +1409,45 @@
return ret;
}
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_id)
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
{
struct btrfs_root *root;
spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)root_id);
+ if (root)
+ root = btrfs_grab_root(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
return root;
}
+static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ if (objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->tree_root);
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->extent_root);
+ if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->chunk_root);
+ if (objectid == BTRFS_DEV_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->dev_root);
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->csum_root);
+ if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->quota_root) ?
+ fs_info->quota_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_UUID_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->uuid_root) ?
+ fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->free_space_root) ?
+ fs_info->free_space_root : ERR_PTR(-ENOENT);
+ return NULL;
+}
+
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
@@ -1529,51 +1461,111 @@
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
- if (ret == 0)
+ if (ret == 0) {
+ btrfs_grab_root(root);
set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+ }
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
return ret;
}
-struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location,
- bool check_ref)
+void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+ struct btrfs_root *root;
+
+ while (!list_empty(&fs_info->allocated_roots)) {
+ char buf[BTRFS_ROOT_NAME_BUF_LEN];
+
+ root = list_first_entry(&fs_info->allocated_roots,
+ struct btrfs_root, leak_list);
+ btrfs_err(fs_info, "leaked root %s refcount %d",
+ btrfs_root_name(&root->root_key, buf),
+ refcount_read(&root->refs));
+ while (refcount_read(&root->refs) > 1)
+ btrfs_put_root(root);
+ btrfs_put_root(root);
+ }
+#endif
+}
+
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->dio_bytes);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
+ btrfs_free_csum_hash(fs_info);
+ btrfs_free_stripe_hash_table(fs_info);
+ btrfs_free_ref_cache(fs_info);
+ kfree(fs_info->balance_ctl);
+ kfree(fs_info->delayed_root);
+ btrfs_put_root(fs_info->extent_root);
+ btrfs_put_root(fs_info->tree_root);
+ btrfs_put_root(fs_info->chunk_root);
+ btrfs_put_root(fs_info->dev_root);
+ btrfs_put_root(fs_info->csum_root);
+ btrfs_put_root(fs_info->quota_root);
+ btrfs_put_root(fs_info->uuid_root);
+ btrfs_put_root(fs_info->free_space_root);
+ btrfs_put_root(fs_info->fs_root);
+ btrfs_put_root(fs_info->data_reloc_root);
+ btrfs_check_leaked_roots(fs_info);
+ btrfs_extent_buffer_leak_debug_check(fs_info);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kvfree(fs_info);
+}
+
+
+/*
+ * Get an in-memory reference of a root structure.
+ *
+ * For essential trees like root/extent tree, we grab it from fs_info directly.
+ * For subvolume trees, we check the cached filesystem roots first. If not
+ * found, then read it from disk and add it to cached fs roots.
+ *
+ * Caller should release the root by calling btrfs_put_root() after the usage.
+ *
+ * NOTE: Reloc and log trees can't be read by this function as they share the
+ * same root objectid.
+ *
+ * @objectid: root id
+ * @anon_dev: preallocated anonymous block device number for new roots,
+ * pass 0 for new allocation.
+ * @check_ref: whether to check root item references, If true, return -ENOENT
+ * for orphan roots
+ */
+static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev,
+ bool check_ref)
{
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
int ret;
- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
- return fs_info->tree_root;
- if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return fs_info->extent_root;
- if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
- return fs_info->chunk_root;
- if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
- return fs_info->dev_root;
- if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
- return fs_info->csum_root;
- if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
- return fs_info->quota_root ? fs_info->quota_root :
- ERR_PTR(-ENOENT);
- if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
- return fs_info->uuid_root ? fs_info->uuid_root :
- ERR_PTR(-ENOENT);
- if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return fs_info->free_space_root ? fs_info->free_space_root :
- ERR_PTR(-ENOENT);
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
again:
- root = btrfs_lookup_fs_root(fs_info, location->objectid);
+ root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
- if (check_ref && btrfs_root_refs(&root->root_item) == 0)
+ /* Shouldn't get preallocated anon_dev for cached roots */
+ ASSERT(!anon_dev);
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+ btrfs_put_root(root);
return ERR_PTR(-ENOENT);
+ }
return root;
}
- root = btrfs_read_fs_root(fs_info->tree_root, location);
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(root))
return root;
@@ -1582,7 +1574,7 @@
goto fail;
}
- ret = btrfs_init_fs_root(root);
+ ret = btrfs_init_fs_root(root, anon_dev);
if (ret)
goto fail;
@@ -1593,7 +1585,7 @@
}
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
- key.offset = location->objectid;
+ key.offset = objectid;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
btrfs_free_path(path);
@@ -1604,37 +1596,96 @@
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
- if (ret == -EEXIST) {
- btrfs_free_fs_root(root);
+ btrfs_put_root(root);
+ if (ret == -EEXIST)
goto again;
- }
goto fail;
}
return root;
fail:
- btrfs_free_fs_root(root);
+ /*
+ * If our caller provided us an anonymous device, then it's his
+ * responsability to free it in case we fail. So we have to set our
+ * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
+ * and once again by our caller.
+ */
+ if (anon_dev)
+ root->anon_dev = 0;
+ btrfs_put_root(root);
return ERR_PTR(ret);
}
-static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+/*
+ * Get in-memory reference of a root structure
+ *
+ * @objectid: tree objectid
+ * @check_ref: if set, verify that the tree exists and the item has at least
+ * one reference
+ */
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, bool check_ref)
{
- struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
- int ret = 0;
- struct btrfs_device *device;
- struct backing_dev_info *bdi;
+ return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+}
- rcu_read_lock();
- list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
- if (!device->bdev)
- continue;
- bdi = device->bdev->bd_bdi;
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- rcu_read_unlock();
- return ret;
+/*
+ * Get in-memory reference of a root structure, created as new, optionally pass
+ * the anonymous block device id
+ *
+ * @objectid: tree objectid
+ * @anon_dev: if zero, allocate a new anonymous block device or use the
+ * parameter value
+ */
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev)
+{
+ return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
+}
+
+/*
+ * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * @fs_info: the fs_info
+ * @objectid: the objectid we need to lookup
+ *
+ * This is exclusively used for backref walking, and exists specifically because
+ * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
+ * creation time, which means we may have to read the tree_root in order to look
+ * up a fs root that is not in memory. If the root is not in memory we will
+ * read the tree root commit root and look up the fs root from there. This is a
+ * temporary root, it will not be inserted into the radix tree as it doesn't
+ * have the most uptodate information, it'll simply be discarded once the
+ * backref code is finished using the root.
+ */
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+
+ ASSERT(path->search_commit_root && path->skip_locking);
+
+ /*
+ * This can return -ENOENT if we ask for a root that doesn't exist, but
+ * since this is called via the backref walking code we won't be looking
+ * up a root that doesn't exist, unless there's corruption. So if root
+ * != NULL just return it.
+ */
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ root = btrfs_lookup_fs_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = read_tree_root_path(fs_info->tree_root, path, &key);
+ btrfs_release_path(path);
+
+ return root;
}
/*
@@ -1786,18 +1837,18 @@
}
/*
- * this will find the highest generation in the array of
- * root backups. The index of the highest array is returned,
- * or -1 if we can't find anything.
+ * This will find the highest generation in the array of root backups. The
+ * index of the highest array is returned, or -EINVAL if we can't find
+ * anything.
*
* We check to make sure the array is valid by comparing the
* generation of the latest root in the array with the generation
* in the super block. If they don't match we pitch it.
*/
-static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+static int find_newest_super_backup(struct btrfs_fs_info *info)
{
+ const u64 newest_gen = btrfs_super_generation(info->super_copy);
u64 cur;
- int newest_index = -1;
struct btrfs_root_backup *root_backup;
int i;
@@ -1805,37 +1856,10 @@
root_backup = info->super_copy->super_roots + i;
cur = btrfs_backup_tree_root_gen(root_backup);
if (cur == newest_gen)
- newest_index = i;
+ return i;
}
- /* check to see if we actually wrapped around */
- if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
- root_backup = info->super_copy->super_roots;
- cur = btrfs_backup_tree_root_gen(root_backup);
- if (cur == newest_gen)
- newest_index = 0;
- }
- return newest_index;
-}
-
-
-/*
- * find the oldest backup so we know where to store new entries
- * in the backup array. This will set the backup_root_index
- * field in the fs_info struct
- */
-static void find_oldest_super_backup(struct btrfs_fs_info *info,
- u64 newest_gen)
-{
- int newest_index = -1;
-
- newest_index = find_newest_super_backup(info, newest_gen);
- /* if there was garbage in there, just move along */
- if (newest_index == -1) {
- info->backup_root_index = 0;
- } else {
- info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
- }
+ return -EINVAL;
}
/*
@@ -1845,22 +1869,8 @@
*/
static void backup_super_roots(struct btrfs_fs_info *info)
{
- int next_backup;
+ const int next_backup = info->backup_root_index;
struct btrfs_root_backup *root_backup;
- int last_backup;
-
- next_backup = info->backup_root_index;
- last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
- BTRFS_NUM_BACKUP_ROOTS;
-
- /*
- * just overwrite the last backup if we're at the same generation
- * this happens only at umount
- */
- root_backup = info->super_for_commit->super_roots + last_backup;
- if (btrfs_backup_tree_root_gen(root_backup) ==
- btrfs_header_generation(info->tree_root->node))
- next_backup = last_backup;
root_backup = info->super_for_commit->super_roots + next_backup;
@@ -1933,40 +1943,31 @@
}
/*
- * this copies info out of the root backup array and back into
- * the in-memory super block. It is meant to help iterate through
- * the array, so you send it the number of backups you've already
- * tried and the last backup index you used.
+ * read_backup_root - Reads a backup root based on the passed priority. Prio 0
+ * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
*
- * this returns -1 when it has tried all the backups
+ * fs_info - filesystem whose backup roots need to be read
+ * priority - priority of backup root required
+ *
+ * Returns backup root index on success and -EINVAL otherwise.
*/
-static noinline int next_root_backup(struct btrfs_fs_info *info,
- struct btrfs_super_block *super,
- int *num_backups_tried, int *backup_index)
+static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
+ int backup_index = find_newest_super_backup(fs_info);
+ struct btrfs_super_block *super = fs_info->super_copy;
struct btrfs_root_backup *root_backup;
- int newest = *backup_index;
- if (*num_backups_tried == 0) {
- u64 gen = btrfs_super_generation(super);
+ if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
+ if (priority == 0)
+ return backup_index;
- newest = find_newest_super_backup(info, gen);
- if (newest == -1)
- return -1;
-
- *backup_index = newest;
- *num_backups_tried = 1;
- } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
- /* we've tried all the backups, all done */
- return -1;
+ backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
+ backup_index %= BTRFS_NUM_BACKUP_ROOTS;
} else {
- /* jump to the next oldest backup */
- newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
- BTRFS_NUM_BACKUP_ROOTS;
- *backup_index = newest;
- *num_backups_tried += 1;
+ return -EINVAL;
}
- root_backup = super->super_roots + newest;
+
+ root_backup = super->super_roots + backup_index;
btrfs_set_super_generation(super,
btrfs_backup_tree_root_gen(root_backup));
@@ -1976,12 +1977,13 @@
btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
/*
- * fixme: the total bytes and num_devices need to match or we should
+ * Fixme: the total bytes and num_devices need to match or we should
* need a fsck
*/
btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
- return 0;
+
+ return backup_index;
}
/* helper to cleanup workers */
@@ -1992,16 +1994,16 @@
btrfs_destroy_workqueue(fs_info->workers);
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
- btrfs_destroy_workqueue(fs_info->endio_repair_workers);
btrfs_destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
- btrfs_destroy_workqueue(fs_info->submit_workers);
btrfs_destroy_workqueue(fs_info->delayed_workers);
btrfs_destroy_workqueue(fs_info->caching_workers);
btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
+ if (fs_info->discard_ctl.discard_workers)
+ destroy_workqueue(fs_info->discard_ctl.discard_workers);
/*
* Now that all other work queues are destroyed, we can safely destroy
* the queues used for metadata I/O, since tasks from those other work
@@ -2031,11 +2033,36 @@
free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
+ free_root_extent_buffers(info->fs_root);
+ free_root_extent_buffers(info->data_reloc_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
free_root_extent_buffers(info->free_space_root);
}
+void btrfs_put_root(struct btrfs_root *root)
+{
+ if (!root)
+ return;
+
+ if (refcount_dec_and_test(&root->refs)) {
+ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
+ if (root->anon_dev)
+ free_anon_bdev(root->anon_dev);
+ btrfs_drew_lock_destroy(&root->snapshot_lock);
+ free_root_extent_buffers(root);
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
+#ifdef CONFIG_BTRFS_DEBUG
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
+ list_del_init(&root->leak_list);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+#endif
+ kfree(root);
+ }
+}
+
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
@@ -2047,13 +2074,9 @@
struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
- if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
btrfs_drop_and_free_fs_root(fs_info, gang[0]);
- } else {
- free_extent_buffer(gang[0]->node);
- free_extent_buffer(gang[0]->commit_root);
- btrfs_put_fs_root(gang[0]);
- }
+ btrfs_put_root(gang[0]);
}
while (1) {
@@ -2065,11 +2088,6 @@
for (i = 0; i < ret; i++)
btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
-
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- btrfs_free_log_root_tree(NULL, fs_info);
- btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
- }
}
static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
@@ -2109,13 +2127,11 @@
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_INODE_IO, inode);
+ IO_TREE_BTREE_INODE_IO, inode);
BTRFS_I(inode)->io_tree.track_uptodate = false;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
- BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
-
- BTRFS_I(inode)->root = fs_info->tree_root;
+ BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
btrfs_insert_inode_hash(inode);
@@ -2161,16 +2177,6 @@
fs_info->caching_workers =
btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
- /*
- * a higher idle thresh on the submit workers makes it much more
- * likely that bios will be send down in a sane order to the
- * devices
- */
- fs_info->submit_workers =
- btrfs_alloc_workqueue(fs_info, "submit", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 64);
-
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
@@ -2189,8 +2195,6 @@
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
max_active, 4);
- fs_info->endio_repair_workers =
- btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
fs_info->rmw_workers =
btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
fs_info->endio_write_workers =
@@ -2207,17 +2211,19 @@
max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
+ fs_info->discard_ctl.discard_workers =
+ alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
- fs_info->endio_repair_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->qgroup_rescan_workers)) {
+ fs_info->qgroup_rescan_workers &&
+ fs_info->discard_ctl.discard_workers)) {
return -ENOMEM;
}
@@ -2227,13 +2233,13 @@
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
struct crypto_shash *csum_shash;
- const char *csum_name = btrfs_super_csum_name(csum_type);
+ const char *csum_driver = btrfs_super_csum_driver(csum_type);
- csum_shash = crypto_alloc_shash(csum_name, 0, 0);
+ csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
if (IS_ERR(csum_shash)) {
btrfs_err(fs_info, "error allocating %s hash for checksum",
- csum_name);
+ csum_driver);
return PTR_ERR(csum_shash);
}
@@ -2242,11 +2248,6 @@
return 0;
}
-static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
-{
- crypto_free_shash(fs_info->csum_shash);
-}
-
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
@@ -2261,24 +2262,23 @@
return -EIO;
}
- log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
+ GFP_KERNEL);
if (!log_tree_root)
return -ENOMEM;
- __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
log_tree_root->node = read_tree_block(fs_info, bytenr,
fs_info->generation + 1,
level, NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
- kfree(log_tree_root);
+ log_tree_root->node = NULL;
+ btrfs_put_root(log_tree_root);
return ret;
} else if (!extent_buffer_uptodate(log_tree_root->node)) {
btrfs_err(fs_info, "failed to read log tree");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
+ btrfs_put_root(log_tree_root);
return -EIO;
}
/* returns with log_tree_root freed on success */
@@ -2286,8 +2286,7 @@
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to recover log tree");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
+ btrfs_put_root(log_tree_root);
return ret;
}
@@ -2340,6 +2339,19 @@
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->csum_root = root;
+ /*
+ * This tree can share blocks with some other fs tree during relocation
+ * and we need a proper setup by btrfs_get_fs_root
+ */
+ root = btrfs_get_fs_root(tree_root->fs_info,
+ BTRFS_DATA_RELOC_TREE_OBJECTID, true);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->data_reloc_root = root;
+
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(root)) {
@@ -2601,69 +2613,103 @@
return ret;
}
-int open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options)
+static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
- u32 sectorsize;
- u32 nodesize;
- u32 stripesize;
- u64 generation;
- u64 features;
- u16 csum_type;
- struct btrfs_key location;
- struct buffer_head *bh;
- struct btrfs_super_block *disk_super;
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_root *tree_root;
- struct btrfs_root *chunk_root;
- int ret;
- int err = -EINVAL;
- int num_backups_tried = 0;
- int backup_index = 0;
- int clear_free_space_tree = 0;
- int level;
+ int backup_index = find_newest_super_backup(fs_info);
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ bool handle_error = false;
+ int ret = 0;
+ int i;
- tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
- chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
- if (!tree_root || !chunk_root) {
- err = -ENOMEM;
- goto fail;
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ u64 generation;
+ int level;
+
+ if (handle_error) {
+ if (!IS_ERR(tree_root->node))
+ free_extent_buffer(tree_root->node);
+ tree_root->node = NULL;
+
+ if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
+ break;
+
+ free_root_pointers(fs_info, 0);
+
+ /*
+ * Don't use the log in recovery mode, it won't be
+ * valid
+ */
+ btrfs_set_super_log_root(sb, 0);
+
+ /* We can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ ret = read_backup_root(fs_info, i);
+ backup_index = ret;
+ if (ret < 0)
+ return ret;
+ }
+ generation = btrfs_super_generation(sb);
+ level = btrfs_super_root_level(sb);
+ tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
+ generation, level, NULL);
+ if (IS_ERR(tree_root->node)) {
+ handle_error = true;
+ ret = PTR_ERR(tree_root->node);
+ tree_root->node = NULL;
+ btrfs_warn(fs_info, "couldn't read tree root");
+ continue;
+
+ } else if (!extent_buffer_uptodate(tree_root->node)) {
+ handle_error = true;
+ ret = -EIO;
+ btrfs_warn(fs_info, "error while reading tree root");
+ continue;
+ }
+
+ btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+ tree_root->commit_root = btrfs_root_node(tree_root);
+ btrfs_set_root_refs(&tree_root->root_item, 1);
+
+ /*
+ * No need to hold btrfs_root::objectid_mutex since the fs
+ * hasn't been fully initialised and we are the only user
+ */
+ ret = btrfs_find_highest_objectid(tree_root,
+ &tree_root->highest_objectid);
+ if (ret < 0) {
+ handle_error = true;
+ continue;
+ }
+
+ ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ ret = btrfs_read_roots(fs_info);
+ if (ret < 0) {
+ handle_error = true;
+ continue;
+ }
+
+ /* All successful */
+ fs_info->generation = generation;
+ fs_info->last_trans_committed = generation;
+
+ /* Always begin writing backup roots after the one being used */
+ if (backup_index < 0) {
+ fs_info->backup_root_index = 0;
+ } else {
+ fs_info->backup_root_index = backup_index + 1;
+ fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
+ }
+ break;
}
- ret = init_srcu_struct(&fs_info->subvol_srcu);
- if (ret) {
- err = ret;
- goto fail;
- }
+ return ret;
+}
- ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_srcu;
- }
-
- ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_dio_bytes;
- }
- fs_info->dirty_metadata_batch = PAGE_SIZE *
- (1 + ilog2(nr_cpu_ids));
-
- ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_dirty_metadata_bytes;
- }
-
- ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
- GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_delalloc_bytes;
- }
-
+void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
+{
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
@@ -2690,6 +2736,11 @@
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
+#ifdef CONFIG_BTRFS_DEBUG
+ INIT_LIST_HEAD(&fs_info->allocated_roots);
+ INIT_LIST_HEAD(&fs_info->allocated_ebs);
+ spin_lock_init(&fs_info->eb_leak_lock);
+#endif
extent_map_tree_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
@@ -2706,7 +2757,6 @@
atomic_set(&fs_info->reada_works_cnt, 0);
atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
- fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
@@ -2725,42 +2775,19 @@
INIT_LIST_HEAD(&fs_info->ordered_roots);
spin_lock_init(&fs_info->ordered_root_lock);
- fs_info->btree_inode = new_inode(sb);
- if (!fs_info->btree_inode) {
- err = -ENOMEM;
- goto fail_bio_counter;
- }
- mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-
- fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
- GFP_KERNEL);
- if (!fs_info->delayed_root) {
- err = -ENOMEM;
- goto fail_iput;
- }
- btrfs_init_delayed_root(fs_info->delayed_root);
-
btrfs_init_scrub(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
fs_info->check_integrity_print_mask = 0;
#endif
btrfs_init_balance(fs_info);
- btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
-
- sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
- sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
-
- btrfs_init_btree_inode(fs_info);
+ btrfs_init_async_reclaim_work(fs_info);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
fs_info->first_logical_byte = (u64)-1;
- extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
- IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
- extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
- IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
- fs_info->pinned_extents = &fs_info->freed_extents[0];
+ extent_io_tree_init(fs_info, &fs_info->excluded_extents,
+ IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
mutex_init(&fs_info->ordered_operations_mutex);
@@ -2776,6 +2803,7 @@
btrfs_init_dev_replace_locks(fs_info);
btrfs_init_qgroup(fs_info);
+ btrfs_discard_init(fs_info);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2795,23 +2823,134 @@
fs_info->swapfile_pins = RB_ROOT;
fs_info->send_in_progress = 0;
+}
- ret = btrfs_alloc_stripe_hash_table(fs_info);
- if (ret) {
- err = ret;
- goto fail_alloc;
+static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+ int ret;
+
+ fs_info->sb = sb;
+ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
+ sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
+
+ ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ fs_info->dirty_metadata_batch = PAGE_SIZE *
+ (1 + ilog2(nr_cpu_ids));
+
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+ GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+ GFP_KERNEL);
+ if (!fs_info->delayed_root)
+ return -ENOMEM;
+ btrfs_init_delayed_root(fs_info->delayed_root);
+
+ return btrfs_alloc_stripe_hash_table(fs_info);
+}
+
+static int btrfs_uuid_rescan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+ int ret;
+
+ /*
+ * 1st step is to iterate through the existing UUID tree and
+ * to delete all entries that contain outdated data.
+ * 2nd step is to add all missing entries to the UUID tree.
+ */
+ ret = btrfs_uuid_tree_iterate(fs_info);
+ if (ret < 0) {
+ if (ret != -EINTR)
+ btrfs_warn(fs_info, "iterating uuid_tree failed %d",
+ ret);
+ up(&fs_info->uuid_tree_rescan_sem);
+ return ret;
+ }
+ return btrfs_uuid_scan_kthread(data);
+}
+
+static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+ /* fs_info->update_uuid_tree_gen remains 0 in all error case */
+ btrfs_warn(fs_info, "failed to start uuid_rescan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
}
- __setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
+ return 0;
+}
+
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
+ char *options)
+{
+ u32 sectorsize;
+ u32 nodesize;
+ u32 stripesize;
+ u64 generation;
+ u64 features;
+ u16 csum_type;
+ struct btrfs_super_block *disk_super;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ int ret;
+ int err = -EINVAL;
+ int clear_free_space_tree = 0;
+ int level;
+
+ ret = init_mount_fs_info(fs_info, sb);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+
+ /* These need to be init'ed before we start creating inodes and such. */
+ tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->tree_root = tree_root;
+ chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->chunk_root = chunk_root;
+ if (!tree_root || !chunk_root) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+ btrfs_init_btree_inode(fs_info);
invalidate_bdev(fs_devices->latest_bdev);
/*
* Read super block and check the signature bytes only
*/
- bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (IS_ERR(bh)) {
- err = PTR_ERR(bh);
+ disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+ if (IS_ERR(disk_super)) {
+ err = PTR_ERR(disk_super);
goto fail_alloc;
}
@@ -2819,18 +2958,19 @@
* Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
- csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
+ csum_type = btrfs_super_csum_type(disk_super);
if (!btrfs_supported_super_csum(csum_type)) {
btrfs_err(fs_info, "unsupported checksum algorithm: %u",
csum_type);
err = -EINVAL;
- brelse(bh);
+ btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
ret = btrfs_init_csum_hash(fs_info, csum_type);
if (ret) {
err = ret;
+ btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
@@ -2838,11 +2978,11 @@
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
- if (btrfs_check_super_csum(fs_info, bh->b_data)) {
+ if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
- brelse(bh);
- goto fail_csum;
+ btrfs_release_disk_super(disk_super);
+ goto fail_alloc;
}
/*
@@ -2850,8 +2990,8 @@
* following bytes up to INFO_SIZE, the checksum is calculated from
* the whole block of INFO_SIZE
*/
- memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
- brelse(bh);
+ memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
+ btrfs_release_disk_super(disk_super);
disk_super = fs_info->super_copy;
@@ -2871,24 +3011,17 @@
if (ret) {
btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
- goto fail_csum;
+ goto fail_alloc;
}
if (!btrfs_super_root(disk_super))
- goto fail_csum;
+ goto fail_alloc;
/* check FS state, whether FS is broken. */
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/*
- * run through our array of backup supers and setup
- * our ring pointer to the oldest one
- */
- generation = btrfs_super_generation(disk_super);
- find_oldest_super_backup(fs_info, generation);
-
- /*
* In the long term, we'll store the compression type in the super
* block, and it'll be used for per file compression control.
*/
@@ -2920,7 +3053,7 @@
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
- goto fail_csum;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super) &
@@ -2930,7 +3063,7 @@
"cannot mount because of unsupported optional features (%llx)",
features);
err = -EINVAL;
- goto fail_csum;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super);
@@ -2952,7 +3085,7 @@
btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
nodesize, sectorsize);
- goto fail_csum;
+ goto fail_alloc;
}
/*
@@ -2968,7 +3101,7 @@
"cannot mount read-write because of unsupported optional features (%llx)",
features);
err = -EINVAL;
- goto fail_csum;
+ goto fail_alloc;
}
ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -2977,10 +3110,6 @@
goto fail_sb_buffer;
}
- sb->s_bdi->congested_fn = btrfs_congested_fn;
- sb->s_bdi->congested_data = fs_info;
- sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -2999,8 +3128,6 @@
generation = btrfs_super_chunk_root_generation(disk_super);
level = btrfs_super_chunk_root_level(disk_super);
- __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
-
chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
generation, level, NULL);
@@ -3016,7 +3143,8 @@
chunk_root->commit_root = btrfs_root_node(chunk_root);
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
- btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
+ offsetof(struct btrfs_header, chunk_tree_uuid),
+ BTRFS_UUID_SIZE);
ret = btrfs_read_chunk_tree(fs_info);
if (ret) {
@@ -3035,44 +3163,9 @@
goto fail_tree_roots;
}
-retry_root_backup:
- generation = btrfs_super_generation(disk_super);
- level = btrfs_super_root_level(disk_super);
-
- tree_root->node = read_tree_block(fs_info,
- btrfs_super_root(disk_super),
- generation, level, NULL);
- if (IS_ERR(tree_root->node) ||
- !extent_buffer_uptodate(tree_root->node)) {
- btrfs_warn(fs_info, "failed to read tree root");
- if (!IS_ERR(tree_root->node))
- free_extent_buffer(tree_root->node);
- tree_root->node = NULL;
- goto recovery_tree_root;
- }
-
- btrfs_set_root_node(&tree_root->root_item, tree_root->node);
- tree_root->commit_root = btrfs_root_node(tree_root);
- btrfs_set_root_refs(&tree_root->root_item, 1);
-
- mutex_lock(&tree_root->objectid_mutex);
- ret = btrfs_find_highest_objectid(tree_root,
- &tree_root->highest_objectid);
- if (ret) {
- mutex_unlock(&tree_root->objectid_mutex);
- goto recovery_tree_root;
- }
-
- ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
-
- mutex_unlock(&tree_root->objectid_mutex);
-
- ret = btrfs_read_roots(fs_info);
+ ret = init_tree_roots(fs_info);
if (ret)
- goto recovery_tree_root;
-
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
+ goto fail_tree_roots;
/*
* If we have a uuid root and we're not being told to rescan we need to
@@ -3113,20 +3206,13 @@
btrfs_free_extra_devids(fs_devices, 1);
- ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
+ ret = btrfs_sysfs_add_fsid(fs_devices);
if (ret) {
btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
ret);
goto fail_block_groups;
}
- ret = btrfs_sysfs_add_device(fs_devices);
- if (ret) {
- btrfs_err(fs_info, "failed to init sysfs device interface: %d",
- ret);
- goto fail_fsdev_sysfs;
- }
-
ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
@@ -3145,7 +3231,8 @@
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
@@ -3224,11 +3311,7 @@
}
}
- location.objectid = BTRFS_FS_TREE_OBJECTID;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = 0;
-
- fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
btrfs_warn(fs_info, "failed to read fs tree: %d", err);
@@ -3295,6 +3378,7 @@
}
btrfs_qgroup_rescan_resume(fs_info);
+ btrfs_discard_resume(fs_info);
if (!fs_info->uuid_root) {
btrfs_info(fs_info, "creating UUID tree");
@@ -3352,114 +3436,90 @@
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
+ if (fs_info->data_reloc_root)
+ btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
-fail_csum:
- btrfs_free_csum_hash(fs_info);
fail_alloc:
-fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
iput(fs_info->btree_inode);
-fail_bio_counter:
- percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
-fail_delalloc_bytes:
- percpu_counter_destroy(&fs_info->delalloc_bytes);
-fail_dirty_metadata_bytes:
- percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
-fail_dio_bytes:
- percpu_counter_destroy(&fs_info->dio_bytes);
-fail_srcu:
- cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
- btrfs_free_stripe_hash_table(fs_info);
btrfs_close_devices(fs_info->fs_devices);
return err;
-
-recovery_tree_root:
- if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
- goto fail_tree_roots;
-
- free_root_pointers(fs_info, false);
-
- /* don't use the log in recovery mode, it won't be valid */
- btrfs_set_super_log_root(disk_super, 0);
-
- /* we can't trust the free space cache either */
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
- ret = next_root_backup(fs_info, fs_info->super_copy,
- &num_backups_tried, &backup_index);
- if (ret == -1)
- goto fail_block_groups;
- goto retry_root_backup;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
-static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+static void btrfs_end_super_write(struct bio *bio)
{
- if (uptodate) {
- set_buffer_uptodate(bh);
- } else {
- struct btrfs_device *device = (struct btrfs_device *)
- bh->b_private;
+ struct btrfs_device *device = bio->bi_private;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ struct page *page;
- btrfs_warn_rl_in_rcu(device->fs_info,
- "lost page write due to IO error on %s",
- rcu_str_deref(device->name));
- /* note, we don't set_buffer_write_io_error because we have
- * our own ways of dealing with the IO errors
- */
- clear_buffer_uptodate(bh);
- btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ page = bvec->bv_page;
+
+ if (bio->bi_status) {
+ btrfs_warn_rl_in_rcu(device->fs_info,
+ "lost page write due to IO error on %s (%d)",
+ rcu_str_deref(device->name),
+ blk_status_to_errno(bio->bi_status));
+ ClearPageUptodate(page);
+ SetPageError(page);
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ } else {
+ SetPageUptodate(page);
+ }
+
+ put_page(page);
+ unlock_page(page);
}
- unlock_buffer(bh);
- put_bh(bh);
+
+ bio_put(bio);
}
-int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
- struct buffer_head **bh_ret)
+struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
+ int copy_num)
{
- struct buffer_head *bh;
struct btrfs_super_block *super;
+ struct page *page;
u64 bytenr;
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
bytenr = btrfs_sb_offset(copy_num);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
- /*
- * If we fail to read from the underlying devices, as of now
- * the best option we have is to mark it EIO.
- */
- if (!bh)
- return -EIO;
+ page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
- super = (struct btrfs_super_block *)bh->b_data;
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
- brelse(bh);
- return -EINVAL;
+ super = page_address(page);
+ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-ENODATA);
}
- *bh_ret = bh;
- return 0;
+ if (btrfs_super_bytenr(super) != bytenr) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return super;
}
-struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
{
- struct buffer_head *bh;
- struct buffer_head *latest = NULL;
- struct btrfs_super_block *super;
+ struct btrfs_super_block *super, *latest = NULL;
int i;
u64 transid = 0;
- int ret = -EINVAL;
/* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
@@ -3467,48 +3527,41 @@
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- ret = btrfs_read_dev_one_super(bdev, i, &bh);
- if (ret)
+ super = btrfs_read_dev_one_super(bdev, i);
+ if (IS_ERR(super))
continue;
- super = (struct btrfs_super_block *)bh->b_data;
-
if (!latest || btrfs_super_generation(super) > transid) {
- brelse(latest);
- latest = bh;
+ if (latest)
+ btrfs_release_disk_super(super);
+
+ latest = super;
transid = btrfs_super_generation(super);
- } else {
- brelse(bh);
}
}
- if (!latest)
- return ERR_PTR(ret);
-
- return latest;
+ return super;
}
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
- * buffer heads we write are pinned.
+ * pages we use for writing are locked.
*
* Write @max_mirrors copies of the superblock, where 0 means default that fit
* the expected device size at commit time. Note that max_mirrors must be
* same for write and wait phases.
*
- * Return number of errors when buffer head is not found or submission fails.
+ * Return number of errors when page is not found or submission fails.
*/
static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
struct btrfs_fs_info *fs_info = device->fs_info;
+ struct address_space *mapping = device->bdev->bd_inode->i_mapping;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- struct buffer_head *bh;
int i;
- int ret;
int errors = 0;
u64 bytenr;
- int op_flags;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
@@ -3516,6 +3569,10 @@
shash->tfm = fs_info->csum_shash;
for (i = 0; i < max_mirrors; i++) {
+ struct page *page;
+ struct bio *bio;
+ struct btrfs_super_block *disk_super;
+
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
@@ -3523,42 +3580,49 @@
btrfs_set_super_bytenr(sb, bytenr);
- crypto_shash_init(shash);
- crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- crypto_shash_final(shash, sb->csum);
+ crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
+ sb->csum);
- /* One reference for us, and we leave it for the caller */
- bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh) {
+ page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
+ GFP_NOFS);
+ if (!page) {
btrfs_err(device->fs_info,
- "couldn't get super buffer head for bytenr %llu",
+ "couldn't get super block page for bytenr %llu",
bytenr);
errors++;
continue;
}
- memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+ /* Bump the refcount for wait_dev_supers() */
+ get_page(page);
- /* one reference for submit_bh */
- get_bh(bh);
-
- set_buffer_uptodate(bh);
- lock_buffer(bh);
- bh->b_end_io = btrfs_end_buffer_write_sync;
- bh->b_private = device;
+ disk_super = page_address(page);
+ memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
/*
- * we fua the first super. The others we allow
- * to go down lazy.
+ * Directly use bios here instead of relying on the page cache
+ * to do I/O, so we don't lose the ability to do integrity
+ * checking.
*/
- op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio_set_dev(bio, device->bdev);
+ bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
+ bio->bi_private = device;
+ bio->bi_end_io = btrfs_end_super_write;
+ __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
+ offset_in_page(bytenr));
+
+ /*
+ * We FUA only the first super block. The others we allow to
+ * go down lazy and there's a short window where the on-disk
+ * copies might still contain the older version.
+ */
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
- op_flags |= REQ_FUA;
- ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
- if (ret)
- errors++;
+ bio->bi_opf |= REQ_FUA;
+
+ btrfsic_submit_bio(bio);
}
return errors < i ? 0 : -1;
}
@@ -3567,12 +3631,11 @@
* Wait for write completion of superblocks done by write_dev_supers,
* @max_mirrors same for write and wait phases.
*
- * Return number of errors when buffer head is not found or not marked up to
+ * Return number of errors when page is not found or not marked up to
* date.
*/
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
- struct buffer_head *bh;
int i;
int errors = 0;
bool primary_failed = false;
@@ -3582,32 +3645,34 @@
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
for (i = 0; i < max_mirrors; i++) {
+ struct page *page;
+
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
- bh = __find_get_block(device->bdev,
- bytenr / BTRFS_BDEV_BLOCKSIZE,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh) {
+ page = find_get_page(device->bdev->bd_inode->i_mapping,
+ bytenr >> PAGE_SHIFT);
+ if (!page) {
errors++;
if (i == 0)
primary_failed = true;
continue;
}
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
+ /* Page is submitted locked and unlocked once the IO completes */
+ wait_on_page_locked(page);
+ if (PageError(page)) {
errors++;
if (i == 0)
primary_failed = true;
}
- /* drop our reference */
- brelse(bh);
+ /* Drop our reference */
+ put_page(page);
- /* drop the reference from the writing run */
- brelse(bh);
+ /* Drop the reference from the writing run */
+ put_page(page);
}
/* log error, force error return */
@@ -3635,11 +3700,23 @@
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct request_queue *q = bdev_get_queue(device->bdev);
struct bio *bio = device->flush_bio;
+#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ /*
+ * When a disk has write caching disabled, we skip submission of a bio
+ * with flush and sync requests before writing the superblock, since
+ * it's not needed. However when the integrity checker is enabled, this
+ * results in reports that there are metadata blocks referred by a
+ * superblock that were not properly flushed. So don't skip the bio
+ * submission only when the integrity checker is enabled for the sake
+ * of simplicity, since this is a debug tool and not meant for use in
+ * non-debug builds.
+ */
+ struct request_queue *q = bdev_get_queue(device->bdev);
if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
return;
+#endif
bio_reset(bio);
bio->bi_end_io = btrfs_end_empty_barrier;
@@ -3879,20 +3956,19 @@
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
+ bool drop_ref = false;
+
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid);
+ if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+ drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
- if (btrfs_root_refs(&root->root_item) == 0)
- synchronize_srcu(&fs_info->subvol_srcu);
-
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- btrfs_free_log(NULL, root);
+ ASSERT(root->log_root == NULL);
if (root->reloc_root) {
- free_extent_buffer(root->reloc_root->node);
- free_extent_buffer(root->reloc_root->commit_root);
- btrfs_put_fs_root(root->reloc_root);
+ btrfs_put_root(root->reloc_root);
root->reloc_root = NULL;
}
}
@@ -3901,22 +3977,12 @@
__btrfs_remove_free_space_cache(root->free_ino_pinned);
if (root->free_ino_ctl)
__btrfs_remove_free_space_cache(root->free_ino_ctl);
- btrfs_free_fs_root(root);
-}
-
-void btrfs_free_fs_root(struct btrfs_root *root)
-{
- iput(root->ino_cache_inode);
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
- if (root->anon_dev)
- free_anon_bdev(root->anon_dev);
- if (root->subv_writers)
- btrfs_free_subvolume_writers(root->subv_writers);
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- kfree(root->free_ino_ctl);
- kfree(root->free_ino_pinned);
- btrfs_put_fs_root(root);
+ if (root->ino_cache_inode) {
+ iput(root->ino_cache_inode);
+ root->ino_cache_inode = NULL;
+ }
+ if (drop_ref)
+ btrfs_put_root(root);
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3926,15 +3992,14 @@
int i = 0;
int err = 0;
unsigned int ret = 0;
- int index;
while (1) {
- index = srcu_read_lock(&fs_info->subvol_srcu);
+ spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
if (!ret) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
root_objectid = gang[ret - 1]->root_key.objectid + 1;
@@ -3946,9 +4011,9 @@
continue;
}
/* grab all the search result for later use */
- gang[i] = btrfs_grab_fs_root(gang[i]);
+ gang[i] = btrfs_grab_root(gang[i]);
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
for (i = 0; i < ret; i++) {
if (!gang[i])
@@ -3957,7 +4022,7 @@
err = btrfs_orphan_cleanup(gang[i]);
if (err)
break;
- btrfs_put_fs_root(gang[i]);
+ btrfs_put_root(gang[i]);
}
root_objectid++;
}
@@ -3965,7 +4030,7 @@
/* release the uncleaned roots due to error */
for (; i < ret; i++) {
if (gang[i])
- btrfs_put_fs_root(gang[i]);
+ btrfs_put_root(gang[i]);
}
return err;
}
@@ -3990,7 +4055,7 @@
return btrfs_commit_transaction(trans);
}
-void close_ctree(struct btrfs_fs_info *fs_info)
+void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
int ret;
@@ -4026,6 +4091,10 @@
btrfs_cleanup_defrag_inodes(fs_info);
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
+
+ /* Cancel or finish ongoing discard work */
+ btrfs_discard_cleanup(fs_info);
if (!sb_rdonly(fs_info->sb)) {
/*
@@ -4062,6 +4131,11 @@
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
+ if (btrfs_check_quota_leak(fs_info)) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err(fs_info, "qgroup reserved space leaked");
+ }
+
btrfs_free_qgroup_config(fs_info);
ASSERT(list_empty(&fs_info->delalloc_roots));
@@ -4077,8 +4151,6 @@
btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
- btrfs_free_fs_roots(fs_info);
-
btrfs_put_block_group_cache(fs_info);
/*
@@ -4090,6 +4162,7 @@
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
+ btrfs_free_fs_roots(fs_info);
/*
* We must free the block groups after dropping the fs_roots as we could
@@ -4109,16 +4182,6 @@
btrfs_mapping_tree_free(&fs_info->mapping_tree);
btrfs_close_devices(fs_info->fs_devices);
-
- percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
- percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->dio_bytes);
- percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
- cleanup_srcu_struct(&fs_info->subvol_srcu);
-
- btrfs_free_csum_hash(fs_info);
- btrfs_free_stripe_hash_table(fs_info);
- btrfs_free_ref_cache(fs_info);
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -4232,6 +4295,36 @@
up_write(&fs_info->cleanup_work_sem);
}
+static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *gang[8];
+ u64 root_objectid = 0;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang))) != 0) {
+ int i;
+
+ for (i = 0; i < ret; i++)
+ gang[i] = btrfs_grab_root(gang[i]);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
+ root_objectid = gang[i]->root_key.objectid;
+ btrfs_free_log(NULL, gang[i]);
+ btrfs_put_root(gang[i]);
+ }
+ root_objectid++;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ btrfs_free_log_root_tree(NULL, fs_info);
+}
+
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
@@ -4292,7 +4385,7 @@
spin_lock(&delayed_refs->lock);
if (atomic_read(&delayed_refs->num_entries) == 0) {
spin_unlock(&delayed_refs->lock);
- btrfs_info(fs_info, "delayed_refs has NO entry");
+ btrfs_debug(fs_info, "delayed_refs has NO entry");
return ret;
}
@@ -4326,9 +4419,30 @@
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
- if (pin_bytes)
- btrfs_pin_extent(fs_info, head->bytenr,
- head->num_bytes, 1);
+ if (pin_bytes) {
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, head->bytenr);
+ BUG_ON(!cache);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += head->num_bytes;
+ btrfs_space_info_update_bytes_pinned(fs_info,
+ cache->space_info, head->num_bytes);
+ cache->reserved -= head->num_bytes;
+ cache->space_info->bytes_reserved -= head->num_bytes;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ percpu_counter_add_batch(
+ &cache->space_info->total_bytes_pinned,
+ head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
+
+ btrfs_put_block_group(cache);
+
+ btrfs_error_unpin_extent_range(fs_info, head->bytenr,
+ head->bytenr + head->num_bytes - 1);
+ }
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
@@ -4384,12 +4498,12 @@
while (!list_empty(&splice)) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
- root = btrfs_grab_fs_root(root);
+ root = btrfs_grab_root(root);
BUG_ON(!root);
spin_unlock(&fs_info->delalloc_root_lock);
btrfs_destroy_delalloc_inodes(root);
- btrfs_put_fs_root(root);
+ btrfs_put_root(root);
spin_lock(&fs_info->delalloc_root_lock);
}
@@ -4430,16 +4544,12 @@
}
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *pinned_extents)
+ struct extent_io_tree *unpin)
{
- struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
- bool loop = true;
- unpin = pinned_extents;
-again:
while (1) {
struct extent_state *cached_state = NULL;
@@ -4464,19 +4574,10 @@
cond_resched();
}
- if (loop) {
- if (unpin == &fs_info->freed_extents[0])
- unpin = &fs_info->freed_extents[1];
- else
- unpin = &fs_info->freed_extents[0];
- loop = false;
- goto again;
- }
-
return 0;
}
-static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
+static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
{
struct inode *inode;
@@ -4494,12 +4595,12 @@
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
dirty_list);
if (!list_empty(&cache->io_list)) {
@@ -4527,7 +4628,7 @@
*/
while (!list_empty(&cur_trans->io_bgs)) {
cache = list_first_entry(&cur_trans->io_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
io_list);
list_del_init(&cache->io_list);
@@ -4564,8 +4665,7 @@
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
- btrfs_destroy_pinned_extent(fs_info,
- fs_info->pinned_extents);
+ btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
@@ -4617,15 +4717,9 @@
btrfs_destroy_all_ordered_extents(fs_info);
btrfs_destroy_delayed_inodes(fs_info);
btrfs_assert_delayed_root_empty(fs_info);
- btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
btrfs_destroy_all_delalloc_inodes(fs_info);
+ btrfs_drop_all_logs(fs_info);
mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
}
-
-static const struct extent_io_ops btree_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btree_submit_bio_hook,
- .readpage_end_io_hook = btree_readpage_end_io_hook,
-};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a695810..182540b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,7 +25,6 @@
BTRFS_WQ_ENDIO_METADATA,
BTRFS_WQ_ENDIO_FREE_SPACE,
BTRFS_WQ_ENDIO_RAID56,
- BTRFS_WQ_ENDIO_DIO_REPAIR,
};
static inline u64 btrfs_sb_offset(int mirror)
@@ -39,6 +38,8 @@
struct btrfs_device;
struct btrfs_fs_devices;
+void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
+void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
@@ -49,41 +50,40 @@
struct btrfs_fs_info *fs_info,
u64 bytenr);
void btrfs_clean_tree_block(struct extent_buffer *buf);
-int open_ctree(struct super_block *sb,
+int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
-void close_ctree(struct btrfs_fs_info *fs_info);
+void __cold close_ctree(struct btrfs_fs_info *fs_info);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
-struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
-int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
- struct buffer_head **bh_ret);
+struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
+struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
+ int copy_num);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
- struct btrfs_key *location);
-int btrfs_init_fs_root(struct btrfs_root *root);
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_id);
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
- struct btrfs_key *key,
- bool check_ref);
-static inline struct btrfs_root *
-btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location)
-{
- return btrfs_get_fs_root(fs_info, location, true);
-}
+ u64 objectid, bool check_ref);
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev);
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid);
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-void btrfs_free_fs_root(struct btrfs_root *root);
-
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror);
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -95,19 +95,16 @@
* If you want to ensure the whole tree is safe, you should use
* fs_info->subvol_srcu
*/
-static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
{
+ if (!root)
+ return NULL;
if (refcount_inc_not_zero(&root->refs))
return root;
return NULL;
}
-static inline void btrfs_put_fs_root(struct btrfs_root *root)
-{
- if (refcount_dec_and_test(&root->refs))
- kfree(root);
-}
-
+void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
@@ -133,9 +130,6 @@
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset, u64 start, u64 len,
- int create);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 93cceeb..1a8d419 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -64,36 +64,18 @@
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
struct inode *inode;
- struct btrfs_key key;
- int index;
- int err = 0;
if (objectid < BTRFS_FIRST_FREE_OBJECTID)
return ERR_PTR(-ESTALE);
- key.objectid = root_objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
+ root = btrfs_get_fs_root(fs_info, root_objectid, true);
+ if (IS_ERR(root))
+ return ERR_CAST(root);
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto fail;
- }
-
- key.objectid = objectid;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
-
- inode = btrfs_iget(sb, &key, root, NULL);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto fail;
- }
-
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ inode = btrfs_iget(sb, objectid, root);
+ btrfs_put_root(root);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
if (check_generation && generation != inode->i_generation) {
iput(inode);
@@ -101,9 +83,6 @@
}
return d_obtain_alias(inode);
-fail:
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- return ERR_PTR(err);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -212,9 +191,7 @@
found_key.offset, 0, 0);
}
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root, NULL));
+ return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root));
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
new file mode 100644
index 0000000..9800a83
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_EXTENT_IO_TREE_H
+#define BTRFS_EXTENT_IO_TREE_H
+
+struct extent_changeset;
+struct io_failure_record;
+
+/* Bits for the extent state */
+#define EXTENT_DIRTY (1U << 0)
+#define EXTENT_UPTODATE (1U << 1)
+#define EXTENT_LOCKED (1U << 2)
+#define EXTENT_NEW (1U << 3)
+#define EXTENT_DELALLOC (1U << 4)
+#define EXTENT_DEFRAG (1U << 5)
+#define EXTENT_BOUNDARY (1U << 6)
+#define EXTENT_NODATASUM (1U << 7)
+#define EXTENT_CLEAR_META_RESV (1U << 8)
+#define EXTENT_NEED_WAIT (1U << 9)
+#define EXTENT_DAMAGED (1U << 10)
+#define EXTENT_NORESERVE (1U << 11)
+#define EXTENT_QGROUP_RESERVED (1U << 12)
+#define EXTENT_CLEAR_DATA_RESV (1U << 13)
+#define EXTENT_DELALLOC_NEW (1U << 14)
+#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
+ EXTENT_CLEAR_DATA_RESV)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
+
+/*
+ * Redefined bits above which are used only in the device allocation tree,
+ * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
+ * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
+ * manipulation functions
+ */
+#define CHUNK_ALLOCATED EXTENT_DIRTY
+#define CHUNK_TRIMMED EXTENT_DEFRAG
+#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \
+ CHUNK_TRIMMED)
+
+enum {
+ IO_TREE_FS_PINNED_EXTENTS,
+ IO_TREE_FS_EXCLUDED_EXTENTS,
+ IO_TREE_BTREE_INODE_IO,
+ IO_TREE_INODE_IO,
+ IO_TREE_INODE_IO_FAILURE,
+ IO_TREE_RELOC_BLOCKS,
+ IO_TREE_TRANS_DIRTY_PAGES,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES,
+ IO_TREE_INODE_FILE_EXTENT,
+ IO_TREE_LOG_CSUM_RANGE,
+ IO_TREE_SELFTEST,
+ IO_TREE_DEVICE_ALLOC_STATE,
+};
+
+struct extent_io_tree {
+ struct rb_root state;
+ struct btrfs_fs_info *fs_info;
+ void *private_data;
+ u64 dirty_bytes;
+ bool track_uptodate;
+
+ /* Who owns this io tree, should be one of IO_TREE_* */
+ u8 owner;
+
+ spinlock_t lock;
+};
+
+struct extent_state {
+ u64 start;
+ u64 end; /* inclusive */
+ struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
+ wait_queue_head_t wq;
+ refcount_t refs;
+ unsigned state;
+
+ struct io_failure_record *failrec;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
+};
+
+int __init extent_state_cache_init(void);
+void __cold extent_state_cache_exit(void);
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
+ void *private_data);
+void extent_io_tree_release(struct extent_io_tree *tree);
+
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
+
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return lock_extent_bits(tree, start, end, NULL);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+
+int __init extent_io_init(void);
+void __cold extent_io_exit(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, unsigned bits, int contig);
+
+void free_extent_state(struct extent_state *state);
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int filled,
+ struct extent_state *cached_state);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, struct extent_changeset *changeset);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached);
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
+
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
+}
+
+static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_NOFS, NULL);
+}
+
+static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
+ u64 start, u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_ATOMIC, NULL);
+}
+
+static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits)
+{
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
+}
+
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, struct extent_changeset *changeset);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask);
+int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits);
+
+static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
+}
+
+static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ cached_state, GFP_NOFS, NULL);
+}
+
+static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
+ NULL, mask);
+}
+
+static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, cached);
+}
+
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, unsigned clear_bits,
+ struct extent_state **cached_state);
+
+static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned int extra_bits,
+ struct extent_state **cached_state)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
+ NULL, cached_state, GFP_NOFS);
+}
+
+static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, GFP_NOFS);
+}
+
+static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
+ u64 end)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
+ GFP_NOFS);
+}
+
+static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+ cached_state, mask);
+}
+
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
+ struct extent_state **cached_state);
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits);
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits);
+int extent_invalidatepage(struct extent_io_tree *tree,
+ struct page *page, unsigned long offset);
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state);
+
+/* This should be reworked in the future and put elsewhere. */
+struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
+int set_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record *failrec);
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
+ u64 end);
+int free_io_failure(struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree,
+ struct io_failure_record *rec);
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree, u64 start,
+ struct page *page, u64 ino, unsigned int pg_offset);
+
+#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5273965..2842946 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -32,6 +32,7 @@
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "discard.h"
#include "rcu-string.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -55,7 +56,7 @@
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
-static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
return (cache->flags & bits) == bits;
}
@@ -64,60 +65,21 @@
u64 start, u64 num_bytes)
{
u64 end = start + num_bytes - 1;
- set_extent_bits(&fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE);
- set_extent_bits(&fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE);
+ set_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
return 0;
}
-void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache)
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 start, end;
- start = cache->key.objectid;
- end = start + cache->key.offset - 1;
+ start = cache->start;
+ end = start + cache->length - 1;
- clear_extent_bits(&fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE);
- clear_extent_bits(&fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE);
-}
-
-static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
-{
- if (ref->type == BTRFS_REF_METADATA) {
- if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
- return BTRFS_BLOCK_GROUP_SYSTEM;
- else
- return BTRFS_BLOCK_GROUP_METADATA;
- }
- return BTRFS_BLOCK_GROUP_DATA;
-}
-
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
-static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ clear_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
}
/* simple helper to search for an existing data extent at a given offset */
@@ -1180,7 +1142,22 @@
num_bytes, parent, root_objectid,
owner, offset, 1);
if (ret == 0) {
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+ /*
+ * We're adding refs to a tree block we already own, this
+ * should not happen at all.
+ */
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_crit(trans->fs_info,
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
+ bytenr, num_bytes, root_objectid);
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ WARN_ON(1);
+ btrfs_crit(trans->fs_info,
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+ return -EUCLEAN;
+ }
update_inline_extent_backref(path, iref, refs_to_add,
extent_op, NULL);
} else if (ret == -ENOENT) {
@@ -1192,24 +1169,6 @@
return ret;
}
-static int insert_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- u64 bytenr, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add)
-{
- int ret;
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- BUG_ON(refs_to_add != 1);
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
- root_objectid);
- } else {
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
- root_objectid, owner, offset,
- refs_to_add);
- }
- return ret;
-}
-
static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
@@ -1396,7 +1355,6 @@
struct btrfs_ref *generic_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
@@ -1405,23 +1363,21 @@
generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
- ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
- NULL, &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
else
- ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
btrfs_ref_tree_mod(fs_info, generic_ref);
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
- sub_pinned_bytes(fs_info, generic_ref);
-
return ret;
}
/*
* __btrfs_inc_extent_ref - insert backreference for a given extent
*
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
+ * how it works.
+ *
* @trans: Handle of transaction
*
* @node: The delayed ref node used to get the bytenr/length for
@@ -1472,7 +1428,6 @@
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
@@ -1497,11 +1452,17 @@
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
- ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
- owner, offset, refs_to_add);
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ BUG_ON(refs_to_add != 1);
+ ret = insert_tree_block_ref(trans, path, bytenr, parent,
+ root_objectid);
+ } else {
+ ret = insert_extent_data_ref(trans, path, bytenr, parent,
+ root_objectid, owner, offset,
+ refs_to_add);
+ }
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -1607,7 +1568,6 @@
}
again:
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
if (ret < 0) {
@@ -1708,8 +1668,7 @@
if (TRANS_ABORTED(trans)) {
if (insert_reserved)
- btrfs_pin_extent(trans->fs_info, node->bytenr,
- node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return 0;
}
@@ -1724,8 +1683,7 @@
else
BUG();
if (ret && insert_reserved)
- btrfs_pin_extent(trans->fs_info, node->bytenr,
- node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return ret;
}
@@ -1801,34 +1759,28 @@
{
int nr_items = 1; /* Dropping this ref head update. */
- if (head->total_ref_mod < 0) {
- struct btrfs_space_info *space_info;
- u64 flags;
+ /*
+ * We had csum deletions accounted for in our delayed refs rsv, we need
+ * to drop the csum leaves for this update from our delayed_refs_rsv.
+ */
+ if (head->total_ref_mod < 0 && head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+ }
- if (head->is_data)
- flags = BTRFS_BLOCK_GROUP_DATA;
- else if (head->is_system)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -head->num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ /*
+ * We were dropping refs, or had a new ref and dropped it, and thus must
+ * adjust down our total_bytes_pinned, the space may or may not have
+ * been pinned and so is accounted for properly in the pinned space by
+ * now.
+ */
+ if (head->total_ref_mod < 0 ||
+ (head->total_ref_mod == 0 && head->must_insert_reserved)) {
+ u64 flags = btrfs_ref_head_to_space_flags(head);
- /*
- * We had csum deletions accounted for in our delayed refs rsv,
- * we need to drop the csum leaves for this update from our
- * delayed_refs_rsv.
- */
- if (head->is_data) {
- spin_lock(&delayed_refs->lock);
- delayed_refs->pending_csums -= head->num_bytes;
- spin_unlock(&delayed_refs->lock);
- nr_items += btrfs_csum_bytes_to_leaves(fs_info,
- head->num_bytes);
- }
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
}
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
@@ -1870,8 +1822,7 @@
spin_unlock(&delayed_refs->lock);
if (head->must_insert_reserved) {
- btrfs_pin_extent(fs_info, head->bytenr,
- head->num_bytes, 1);
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
if (head->is_data) {
ret = btrfs_del_csums(trans, fs_info->csum_root,
head->bytenr, head->num_bytes);
@@ -2138,22 +2089,6 @@
}
#endif
-static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
-{
- u64 num_bytes;
-
- num_bytes = heads * (sizeof(struct btrfs_extent_item) +
- sizeof(struct btrfs_extent_inline_ref));
- if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- num_bytes += heads * sizeof(struct btrfs_tree_block_info);
-
- /*
- * We don't ever fill up leaves all the way so multiply by 2 just to be
- * closer to what we're really going to want to use.
- */
- return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
-}
-
/*
* Takes the number of bytes to be csumm'ed and figures out how many leaves it
* would require to store the csums for that many bytes.
@@ -2241,7 +2176,7 @@
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 flags,
+ struct extent_buffer *eb, u64 flags,
int level, int is_data)
{
struct btrfs_delayed_extent_op *extent_op;
@@ -2257,7 +2192,7 @@
extent_op->is_data = is_data ? true : false;
extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@@ -2471,7 +2406,7 @@
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
return 0;
if (full_backref)
@@ -2546,7 +2481,7 @@
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
int readonly = 0;
block_group = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2576,7 +2511,7 @@
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 bytenr;
spin_lock(&fs_info->block_group_cache_lock);
@@ -2590,13 +2525,14 @@
if (!cache)
return 0;
- bytenr = cache->key.objectid;
+ bytenr = cache->start;
btrfs_put_block_group(cache);
return bytenr;
}
-static int pin_down_extent(struct btrfs_block_group_cache *cache,
+static int pin_down_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -2613,25 +2549,21 @@
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
- num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
- set_extent_dirty(fs_info->pinned_extents, bytenr,
+ __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
+ set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
}
-/*
- * this function must be called within transaction
- */
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int reserved)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
- cache = btrfs_lookup_block_group(fs_info, bytenr);
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
- pin_down_extent(cache, bytenr, num_bytes, reserved);
+ pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
btrfs_put_block_group(cache);
return 0;
@@ -2640,13 +2572,15 @@
/*
* this function must be called within transaction
*/
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
int ret;
- cache = btrfs_lookup_block_group(fs_info, bytenr);
+ btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
+
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
if (!cache)
return -EINVAL;
@@ -2658,7 +2592,7 @@
*/
btrfs_cache_block_group(cache, 1);
- pin_down_extent(cache, bytenr, num_bytes, 0);
+ pin_down_extent(trans, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@@ -2670,7 +2604,7 @@
u64 start, u64 num_bytes)
{
int ret;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_caching_control *caching_ctl;
block_group = btrfs_lookup_block_group(fs_info, start);
@@ -2682,7 +2616,7 @@
if (!caching_ctl) {
/* Logic error */
- BUG_ON(!btrfs_block_group_cache_done(block_group));
+ BUG_ON(!btrfs_block_group_done(block_group));
ret = btrfs_remove_free_space(block_group, start, num_bytes);
} else {
mutex_lock(&caching_ctl->mutex);
@@ -2747,41 +2681,11 @@
}
static void
-btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
{
atomic_inc(&bg->reservations);
}
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_caching_control *next;
- struct btrfs_caching_control *caching_ctl;
- struct btrfs_block_group_cache *cache;
-
- down_write(&fs_info->commit_root_sem);
-
- list_for_each_entry_safe(caching_ctl, next,
- &fs_info->caching_block_groups, list) {
- cache = caching_ctl->block_group;
- if (btrfs_block_group_cache_done(cache)) {
- cache->last_byte_to_unpin = (u64)-1;
- list_del_init(&caching_ctl->list);
- btrfs_put_caching_control(caching_ctl);
- } else {
- cache->last_byte_to_unpin = caching_ctl->progress;
- }
- }
-
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
- fs_info->pinned_extents = &fs_info->freed_extents[1];
- else
- fs_info->pinned_extents = &fs_info->freed_extents[0];
-
- up_write(&fs_info->commit_root_sem);
-
- btrfs_update_global_block_rsv(fs_info);
-}
-
/*
* Returns the free cluster for the given space info and sets empty_cluster to
* what it should be based on the mount options.
@@ -2815,7 +2719,7 @@
u64 start, u64 end,
const bool return_free_space)
{
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_free_cluster *cluster = NULL;
@@ -2827,7 +2731,7 @@
while (start <= end) {
readonly = false;
if (!cache ||
- start >= cache->key.objectid + cache->key.offset) {
+ start >= cache->start + cache->length) {
if (cache)
btrfs_put_block_group(cache);
total_unpinned = 0;
@@ -2840,7 +2744,7 @@
empty_cluster <<= 1;
}
- len = cache->key.objectid + cache->key.offset - start;
+ len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
if (start < cache->last_byte_to_unpin && return_free_space) {
@@ -2871,8 +2775,7 @@
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(space_info, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
@@ -2894,11 +2797,10 @@
len -= to_add;
}
spin_unlock(&global_rsv->lock);
- /* Add to any tickets we may have */
- if (len)
- btrfs_try_granting_tickets(fs_info,
- space_info);
}
+ /* Add to any tickets we may have */
+ if (!readonly && return_free_space && len)
+ btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -2910,17 +2812,14 @@
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group *block_group, *tmp;
struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
- unpin = &fs_info->freed_extents[1];
- else
- unpin = &fs_info->freed_extents[0];
+ unpin = &trans->transaction->pinned_extents;
while (!TRANS_ABORTED(trans)) {
struct extent_state *cached_state = NULL;
@@ -2932,8 +2831,11 @@
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ clear_extent_bits(&fs_info->excluded_extents, start,
+ end, EXTENT_UPTODATE);
- if (btrfs_test_opt(fs_info, DISCARD))
+ if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
@@ -2944,6 +2846,11 @@
cond_resched();
}
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
+ btrfs_discard_calc_delay(&fs_info->discard_ctl);
+ btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
+ }
+
/*
* Transaction is finished. We don't need the lock anymore. We
* do need to clean up the block groups in case of a transaction
@@ -2956,12 +2863,12 @@
ret = -EROFS;
if (!TRANS_ABORTED(trans))
ret = btrfs_discard_extent(fs_info,
- block_group->key.objectid,
- block_group->key.offset,
+ block_group->start,
+ block_group->length,
&trimmed);
list_del_init(&block_group->bg_list);
- btrfs_put_block_group_trimming(block_group);
+ btrfs_unfreeze_block_group(block_group);
btrfs_put_block_group(block_group);
if (ret) {
@@ -2975,6 +2882,65 @@
return 0;
}
+/*
+ * Drop one or more refs of @node.
+ *
+ * 1. Locate the extent refs.
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
+ * Locate it, then reduce the refs number or remove the ref line completely.
+ *
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
+ *
+ * Inline backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 2 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 257
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 1 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ *
+ * Keyed backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 754 gen 6 flags DATA
+ * [...]
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
+ *
+ * This function get called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 866
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 753 gen 6 flags DATA
+ *
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
+ */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
@@ -3004,11 +2970,18 @@
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- BUG_ON(!is_data && refs_to_drop != 1);
+
+ if (!is_data && refs_to_drop != 1) {
+ btrfs_crit(info,
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
+ node->bytenr, refs_to_drop);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
if (is_data)
skinny_metadata = false;
@@ -3017,6 +2990,13 @@
parent, root_objectid, owner_objectid,
owner_offset);
if (ret == 0) {
+ /*
+ * Either the inline backref or the SHARED_DATA_REF/
+ * SHARED_BLOCK_REF is found
+ *
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
+ */
extent_slot = path->slots[0];
while (extent_slot >= 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -3033,13 +3013,21 @@
found_extent = 1;
break;
}
+
+ /* Quick path didn't find the EXTEMT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
}
if (!found_extent) {
- BUG_ON(iref);
+ if (iref) {
+ btrfs_crit(info,
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ /* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, path, NULL,
refs_to_drop,
is_data, &last_ref);
@@ -3050,6 +3038,7 @@
btrfs_release_path(path);
path->leave_spinning = 1;
+ /* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
@@ -3124,19 +3113,26 @@
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ btrfs_crit(info,
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
+ key.objectid, key.type, key.offset,
+ owner_objectid, item_size,
+ sizeof(*ei) + sizeof(*bi));
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
- btrfs_err(info,
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
+ btrfs_crit(info,
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
refs_to_drop, refs, bytenr);
- ret = -EINVAL;
- btrfs_abort_transaction(trans, ret);
- goto out;
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
}
refs -= refs_to_drop;
@@ -3148,7 +3144,12 @@
* be updated by remove_extent_backref
*/
if (iref) {
- BUG_ON(!found_extent);
+ if (!found_extent) {
+ btrfs_crit(info,
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
@@ -3163,13 +3164,39 @@
}
}
} else {
+ /* In this branch refs == 1 */
if (found_extent) {
- BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(path, iref));
+ if (is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref)) {
+ btrfs_crit(info,
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
+ extent_data_ref_count(path, iref),
+ refs_to_drop);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
if (iref) {
- BUG_ON(path->slots[0] != extent_slot);
+ if (path->slots[0] != extent_slot) {
+ btrfs_crit(info,
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
+ key.objectid, key.type,
+ key.offset);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
- BUG_ON(path->slots[0] != extent_slot + 1);
+ /*
+ * No inline ref, we must be at SHARED_* item,
+ * And it's single ref, it must be:
+ * | extent_slot ||extent_slot + 1|
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
+ */
+ if (path->slots[0] != extent_slot + 1) {
+ btrfs_crit(info,
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
path->slots[0] = extent_slot;
num_to_del = 2;
}
@@ -3210,6 +3237,19 @@
out:
btrfs_free_path(path);
return ret;
+err_dump:
+ /*
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
+ * dump for debug build.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
+ path->slots[0], extent_slot);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+
+ btrfs_free_path(path);
+ return -EUCLEAN;
}
/*
@@ -3274,7 +3314,6 @@
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ref generic_ref = { 0 };
- int pin = 1;
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
@@ -3283,17 +3322,13 @@
root->root_key.objectid);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- int old_ref_mod, new_ref_mod;
-
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
BUG_ON(ret); /* -ENOMEM */
- pin = old_ref_mod >= 0 && new_ref_mod < 0;
}
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
@@ -3301,11 +3336,10 @@
goto out;
}
- pin = 0;
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(cache, buf->start, buf->len, 1);
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);
goto out;
}
@@ -3318,9 +3352,6 @@
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
}
out:
- if (pin)
- add_pinned_bytes(fs_info, &generic_ref);
-
if (last_ref) {
/*
* Deleting the buffer, clear the corrupt flag since it doesn't
@@ -3334,7 +3365,6 @@
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
@@ -3349,15 +3379,12 @@
(ref->type == BTRFS_REF_DATA &&
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
- btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
- old_ref_mod = new_ref_mod = 0;
+ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
- ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
} else {
- ret = btrfs_add_delayed_data_ref(trans, ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0);
}
if (!((ref->type == BTRFS_REF_METADATA &&
@@ -3366,9 +3393,6 @@
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
- add_pinned_bytes(fs_info, ref);
-
return ret;
}
@@ -3380,15 +3404,14 @@
};
static inline void
-btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+btrfs_lock_block_group(struct btrfs_block_group *cache,
int delalloc)
{
if (delalloc)
down_read(&cache->data_rwsem);
}
-static inline void
-btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
int delalloc)
{
btrfs_get_block_group(cache);
@@ -3396,12 +3419,13 @@
down_read(&cache->data_rwsem);
}
-static struct btrfs_block_group_cache *
-btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+static struct btrfs_block_group *btrfs_lock_cluster(
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
int delalloc)
+ __acquires(&cluster->refill_lock)
{
- struct btrfs_block_group_cache *used_bg = NULL;
+ struct btrfs_block_group *used_bg = NULL;
spin_lock(&cluster->refill_lock);
while (1) {
@@ -3435,7 +3459,7 @@
}
static inline void
-btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+btrfs_release_block_group(struct btrfs_block_group *cache,
int delalloc)
{
if (delalloc)
@@ -3443,13 +3467,16 @@
btrfs_put_block_group(cache);
}
+enum btrfs_extent_allocation_policy {
+ BTRFS_EXTENT_ALLOC_CLUSTERED,
+};
+
/*
* Structure used internally for find_free_extent() function. Wraps needed
* parameters.
*/
struct find_free_extent_ctl {
/* Basic allocation info */
- u64 ram_bytes;
u64 num_bytes;
u64 empty_size;
u64 flags;
@@ -3460,6 +3487,8 @@
/* For clustered allocation */
u64 empty_cluster;
+ struct btrfs_free_cluster *last_ptr;
+ bool use_cluster;
bool have_caching_bg;
bool orig_have_caching_bg;
@@ -3495,6 +3524,12 @@
/* Found result */
u64 found_offset;
+
+ /* Hint where to start looking for an empty space */
+ u64 hint_byte;
+
+ /* Allocation policy */
+ enum btrfs_extent_allocation_policy policy;
};
@@ -3506,12 +3541,12 @@
* Return >0 to inform caller that we find nothing
* Return 0 means we have found a location and set ffe_ctl->found_offset.
*/
-static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
- struct btrfs_free_cluster *last_ptr,
- struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group_cache **cluster_bg_ret)
+static int find_free_extent_clustered(struct btrfs_block_group *bg,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **cluster_bg_ret)
{
- struct btrfs_block_group_cache *cluster_bg;
+ struct btrfs_block_group *cluster_bg;
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 aligned_cluster;
u64 offset;
int ret;
@@ -3524,7 +3559,7 @@
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
- ffe_ctl->num_bytes, cluster_bg->key.objectid,
+ ffe_ctl->num_bytes, cluster_bg->start,
&ffe_ctl->max_extent_size);
if (offset) {
/* We have a block, we're done */
@@ -3610,10 +3645,10 @@
* Return 0 when we found an free extent and set ffe_ctrl->found_offset
* Return -EAGAIN to inform caller that we need to re-search this block group
*/
-static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
- struct btrfs_free_cluster *last_ptr,
- struct find_free_extent_ctl *ffe_ctl)
+static int find_free_extent_unclustered(struct btrfs_block_group *bg,
+ struct find_free_extent_ctl *ffe_ctl)
{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 offset;
/*
@@ -3669,16 +3704,101 @@
return 0;
}
+static int do_allocation_clustered(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ int ret;
+
+ /* We want to try and use the cluster allocator, so lets look there */
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ /* ret == -ENOENT case falls through */
+ }
+
+ return find_free_extent_unclustered(block_group, ffe_ctl);
+}
+
+static int do_allocation(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
+ default:
+ BUG();
+ }
+}
+
+static void release_block_group(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ int delalloc)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ break;
+ default:
+ BUG();
+ }
+
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+ ffe_ctl->index);
+ btrfs_release_block_group(block_group, delalloc);
+}
+
+static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ if (!ffe_ctl->use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
+}
+
+static void found_extent(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ found_extent_clustered(ffe_ctl, ins);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ /*
+ * If we can't allocate a new chunk we've already looped through
+ * at least once, move on to the NO_EMPTY_SIZE case.
+ */
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+ return 0;
+ default:
+ BUG();
+ }
+}
+
/*
* Return >0 means caller needs to re-search for free extent
* Return 0 means we have the needed free extent.
* Return <0 means we failed to locate any free extent.
*/
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
- struct btrfs_free_cluster *last_ptr,
struct btrfs_key *ins,
struct find_free_extent_ctl *ffe_ctl,
- int full_search, bool use_cluster)
+ bool full_search)
{
struct btrfs_root *root = fs_info->extent_root;
int ret;
@@ -3695,11 +3815,7 @@
return 1;
if (ins->objectid) {
- if (!use_cluster && last_ptr) {
- spin_lock(&last_ptr->lock);
- last_ptr->window_start = ins->objectid;
- spin_unlock(&last_ptr->lock);
- }
+ found_extent(ffe_ctl, ins);
return 0;
}
@@ -3745,16 +3861,10 @@
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
CHUNK_ALLOC_FORCE);
- /*
- * If we can't allocate a new chunk we've already looped
- * through at least once, move on to the NO_EMPTY_SIZE
- * case.
- */
- if (ret == -ENOSPC)
- ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
-
/* Do not bail out on ENOSPC since we can do more. */
- if (ret < 0 && ret != -ENOSPC)
+ if (ret == -ENOSPC)
+ ret = chunk_allocation_failed(ffe_ctl);
+ else if (ret < 0)
btrfs_abort_transaction(trans, ret);
else
ret = 0;
@@ -3765,6 +3875,9 @@
}
if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
+ if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
+ return -ENOSPC;
+
/*
* Don't loop again if we already have no empty_size and
* no empty_cluster.
@@ -3780,6 +3893,71 @@
return -ENOSPC;
}
+static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ /*
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
+ */
+ if (space_info->max_extent_size) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ ffe_ctl->num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
+ }
+
+ ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
+ &ffe_ctl->empty_cluster);
+ if (ffe_ctl->last_ptr) {
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ spin_lock(&last_ptr->lock);
+ if (last_ptr->block_group)
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&last_ptr->lock);
+ }
+
+ return 0;
+}
+
+static int prepare_allocation(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return prepare_allocation_clustered(fs_info, ffe_ctl,
+ space_info, ins);
+ default:
+ BUG();
+ }
+}
+
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@@ -3807,33 +3985,36 @@
*/
static noinline int find_free_extent(struct btrfs_root *root,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
- u64 hint_byte, struct btrfs_key *ins,
+ u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
- struct btrfs_free_cluster *last_ptr = NULL;
- struct btrfs_block_group_cache *block_group = NULL;
+ struct btrfs_block_group *block_group = NULL;
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
- bool use_cluster = true;
bool full_search = false;
WARN_ON(num_bytes < fs_info->sectorsize);
- ffe_ctl.ram_bytes = ram_bytes;
ffe_ctl.num_bytes = num_bytes;
ffe_ctl.empty_size = empty_size;
ffe_ctl.flags = flags;
ffe_ctl.search_start = 0;
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
ffe_ctl.delalloc = delalloc;
ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
ffe_ctl.have_caching_bg = false;
ffe_ctl.orig_have_caching_bg = false;
ffe_ctl.found_offset = 0;
+ ffe_ctl.hint_byte = hint_byte_orig;
+ ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+
+ /* For clustered allocation */
+ ffe_ctl.retry_clustered = false;
+ ffe_ctl.retry_unclustered = false;
+ ffe_ctl.last_ptr = NULL;
+ ffe_ctl.use_cluster = true;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
@@ -3847,51 +4028,14 @@
return -ENOSPC;
}
- /*
- * If our free space is heavily fragmented we may not be able to make
- * big contiguous allocations, so instead of doing the expensive search
- * for free space, simply return ENOSPC with our max_extent_size so we
- * can go ahead and search for a more manageable chunk.
- *
- * If our max_extent_size is large enough for our allocation simply
- * disable clustering since we will likely not be able to find enough
- * space to create a cluster and induce latency trying.
- */
- if (unlikely(space_info->max_extent_size)) {
- spin_lock(&space_info->lock);
- if (space_info->max_extent_size &&
- num_bytes > space_info->max_extent_size) {
- ins->offset = space_info->max_extent_size;
- spin_unlock(&space_info->lock);
- return -ENOSPC;
- } else if (space_info->max_extent_size) {
- use_cluster = false;
- }
- spin_unlock(&space_info->lock);
- }
-
- last_ptr = fetch_cluster_info(fs_info, space_info,
- &ffe_ctl.empty_cluster);
- if (last_ptr) {
- spin_lock(&last_ptr->lock);
- if (last_ptr->block_group)
- hint_byte = last_ptr->window_start;
- if (last_ptr->fragmented) {
- /*
- * We still set window_start so we can keep track of the
- * last place we found an allocation to try and save
- * some time.
- */
- hint_byte = last_ptr->window_start;
- use_cluster = false;
- }
- spin_unlock(&last_ptr->lock);
- }
+ ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
+ if (ret < 0)
+ return ret;
ffe_ctl.search_start = max(ffe_ctl.search_start,
first_logical_byte(fs_info, 0));
- ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
- if (ffe_ctl.search_start == hint_byte) {
+ ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
+ if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
ffe_ctl.search_start);
/*
@@ -3932,12 +4076,14 @@
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
&space_info->block_groups[ffe_ctl.index], list) {
+ struct btrfs_block_group *bg_ret;
+
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro))
continue;
btrfs_grab_block_group(block_group, delalloc);
- ffe_ctl.search_start = block_group->key.objectid;
+ ffe_ctl.search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
@@ -3968,7 +4114,7 @@
}
have_block_group:
- ffe_ctl.cached = btrfs_block_group_cache_done(block_group);
+ ffe_ctl.cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl.cached)) {
ffe_ctl.have_caching_bg = true;
ret = btrfs_cache_block_group(block_group, 0);
@@ -3992,45 +4138,26 @@
if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
goto loop;
- /*
- * Ok we want to try and use the cluster allocator, so
- * lets look there
- */
- if (last_ptr && use_cluster) {
- struct btrfs_block_group_cache *cluster_bg = NULL;
-
- ret = find_free_extent_clustered(block_group, last_ptr,
- &ffe_ctl, &cluster_bg);
-
- if (ret == 0) {
- if (cluster_bg && cluster_bg != block_group) {
- btrfs_release_block_group(block_group,
- delalloc);
- block_group = cluster_bg;
- }
- goto checks;
- } else if (ret == -EAGAIN) {
- goto have_block_group;
- } else if (ret > 0) {
- goto loop;
+ bg_ret = NULL;
+ ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
+ if (ret == 0) {
+ if (bg_ret && bg_ret != block_group) {
+ btrfs_release_block_group(block_group, delalloc);
+ block_group = bg_ret;
}
- /* ret == -ENOENT case falls through */
+ } else if (ret == -EAGAIN) {
+ goto have_block_group;
+ } else if (ret > 0) {
+ goto loop;
}
- ret = find_free_extent_unclustered(block_group, last_ptr,
- &ffe_ctl);
- if (ret == -EAGAIN)
- goto have_block_group;
- else if (ret > 0)
- goto loop;
- /* ret == 0 case falls through */
-checks:
+ /* Checks */
ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
fs_info->stripesize);
/* move on to the next group */
if (ffe_ctl.search_start + num_bytes >
- block_group->key.objectid + block_group->key.offset) {
+ block_group->start + block_group->length) {
btrfs_add_free_space(block_group, ffe_ctl.found_offset,
num_bytes);
goto loop;
@@ -4058,17 +4185,12 @@
btrfs_release_block_group(block_group, delalloc);
break;
loop:
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
- ffe_ctl.index);
- btrfs_release_block_group(block_group, delalloc);
+ release_block_group(block_group, &ffe_ctl, delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
- full_search, use_cluster);
+ ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
if (ret > 0)
goto search;
@@ -4177,12 +4299,10 @@
return ret;
}
-static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len,
- int pin, int delalloc)
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len, int delalloc)
{
- struct btrfs_block_group_cache *cache;
- int ret = 0;
+ struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
@@ -4191,32 +4311,32 @@
return -ENOSPC;
}
- if (pin)
- pin_down_extent(cache, start, len, 1);
- else {
- if (btrfs_test_opt(fs_info, DISCARD))
- ret = btrfs_discard_extent(fs_info, start, len, NULL);
- btrfs_add_free_space(cache, start, len);
- btrfs_free_reserved_bytes(cache, len, delalloc);
- trace_btrfs_reserved_extent_free(fs_info, start, len);
- }
+ btrfs_add_free_space(cache, start, len);
+ btrfs_free_reserved_bytes(cache, len, delalloc);
+ trace_btrfs_reserved_extent_free(fs_info, start, len);
btrfs_put_block_group(cache);
+ return 0;
+}
+
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
+ u64 len)
+{
+ struct btrfs_block_group *cache;
+ int ret = 0;
+
+ cache = btrfs_lookup_block_group(trans->fs_info, start);
+ if (!cache) {
+ btrfs_err(trans->fs_info, "unable to find block group for %llu",
+ start);
+ return -ENOSPC;
+ }
+
+ ret = pin_down_extent(trans, cache, start, len, 1);
+ btrfs_put_block_group(cache);
return ret;
}
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc)
-{
- return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
-}
-
-int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len)
-{
- return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
-}
-
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
@@ -4390,7 +4510,6 @@
struct btrfs_key *ins)
{
struct btrfs_ref generic_ref = { 0 };
- int ret;
BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
@@ -4398,9 +4517,8 @@
ins->objectid, ins->offset, 0);
btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
- ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
- ram_bytes, NULL, NULL);
- return ret;
+
+ return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}
/*
@@ -4414,7 +4532,7 @@
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
/*
@@ -4443,14 +4561,15 @@
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
offset, ins, 1);
if (ret)
- btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1);
+ btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
btrfs_put_block_group(block_group);
return ret;
}
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, int level, u64 owner)
+ u64 bytenr, int level, u64 owner,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
@@ -4473,7 +4592,7 @@
}
btrfs_set_buffer_lockdep_class(owner, buf, level);
- btrfs_tree_lock(buf);
+ __btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -4519,7 +4638,8 @@
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
@@ -4535,7 +4655,7 @@
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- level, root_objectid);
+ level, root_objectid, nest);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -4552,7 +4672,7 @@
goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
- root_objectid);
+ root_objectid, nest);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_free_reserved;
@@ -4586,8 +4706,7 @@
generic_ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&generic_ref, level, root_objectid);
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
- extent_op, NULL, NULL);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
goto out_free_delayed;
}
@@ -4596,6 +4715,7 @@
out_free_delayed:
btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
+ btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
@@ -4762,8 +4882,7 @@
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_set_disk_extent_flags(trans, eb->start,
- eb->len, flag,
+ ret = btrfs_set_disk_extent_flags(trans, eb, flag,
btrfs_header_level(eb), 0);
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
@@ -5221,9 +5340,7 @@
*
* If called with for_reloc == 0, may exit early with -EAGAIN
*/
-int btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref,
- int for_reloc)
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
@@ -5269,9 +5386,6 @@
if (err)
goto out_end_trans;
- if (block_rsv)
- trans->block_rsv = block_rsv;
-
/*
* This will help us catch people modifying the fs tree while we're
* dropping it. It is unsafe to mess with the fs tree while it's being
@@ -5407,8 +5521,6 @@
err = PTR_ERR(trans);
goto out_free;
}
- if (block_rsv)
- trans->block_rsv = block_rsv;
}
}
btrfs_release_path(path);
@@ -5440,13 +5552,18 @@
}
}
- if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
+ /*
+ * This subvolume is going to be completely dropped, and won't be
+ * recorded as dirty roots, thus pertrans meta rsv will not be freed at
+ * commit transaction time. So free it here manually.
+ */
+ btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
+ btrfs_qgroup_free_meta_all_pertrans(root);
+
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
- } else {
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- btrfs_put_fs_root(root);
- }
+ else
+ btrfs_put_root(root);
root_dropped = true;
out_end_trans:
btrfs_end_transaction_throttle(trans);
@@ -5499,7 +5616,7 @@
btrfs_assert_tree_locked(parent);
parent_level = btrfs_header_level(parent);
- extent_buffer_get(parent);
+ atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
@@ -5543,7 +5660,7 @@
*/
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
u64 free_bytes = 0;
int factor;
@@ -5561,9 +5678,8 @@
}
factor = btrfs_bg_type_to_factor(block_group->flags);
- free_bytes += (block_group->key.offset -
- btrfs_block_group_used(&block_group->item)) *
- factor;
+ free_bytes += (block_group->length -
+ block_group->used) * factor;
spin_unlock(&block_group->lock);
}
@@ -5699,7 +5815,7 @@
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
struct btrfs_device *device;
struct list_head *devices;
u64 group_trimmed;
@@ -5723,16 +5839,16 @@
cache = btrfs_lookup_first_block_group(fs_info, range->start);
for (; cache; cache = btrfs_next_block_group(cache)) {
- if (cache->key.objectid >= range_end) {
+ if (cache->start >= range_end) {
btrfs_put_block_group(cache);
break;
}
- start = max(range->start, cache->key.objectid);
- end = min(range_end, cache->key.objectid + cache->key.offset);
+ start = max(range->start, cache->start);
+ end = min(range_end, cache->start + cache->length);
if (end - start >= range->minlen) {
- if (!btrfs_block_group_cache_done(cache)) {
+ if (!btrfs_block_group_done(cache)) {
ret = btrfs_cache_block_group(cache, 0);
if (ret) {
bg_failed++;
@@ -5791,47 +5907,3 @@
return bg_ret;
return dev_ret;
}
-
-/*
- * btrfs_{start,end}_write_no_snapshotting() are similar to
- * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
- * data into the page cache through nocow before the subvolume is snapshoted,
- * but flush the data into disk after the snapshot creation, or to prevent
- * operations while snapshotting is ongoing and that cause the snapshot to be
- * inconsistent (writes followed by expanding truncates for example).
- */
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
-{
- percpu_counter_dec(&root->subv_writers->counter);
- cond_wake_up(&root->subv_writers->wait);
-}
-
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
-{
- if (atomic_read(&root->will_be_snapshotted))
- return 0;
-
- percpu_counter_inc(&root->subv_writers->counter);
- /*
- * Make sure counter is updated before we check for snapshot creation.
- */
- smp_mb();
- if (atomic_read(&root->will_be_snapshotted)) {
- btrfs_end_write_no_snapshotting(root);
- return 0;
- }
- return 1;
-}
-
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
-{
- while (true) {
- int ret;
-
- ret = btrfs_start_write_no_snapshotting(root);
- if (ret)
- break;
- wait_var_event(&root->will_be_snapshotted,
- !atomic_read(&root->will_be_snapshotted));
- }
-}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9108a73..5fc65a7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -14,6 +14,7 @@
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
+#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
@@ -34,36 +35,59 @@
}
#ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(buffers);
static LIST_HEAD(states);
-
static DEFINE_SPINLOCK(leak_lock);
-static inline
-void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+static inline void btrfs_leak_debug_add(spinlock_t *lock,
+ struct list_head *new,
+ struct list_head *head)
{
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
+ spin_lock_irqsave(lock, flags);
list_add(new, head);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
-static inline
-void btrfs_leak_debug_del(struct list_head *entry)
+static inline void btrfs_leak_debug_del(spinlock_t *lock,
+ struct list_head *entry)
{
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
+ spin_lock_irqsave(lock, flags);
list_del(entry);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
-static inline
-void btrfs_leak_debug_check(void)
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
+{
+ struct extent_buffer *eb;
+ unsigned long flags;
+
+ /*
+ * If we didn't get into open_ctree our allocated_ebs will not be
+ * initialized, so just skip this.
+ */
+ if (!fs_info->allocated_ebs.next)
+ return;
+
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ while (!list_empty(&fs_info->allocated_ebs)) {
+ eb = list_first_entry(&fs_info->allocated_ebs,
+ struct extent_buffer, leak_list);
+ pr_err(
+ "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_header_owner(eb));
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
{
struct extent_state *state;
- struct extent_buffer *eb;
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
@@ -74,14 +98,6 @@
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
-
- while (!list_empty(&buffers)) {
- eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
- list_del(&eb->leak_list);
- kmem_cache_free(extent_buffer_cache, eb);
- }
}
#define btrfs_debug_check_extent_io_range(tree, start, end) \
@@ -103,9 +119,9 @@
}
}
#else
-#define btrfs_leak_debug_add(new, head) do {} while (0)
-#define btrfs_leak_debug_del(entry) do {} while (0)
-#define btrfs_leak_debug_check() do {} while (0)
+#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
+#define btrfs_leak_debug_del(lock, entry) do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
@@ -117,7 +133,6 @@
struct extent_page_data {
struct bio *bio;
- struct extent_io_tree *tree;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@@ -145,19 +160,20 @@
return ret;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
{
blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags);
+ if (is_data_inode(tree->private_data))
+ ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ bio_flags);
else
- btrfsic_submit_bio(bio);
+ ret = btrfs_submit_metadata_bio(tree->private_data, bio,
+ mirror_num, bio_flags);
return blk_status_to_errno(ret);
}
@@ -196,19 +212,23 @@
return ret;
}
-int __init extent_io_init(void)
+int __init extent_state_cache_init(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
+ return 0;
+}
+int __init extent_io_init(void)
+{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
- goto free_state_cache;
+ return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_io_bio, bio),
@@ -226,38 +246,47 @@
free_buffer_cache:
kmem_cache_destroy(extent_buffer_cache);
extent_buffer_cache = NULL;
-
-free_state_cache:
- kmem_cache_destroy(extent_state_cache);
- extent_state_cache = NULL;
return -ENOMEM;
}
+void __cold extent_state_cache_exit(void)
+{
+ btrfs_extent_state_leak_debug_check();
+ kmem_cache_destroy(extent_state_cache);
+}
+
void __cold extent_io_exit(void)
{
- btrfs_leak_debug_check();
-
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
*/
rcu_barrier();
- kmem_cache_destroy(extent_state_cache);
kmem_cache_destroy(extent_buffer_cache);
bioset_exit(&btrfs_bioset);
}
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data)
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
- tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
void extent_io_tree_release(struct extent_io_tree *tree)
@@ -304,7 +333,7 @@
state->state = 0;
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
- btrfs_leak_debug_add(&state->leak_list, &states);
+ btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
@@ -317,7 +346,7 @@
return;
if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
- btrfs_leak_debug_del(&state->leak_list);
+ btrfs_leak_debug_del(&leak_lock, &state->leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
@@ -1041,6 +1070,16 @@
goto out;
}
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it, not necessary to split it or do anything with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
@@ -1556,6 +1595,43 @@
}
/**
+ * find_contiguous_extent_bit: find a contiguous area of bits
+ * @tree - io tree to check
+ * @start - offset to start the search from
+ * @start_ret - the first offset we found with the bits set
+ * @end_ret - the final contiguous range of the bits that were set
+ * @bits - bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/**
* find_first_clear_extent_bit - find the first range that has @bits not set.
* This range could start before @start.
*
@@ -1678,9 +1754,9 @@
*
* true is returned if we find something, false if nothing was in the tree
*/
-static noinline bool find_delalloc_range(struct extent_io_tree *tree,
- u64 *start, u64 *end, u64 max_bytes,
- struct extent_state **cached_state)
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
{
struct rb_node *node;
struct extent_state *state;
@@ -1798,8 +1874,8 @@
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
- found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
- max_bytes, &cached_state);
+ found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ max_bytes, &cached_state);
if (!found || delalloc_end <= *start) {
*start = delalloc_start;
*end = delalloc_end;
@@ -1940,15 +2016,14 @@
return err;
}
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned clear_bits,
unsigned long page_ops)
{
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
- NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
- __process_pages_contig(inode->i_mapping, locked_page,
+ __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start >> PAGE_SHIFT, end >> PAGE_SHIFT,
page_ops, NULL);
}
@@ -2017,8 +2092,8 @@
* set the private field for a given byte offset in the tree. If there isn't
* an extent_state there already, this does nothing.
*/
-static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec)
+int set_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record *failrec)
{
struct rb_node *node;
struct extent_state *state;
@@ -2045,12 +2120,11 @@
return ret;
}
-static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec)
+struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
{
struct rb_node *node;
struct extent_state *state;
- int ret = 0;
+ struct io_failure_record *failrec;
spin_lock(&tree->lock);
/*
@@ -2059,18 +2133,19 @@
*/
node = tree_search(tree, start);
if (!node) {
- ret = -ENOENT;
+ failrec = ERR_PTR(-ENOENT);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
if (state->start != start) {
- ret = -ENOENT;
+ failrec = ERR_PTR(-ENOENT);
goto out;
}
- *failrec = state->failrec;
+
+ failrec = state->failrec;
out:
spin_unlock(&tree->lock);
- return ret;
+ return failrec;
}
/*
@@ -2256,7 +2331,7 @@
return 0;
}
-int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
+int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 start = eb->start;
@@ -2300,8 +2375,8 @@
if (!ret)
return 0;
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret)
+ failrec = get_state_failrec(failure_tree, start);
+ if (IS_ERR(failrec))
return 0;
BUG_ON(!failrec->this_mirror);
@@ -2373,8 +2448,8 @@
spin_unlock(&failure_tree->lock);
}
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret)
+static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
+ u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct io_failure_record *failrec;
@@ -2385,65 +2460,8 @@
int ret;
u64 logical;
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret) {
- failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
- if (!failrec)
- return -ENOMEM;
-
- failrec->start = start;
- failrec->len = end - start + 1;
- failrec->this_mirror = 0;
- failrec->bio_flags = 0;
- failrec->in_validation = 0;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, failrec->len);
- if (!em) {
- read_unlock(&em_tree->lock);
- kfree(failrec);
- return -EIO;
- }
-
- if (em->start > start || em->start + em->len <= start) {
- free_extent_map(em);
- em = NULL;
- }
- read_unlock(&em_tree->lock);
- if (!em) {
- kfree(failrec);
- return -EIO;
- }
-
- logical = start - em->start;
- logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags,
- em->compress_type);
- }
-
- btrfs_debug(fs_info,
- "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
- logical, start, failrec->len);
-
- failrec->logical = logical;
- free_extent_map(em);
-
- /* set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret >= 0)
- ret = set_state_failrec(failure_tree, start, failrec);
- /* set the bits in the inode's tree */
- if (ret >= 0)
- ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
- if (ret < 0) {
- kfree(failrec);
- return ret;
- }
- } else {
+ failrec = get_state_failrec(failure_tree, start);
+ if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
failrec->logical, failrec->start, failrec->len,
@@ -2453,15 +2471,71 @@
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
+
+ return failrec;
}
- *failrec_ret = failrec;
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return ERR_PTR(-ENOMEM);
- return 0;
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->this_mirror = 0;
+ failrec->bio_flags = 0;
+ failrec->in_validation = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ if (em->start > start || em->start + em->len <= start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags, em->compress_type);
+ }
+
+ btrfs_debug(fs_info,
+ "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
+ logical, start, failrec->len);
+
+ failrec->logical = logical;
+ free_extent_map(em);
+
+ /* Set the bits in the private failure tree */
+ ret = set_extent_bits(failure_tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY);
+ if (ret >= 0) {
+ ret = set_state_failrec(failure_tree, start, failrec);
+ /* Set the bits in the inode's tree */
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
+ } else if (ret < 0) {
+ kfree(failrec);
+ return ERR_PTR(ret);
+ }
+
+ return failrec;
}
-bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
- struct io_failure_record *failrec, int failed_mirror)
+static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
+ struct io_failure_record *failrec,
+ int failed_mirror)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int num_copies;
@@ -2484,7 +2558,7 @@
* a) deliver good data to the caller
* b) correct the bad sectors on disk
*/
- if (failed_bio_pages > 1) {
+ if (needs_validation) {
/*
* to fulfill b), we need to know the exact failing sectors, as
* we don't want to rewrite any more than the failed ones. thus,
@@ -2523,95 +2597,114 @@
return true;
}
-
-struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
- struct io_failure_record *failrec,
- struct page *page, int pg_offset, int icsum,
- bio_end_io_t *endio_func, void *data)
+static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct bio *bio;
- struct btrfs_io_bio *btrfs_failed_bio;
- struct btrfs_io_bio *btrfs_bio;
+ u64 len = 0;
+ const u32 blocksize = inode->i_sb->s_blocksize;
- bio = btrfs_io_bio_alloc(1);
- bio->bi_end_io = endio_func;
- bio->bi_iter.bi_sector = failrec->logical >> 9;
- bio_set_dev(bio, fs_info->fs_devices->latest_bdev);
- bio->bi_iter.bi_size = 0;
- bio->bi_private = data;
+ /*
+ * If bi_status is BLK_STS_OK, then this was a checksum error, not an
+ * I/O error. In this case, we already know exactly which sector was
+ * bad, so we don't need to validate.
+ */
+ if (bio->bi_status == BLK_STS_OK)
+ return false;
- btrfs_failed_bio = btrfs_io_bio(failed_bio);
- if (btrfs_failed_bio->csum) {
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ /*
+ * We need to validate each sector individually if the failed I/O was
+ * for multiple sectors.
+ *
+ * There are a few possible bios that can end up here:
+ * 1. A buffered read bio, which is not cloned.
+ * 2. A direct I/O read bio, which is cloned.
+ * 3. A (buffered or direct) repair bio, which is not cloned.
+ *
+ * For cloned bios (case 2), we can get the size from
+ * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
+ * it from the bvecs.
+ */
+ if (bio_flagged(bio, BIO_CLONED)) {
+ if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
+ return true;
+ } else {
+ struct bio_vec *bvec;
+ int i;
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_bio->csum = btrfs_bio->csum_inline;
- icsum *= csum_size;
- memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
- csum_size);
+ bio_for_each_bvec_all(bvec, bio, i) {
+ len += bvec->bv_len;
+ if (len > blocksize)
+ return true;
+ }
}
-
- bio_add_page(bio, page, failrec->len, pg_offset);
-
- return bio;
+ return false;
}
-/*
- * This is a generic handler for readpage errors. If other copies exist, read
- * those and write back good data to the failed position. Does not investigate
- * in remapping the failed extent elsewhere, hoping the device will be smart
- * enough to do this as needed
- */
-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int failed_mirror)
+blk_status_t btrfs_submit_read_repair(struct inode *inode,
+ struct bio *failed_bio, u64 phy_offset,
+ struct page *page, unsigned int pgoff,
+ u64 start, u64 end, int failed_mirror,
+ submit_bio_hook_t *submit_bio_hook)
{
struct io_failure_record *failrec;
- struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct bio *bio;
- int read_mode = 0;
+ struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+ const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits;
+ bool need_validation;
+ struct bio *repair_bio;
+ struct btrfs_io_bio *repair_io_bio;
blk_status_t status;
- int ret;
- unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
+
+ btrfs_debug(fs_info,
+ "repair read error: read error at %llu", start);
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
- if (ret)
- return ret;
+ failrec = btrfs_get_io_failure_record(inode, start, end);
+ if (IS_ERR(failrec))
+ return errno_to_blk_status(PTR_ERR(failrec));
- if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
+ need_validation = btrfs_io_needs_validation(inode, failed_bio);
+
+ if (!btrfs_check_repairable(inode, need_validation, failrec,
failed_mirror)) {
free_io_failure(failure_tree, tree, failrec);
- return -EIO;
+ return BLK_STS_IOERR;
}
- if (failed_bio_pages > 1)
- read_mode |= REQ_FAILFAST_DEV;
+ repair_bio = btrfs_io_bio_alloc(1);
+ repair_io_bio = btrfs_io_bio(repair_bio);
+ repair_bio->bi_opf = REQ_OP_READ;
+ if (need_validation)
+ repair_bio->bi_opf |= REQ_FAILFAST_DEV;
+ repair_bio->bi_end_io = failed_bio->bi_end_io;
+ repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
+ repair_bio->bi_private = failed_bio->bi_private;
- phy_offset >>= inode->i_sb->s_blocksize_bits;
- bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- start - page_offset(page),
- (int)phy_offset, failed_bio->bi_end_io,
- NULL);
- bio->bi_opf = REQ_OP_READ | read_mode;
+ if (failed_io_bio->csum) {
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+ repair_io_bio->csum = repair_io_bio->csum_inline;
+ memcpy(repair_io_bio->csum,
+ failed_io_bio->csum + csum_size * icsum, csum_size);
+ }
+
+ bio_add_page(repair_bio, page, failrec->len, pgoff);
+ repair_io_bio->logical = failrec->start;
+ repair_io_bio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
- "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
- read_mode, failrec->this_mirror, failrec->in_validation);
+"repair read error: submitting new read to mirror %d, in_validation=%d",
+ failrec->this_mirror, failrec->in_validation);
- status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
- failrec->bio_flags);
+ status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
+ failrec->bio_flags);
if (status) {
free_io_failure(failure_tree, tree, failrec);
- bio_put(bio);
- ret = blk_status_to_errno(status);
+ bio_put(repair_bio);
}
-
- return ret;
+ return status;
}
/* lots and lots of room for performance fixes in the end_bio funcs */
@@ -2724,8 +2817,6 @@
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- bool data_inode = btrfs_ino(BTRFS_I(inode))
- != BTRFS_BTREE_INODE_OBJECTID;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2756,9 +2847,12 @@
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
- ret = tree->ops->readpage_end_io_hook(io_bio, offset,
- page, start, end,
- mirror);
+ if (is_data_inode(inode))
+ ret = btrfs_verify_data_csum(io_bio, offset, page,
+ start, end, mirror);
+ else
+ ret = btrfs_validate_metadata_buffer(io_bio,
+ offset, page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2771,7 +2865,7 @@
if (likely(uptodate))
goto readpage_ok;
- if (data_inode) {
+ if (is_data_inode(inode)) {
/*
* The generic bio_readpage_error handles errors the
@@ -2783,9 +2877,10 @@
* If it can't handle the error it will return -EIO and
* we remain responsible for that page.
*/
- ret = bio_readpage_error(bio, offset, page, start, end,
- mirror);
- if (ret == 0) {
+ if (!btrfs_submit_read_repair(inode, bio, offset, page,
+ start - page_offset(page),
+ start, end, mirror,
+ btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
offset += len;
continue;
@@ -2916,25 +3011,22 @@
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
- * @tree: tree so we can call our merge_bio hook
* @wbc: optional writeback control for io accounting
* @page: page to add to the bio
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
* @size: portion of page that we want to write
* @offset: starting offset in the page
- * @bdev: attach newly created bios to this bdev
* @bio_ret: must be valid pointer, newly allocated bio will be stored there
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
*/
-static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
+static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
struct page *page, u64 offset,
size_t size, unsigned long pg_offset,
- struct block_device *bdev,
struct bio **bio_ret,
bio_end_io_t end_io_func,
int mirror_num,
@@ -2946,6 +3038,7 @@
struct bio *bio;
size_t page_size = min_t(size_t, size, PAGE_SIZE);
sector_t sector = offset >> 9;
+ struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
ASSERT(bio_ret);
@@ -2959,7 +3052,6 @@
else
contig = bio_end_sector(bio) == sector;
- ASSERT(tree->ops);
if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
can_merge = false;
@@ -2980,13 +3072,16 @@
}
bio = btrfs_bio_alloc(offset);
- bio_set_dev(bio, bdev);
bio_add_page(bio, page, page_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
bio->bi_opf = opf;
if (wbc) {
+ struct block_device *bdev;
+
+ bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
+ bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
wbc_account_cgroup_owner(wbc, page, page_size);
}
@@ -2999,28 +3094,21 @@
static void attach_extent_buffer_page(struct extent_buffer *eb,
struct page *page)
{
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, (unsigned long)eb);
- } else {
+ if (!PagePrivate(page))
+ attach_page_private(page, eb);
+ else
WARN_ON(page->private != (unsigned long)eb);
- }
}
void set_page_extent_mapped(struct page *page)
{
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, EXTENT_PAGE_PRIVATE);
- }
+ if (!PagePrivate(page))
+ attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
}
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
- u64 start, u64 len, get_extent_t *get_extent,
- struct extent_map **em_cached)
+ u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
@@ -3036,7 +3124,7 @@
*em_cached = NULL;
}
- em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
@@ -3051,13 +3139,9 @@
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct extent_io_tree *tree,
- struct page *page,
- get_extent_t *get_extent,
- struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags, unsigned int read_flags,
- u64 *prev_em_start)
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -3068,7 +3152,6 @@
u64 block_start;
u64 cur_end;
struct extent_map *em;
- struct block_device *bdev;
int ret = 0;
int nr = 0;
size_t pg_offset = 0;
@@ -3076,6 +3159,7 @@
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
set_page_extent_mapped(page);
@@ -3119,7 +3203,7 @@
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, get_extent, em_cached);
+ end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
unlock_extent(tree, cur, end);
@@ -3145,7 +3229,6 @@
offset = em->block_start + extent_offset;
disk_io_size = iosize;
}
- bdev = em->bdev;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
@@ -3233,10 +3316,10 @@
continue;
}
- ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
+ ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
- pg_offset, bdev, bio,
- end_bio_extent_readpage, mirror_num,
+ pg_offset, bio,
+ end_bio_extent_readpage, 0,
*bio_flags,
this_bio_flag,
force_bio_submit);
@@ -3260,8 +3343,7 @@
return ret;
}
-static inline void contiguous_readpages(struct extent_io_tree *tree,
- struct page *pages[], int nr_pages,
+static inline void contiguous_readpages(struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
struct bio **bio,
@@ -3271,48 +3353,15 @@
struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
+ btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
+ REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
-static int __extent_read_full_page(struct extent_io_tree *tree,
- struct page *page,
- get_extent_t *get_extent,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
- unsigned int read_flags)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- int ret;
-
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
-
- ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
- bio_flags, read_flags, NULL);
- return ret;
-}
-
-int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
- int ret;
-
- ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
- &bio_flags, 0);
- if (bio)
- ret = submit_one_bio(bio, mirror_num, bio_flags);
- return ret;
-}
-
static void update_nr_written(struct writeback_control *wbc,
unsigned long nr_written)
{
@@ -3329,7 +3378,7 @@
* This returns 0 if all went well (page still locked)
* This returns < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int writepage_delalloc(struct inode *inode,
+static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
u64 delalloc_start, unsigned long *nr_written)
{
@@ -3342,7 +3391,7 @@
while (delalloc_end < page_end) {
- found = find_lock_delalloc_range(inode, page,
+ found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
if (!found) {
@@ -3359,8 +3408,7 @@
* started, so we don't want to return > 0 unless
* things are going well.
*/
- ret = ret < 0 ? ret : -EIO;
- goto done;
+ return ret < 0 ? ret : -EIO;
}
/*
* delalloc_end is already one less than the total length, so
@@ -3392,10 +3440,7 @@
return 1;
}
- ret = 0;
-
-done:
- return ret;
+ return 0;
}
/*
@@ -3406,15 +3451,15 @@
* 0 if all went well (page still locked)
* < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
struct writeback_control *wbc,
struct extent_page_data *epd,
loff_t i_size,
unsigned long nr_written,
- unsigned int write_flags, int *nr_ret)
+ int *nr_ret)
{
- struct extent_io_tree *tree = epd->tree;
+ struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
u64 end;
@@ -3423,21 +3468,17 @@
u64 block_start;
u64 iosize;
struct extent_map *em;
- struct block_device *bdev;
size_t pg_offset = 0;
size_t blocksize;
int ret = 0;
int nr = 0;
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
ret = btrfs_writepage_cow_fixup(page, start, page_end);
if (ret) {
/* Fixup worker will requeue */
- if (ret == -EBUSY)
- wbc->pages_skipped++;
- else
- redirty_page_for_writepage(wbc, page);
-
+ redirty_page_for_writepage(wbc, page);
update_nr_written(wbc, nr_written);
unlock_page(page);
return 1;
@@ -3450,12 +3491,7 @@
update_nr_written(wbc, nr_written + 1);
end = page_end;
- if (i_size <= start) {
- btrfs_writepage_endio_finish_ordered(page, start, page_end, 1);
- goto done;
- }
-
- blocksize = inode->i_sb->s_blocksize;
+ blocksize = inode->vfs_inode.i_sb->s_blocksize;
while (cur <= end) {
u64 em_end;
@@ -3466,8 +3502,7 @@
page_end, 1);
break;
}
- em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
- end - cur + 1, 1);
+ em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
ret = PTR_ERR_OR_ZERO(em);
@@ -3481,7 +3516,6 @@
iosize = min(em_end - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
offset = em->block_start + extent_offset;
- bdev = em->bdev;
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
free_extent_map(em);
@@ -3493,22 +3527,11 @@
*/
if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
- /*
- * end_io notification does not happen here for
- * compressed extents
- */
- if (!compressed)
- btrfs_writepage_endio_finish_ordered(page, cur,
- cur + iosize - 1,
- 1);
- else if (compressed) {
- /* we don't want to end_page_writeback on
- * a compressed extent. this happens
- * elsewhere
- */
+ if (compressed)
nr++;
- }
-
+ else
+ btrfs_writepage_endio_finish_ordered(page, cur,
+ cur + iosize - 1, 1);
cur += iosize;
pg_offset += iosize;
continue;
@@ -3516,14 +3539,14 @@
btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
- btrfs_err(BTRFS_I(inode)->root->fs_info,
+ btrfs_err(inode->root->fs_info,
"page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
page, offset, iosize, pg_offset,
- bdev, &epd->bio,
+ &epd->bio,
end_bio_extent_writepage,
0, 0, 0, false);
if (ret) {
@@ -3536,7 +3559,6 @@
pg_offset += iosize;
nr++;
}
-done:
*nr_ret = nr;
return ret;
}
@@ -3558,14 +3580,11 @@
u64 page_end = start + PAGE_SIZE - 1;
int ret;
int nr = 0;
- size_t pg_offset = 0;
+ size_t pg_offset;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- unsigned int write_flags = 0;
unsigned long nr_written = 0;
- write_flags = wbc_to_write_flags(wbc);
-
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
@@ -3590,22 +3609,21 @@
flush_dcache_page(page);
}
- pg_offset = 0;
-
set_page_extent_mapped(page);
if (!epd->extent_locked) {
- ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
+ &nr_written);
if (ret == 1)
- goto done_unlocked;
+ return 0;
if (ret)
goto done;
}
- ret = __extent_writepage_io(inode, page, wbc, epd,
- i_size, nr_written, write_flags, &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
+ nr_written, &nr);
if (ret == 1)
- goto done_unlocked;
+ return 0;
done:
if (nr == 0) {
@@ -3620,9 +3638,6 @@
unlock_page(page);
ASSERT(ret <= 0);
return ret;
-
-done_unlocked:
- return 0;
}
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
@@ -3755,6 +3770,12 @@
return;
/*
+ * A read may stumble upon this buffer later, make sure that it gets an
+ * error and knows there was an error.
+ */
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ /*
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
@@ -3851,9 +3872,6 @@
struct writeback_control *wbc,
struct extent_page_data *epd)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct block_device *bdev = fs_info->fs_devices->latest_bdev;
- struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
u32 nritems;
int i, num_pages;
@@ -3886,8 +3904,8 @@
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
- p, offset, PAGE_SIZE, 0, bdev,
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
+ p, offset, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
@@ -3919,11 +3937,9 @@
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
struct extent_buffer *eb, *prev_eb = NULL;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4245,7 +4261,6 @@
int ret;
struct extent_page_data epd = {
.bio = NULL,
- .tree = &BTRFS_I(page->mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4267,14 +4282,12 @@
{
int ret = 0;
struct address_space *mapping = inode->i_mapping;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *page;
unsigned long nr_pages = (end - start + PAGE_SIZE) >>
PAGE_SHIFT;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
@@ -4283,8 +4296,12 @@
.nr_to_write = nr_pages * 2,
.range_start = start,
.range_end = end + 1,
+ /* We're called from an async helper function */
+ .punt_to_cgroup = 1,
+ .no_cgroup_owner = 1,
};
+ wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
while (start <= end) {
page = find_get_page(mapping, start >> PAGE_SHIFT);
if (clear_page_dirty_for_io(page))
@@ -4299,11 +4316,12 @@
}
ASSERT(ret <= 0);
- if (ret < 0) {
+ if (ret == 0)
+ ret = flush_write_bio(&epd);
+ else
end_write_bio(&epd, ret);
- return ret;
- }
- ret = flush_write_bio(&epd);
+
+ wbc_detach_inode(&wbc_writepages);
return ret;
}
@@ -4313,7 +4331,6 @@
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
- .tree = &BTRFS_I(mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4328,52 +4345,32 @@
return ret;
}
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages)
+void extent_readahead(struct readahead_control *rac)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
- int nr = 0;
u64 prev_em_start = (u64)-1;
+ int nr;
- while (!list_empty(pages)) {
- u64 contig_end = 0;
+ while ((nr = readahead_page_batch(rac, pagepool))) {
+ u64 contig_start = page_offset(pagepool[0]);
+ u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
- for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
- struct page *page = lru_to_page(pages);
+ ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
- prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping))) {
- put_page(page);
- break;
- }
-
- pagepool[nr++] = page;
- contig_end = page_offset(page) + PAGE_SIZE - 1;
- }
-
- if (nr) {
- u64 contig_start = page_offset(pagepool[0]);
-
- ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
-
- contiguous_readpages(tree, pagepool, nr, contig_start,
- contig_end, &em_cached, &bio, &bio_flags,
- &prev_em_start);
- }
+ contiguous_readpages(pagepool, nr, contig_start, contig_end,
+ &em_cached, &bio, &bio_flags, &prev_em_start);
}
if (em_cached)
free_extent_map(em_cached);
- if (bio)
- return submit_one_bio(bio, 0, bio_flags);
- return 0;
+ if (bio) {
+ if (submit_one_bio(bio, 0, bio_flags))
+ return;
+ }
}
/*
@@ -4452,6 +4449,9 @@
page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
+ struct btrfs_fs_info *fs_info;
+ u64 cur_gen;
+
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
@@ -4476,13 +4476,33 @@
* extra reference on the em.
*/
if (list_empty(&em->list) ||
- test_bit(EXTENT_FLAG_LOGGING, &em->flags)) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &btrfs_inode->runtime_flags);
- remove_extent_mapping(map, em);
- /* once for the rb tree */
- free_extent_map(em);
- }
+ test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it
+ * only if its generation is older then the current one,
+ * in which case we don't need it for a fast fsync.
+ * Otherwise don't remove it, we could be racing with an
+ * ongoing fast fsync that could miss the new extent.
+ */
+ fs_info = btrfs_inode->root->fs_info;
+ spin_lock(&fs_info->trans_lock);
+ cur_gen = fs_info->generation;
+ spin_unlock(&fs_info->trans_lock);
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower then the current generation, so there
+ * is no need to set the full fsync flag on the inode (it
+ * hurts the fsync performance for workloads with a data
+ * size that exceeds or is close to the system's memory).
+ */
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
next:
start = extent_map_end(em);
write_unlock(&map->lock);
@@ -4500,7 +4520,7 @@
* helper function for fiemap, which doesn't want to see any holes.
* This maps until we find something past 'last'
*/
-static struct extent_map *get_extent_skip_holes(struct inode *inode,
+static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -4515,7 +4535,7 @@
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
+ em = btrfs_get_extent_fiemap(inode, offset, len);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4644,8 +4664,8 @@
return ret;
}
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
{
int ret = 0;
u64 off;
@@ -4655,12 +4675,12 @@
u64 last;
u64 last_for_get_extent = 0;
u64 disko = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
struct ulist *roots;
struct ulist *tmp_ulist;
@@ -4696,8 +4716,8 @@
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
goto out_free_ulist;
} else {
@@ -4711,7 +4731,7 @@
found_type = found_key.type;
/* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
@@ -4737,7 +4757,7 @@
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ lock_extent_bits(&inode->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent);
@@ -4806,8 +4826,7 @@
* then we're just getting a count and we can skip the
* lookup stuff.
*/
- ret = btrfs_check_shared(root,
- btrfs_ino(BTRFS_I(inode)),
+ ret = btrfs_check_shared(root, btrfs_ino(inode),
bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
@@ -4851,7 +4870,7 @@
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ unlock_extent_cached(&inode->io_tree, start, start + len - 1,
&cached_state);
out_free_ulist:
@@ -4863,11 +4882,10 @@
static void __free_extent_buffer(struct extent_buffer *eb)
{
- btrfs_leak_debug_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
-int extent_buffer_under_io(struct extent_buffer *eb)
+int extent_buffer_under_io(const struct extent_buffer *eb)
{
return (atomic_read(&eb->io_pages) ||
test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
@@ -4909,10 +4927,7 @@
* We need to make sure we haven't be attached
* to a new eb.
*/
- ClearPagePrivate(page);
- set_page_private(page, 0);
- /* One for the page private */
- put_page(page);
+ detach_page_private(page);
}
if (mapped)
@@ -4929,6 +4944,7 @@
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
+ btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
__free_extent_buffer(eb);
}
@@ -4946,11 +4962,12 @@
rwlock_init(&eb->lock);
atomic_set(&eb->blocking_readers, 0);
eb->blocking_writers = 0;
- eb->lock_nested = false;
+ eb->lock_recursed = false;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
- btrfs_leak_debug_add(&eb->leak_list, &buffers);
+ btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
+ &fs_info->allocated_ebs);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
@@ -4973,7 +4990,7 @@
return eb;
}
-struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
int i;
struct page *p;
@@ -5300,6 +5317,7 @@
}
static int release_extent_buffer(struct extent_buffer *eb)
+ __releases(&eb->refs_lock)
{
lockdep_assert_held(&eb->refs_lock);
@@ -5318,6 +5336,7 @@
spin_unlock(&eb->refs_lock);
}
+ btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -5380,7 +5399,7 @@
release_extent_buffer(eb);
}
-void clear_extent_buffer_dirty(struct extent_buffer *eb)
+void clear_extent_buffer_dirty(const struct extent_buffer *eb)
{
int i;
int num_pages;
@@ -5475,7 +5494,6 @@
unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
- struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -5528,20 +5546,19 @@
}
ClearPageError(page);
- err = __extent_read_full_page(tree, page,
- btree_get_extent, &bio,
- mirror_num, &bio_flags,
- REQ_META);
+ err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ page, page_offset(page), PAGE_SIZE, 0,
+ &bio, end_bio_extent_readpage,
+ mirror_num, 0, 0, false);
if (err) {
- ret = err;
/*
- * We use &bio in above __extent_read_full_page,
- * so we ensure that if it returns error, the
- * current page fails to add itself to bio and
- * it's been unlocked.
- *
- * We must dec io_pages by ourselves.
+ * We failed to submit the bio so it's the
+ * caller's responsibility to perform cleanup
+ * i.e unlock page/set error bit.
*/
+ ret = err;
+ SetPageError(page);
+ unlock_page(page);
atomic_dec(&eb->io_pages);
}
} else {
@@ -5576,6 +5593,36 @@
return ret;
}
+static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ btrfs_warn(eb->fs_info,
+ "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ eb->start, eb->len, start, len);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+
+ return true;
+}
+
+/*
+ * Check if the [start, start + len) range is valid before reading/writing
+ * the eb.
+ * NOTE: @start and @len are offset inside the eb, not logical address.
+ *
+ * Caller should not touch the dst/src memory if this function returns error.
+ */
+static inline int check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
+{
+ unsigned long offset;
+
+ /* start, start + len should not go beyond eb->len nor overflow */
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
+ return report_eb_range(eb, start, len);
+
+ return false;
+}
+
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
@@ -5584,17 +5631,12 @@
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
- if (start + len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, len);
- memset(dst, 0, len);
+ if (check_eb_range(eb, start, len))
return;
- }
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
@@ -5619,21 +5661,20 @@
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (probe_user_write(dst, kaddr + offset, cur)) {
+ if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
@@ -5647,48 +5688,6 @@
return ret;
}
-/*
- * return 0 if the item is found within a page.
- * return 1 if the item spans two pages.
- * return -EINVAL otherwise.
- */
-int map_private_extent_buffer(const struct extent_buffer *eb,
- unsigned long start, unsigned long min_len,
- char **map, unsigned long *map_start,
- unsigned long *map_len)
-{
- size_t offset;
- char *kaddr;
- struct page *p;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
- unsigned long end_i = (start_offset + start + min_len - 1) >>
- PAGE_SHIFT;
-
- if (start + min_len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, min_len);
- return -EINVAL;
- }
-
- if (i != end_i)
- return 1;
-
- if (i == 0) {
- offset = start_offset;
- *map_start = 0;
- } else {
- offset = 0;
- *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
- }
-
- p = eb->pages[i];
- kaddr = page_address(p);
- *map = kaddr + offset;
- *map_len = PAGE_SIZE - offset;
- return 0;
-}
-
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
@@ -5697,14 +5696,13 @@
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
int ret = 0;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return -EINVAL;
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
@@ -5724,7 +5722,7 @@
return ret;
}
-void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *srcv)
{
char *kaddr;
@@ -5735,7 +5733,7 @@
BTRFS_FSID_SIZE);
}
-void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
+void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
{
char *kaddr;
@@ -5745,7 +5743,7 @@
BTRFS_FSID_SIZE);
}
-void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
unsigned long start, unsigned long len)
{
size_t cur;
@@ -5753,13 +5751,12 @@
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
@@ -5776,20 +5773,19 @@
}
}
-void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len)
{
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
- offset = offset_in_page(start_offset + start);
+ offset = offset_in_page(start);
while (len > 0) {
page = eb->pages[i];
@@ -5805,8 +5801,8 @@
}
}
-void copy_extent_buffer_full(struct extent_buffer *dst,
- struct extent_buffer *src)
+void copy_extent_buffer_full(const struct extent_buffer *dst,
+ const struct extent_buffer *src)
{
int i;
int num_pages;
@@ -5819,7 +5815,8 @@
page_address(src->pages[i]));
}
-void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+void copy_extent_buffer(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
@@ -5828,12 +5825,15 @@
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = offset_in_page(dst->start);
- unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
+ unsigned long i = dst_offset >> PAGE_SHIFT;
+
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(src, src_offset, len))
+ return;
WARN_ON(src->len != dst_len);
- offset = offset_in_page(start_offset + dst_offset);
+ offset = offset_in_page(dst_offset);
while (len > 0) {
page = dst->pages[i];
@@ -5864,12 +5864,11 @@
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
*/
-static inline void eb_bitmap_offset(struct extent_buffer *eb,
+static inline void eb_bitmap_offset(const struct extent_buffer *eb,
unsigned long start, unsigned long nr,
unsigned long *page_index,
size_t *page_offset)
{
- size_t start_offset = offset_in_page(eb->start);
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -5878,7 +5877,7 @@
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start_offset + start + byte_offset;
+ offset = start + byte_offset;
*page_index = offset >> PAGE_SHIFT;
*page_offset = offset_in_page(offset);
@@ -5890,7 +5889,7 @@
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number to test
*/
-int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
u8 *kaddr;
@@ -5912,7 +5911,7 @@
* @pos: bit number of the first bit
* @len: number of bits to set
*/
-void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len)
{
u8 *kaddr;
@@ -5954,8 +5953,9 @@
* @pos: bit number of the first bit
* @len: number of bits to clear
*/
-void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
- unsigned long pos, unsigned long len)
+void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ unsigned long start, unsigned long pos,
+ unsigned long len)
{
u8 *kaddr;
struct page *page;
@@ -6016,36 +6016,26 @@
memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
-void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len)
+void memcpy_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
- size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu dst len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu dst len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
while (len > 0) {
- dst_off_in_page = offset_in_page(start_offset + dst_offset);
- src_off_in_page = offset_in_page(start_offset + src_offset);
+ dst_off_in_page = offset_in_page(dst_offset);
+ src_off_in_page = offset_in_page(src_offset);
- dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
- src_i = (start_offset + src_offset) >> PAGE_SHIFT;
+ dst_i = dst_offset >> PAGE_SHIFT;
+ src_i = src_offset >> PAGE_SHIFT;
cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
@@ -6061,41 +6051,31 @@
}
}
-void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len)
+void memmove_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_end = dst_offset + len - 1;
unsigned long src_end = src_offset + len - 1;
- size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
}
while (len > 0) {
- dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
- src_i = (start_offset + src_end) >> PAGE_SHIFT;
+ dst_i = dst_end >> PAGE_SHIFT;
+ src_i = src_end >> PAGE_SHIFT;
- dst_off_in_page = offset_in_page(start_offset + dst_end);
- src_off_in_page = offset_in_page(start_offset + src_end);
+ dst_off_in_page = offset_in_page(dst_end);
+ src_off_in_page = offset_in_page(src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index fcf1807..f39d02e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -5,39 +5,9 @@
#include <linux/rbtree.h>
#include <linux/refcount.h>
+#include <linux/fiemap.h>
#include "ulist.h"
-/* bits for the extent state */
-#define EXTENT_DIRTY (1U << 0)
-#define EXTENT_UPTODATE (1U << 1)
-#define EXTENT_LOCKED (1U << 2)
-#define EXTENT_NEW (1U << 3)
-#define EXTENT_DELALLOC (1U << 4)
-#define EXTENT_DEFRAG (1U << 5)
-#define EXTENT_BOUNDARY (1U << 6)
-#define EXTENT_NODATASUM (1U << 7)
-#define EXTENT_CLEAR_META_RESV (1U << 8)
-#define EXTENT_NEED_WAIT (1U << 9)
-#define EXTENT_DAMAGED (1U << 10)
-#define EXTENT_NORESERVE (1U << 11)
-#define EXTENT_QGROUP_RESERVED (1U << 12)
-#define EXTENT_CLEAR_DATA_RESV (1U << 13)
-#define EXTENT_DELALLOC_NEW (1U << 14)
-#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
- EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
-
-/*
- * Redefined bits above which are used only in the device allocation tree,
- * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
- * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
- * manipulation functions
- */
-#define CHUNK_ALLOCATED EXTENT_DIRTY
-#define CHUNK_TRIMMED EXTENT_DEFRAG
-#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \
- CHUNK_TRIMMED)
-
/*
* flags for bio submission. The high bits indicate the compression
* type for this bio
@@ -91,70 +61,19 @@
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
-struct extent_state;
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
struct io_failure_record;
+struct extent_io_tree;
+typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ unsigned long bio_flags);
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct bio *bio, u64 bio_offset);
-struct extent_io_ops {
- /*
- * The following callbacks must be always defined, the function
- * pointer will be called unconditionally.
- */
- blk_status_t (*submit_bio_hook)(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
- int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror);
-};
-
-enum {
- IO_TREE_FS_INFO_FREED_EXTENTS0,
- IO_TREE_FS_INFO_FREED_EXTENTS1,
- IO_TREE_INODE_IO,
- IO_TREE_INODE_IO_FAILURE,
- IO_TREE_RELOC_BLOCKS,
- IO_TREE_TRANS_DIRTY_PAGES,
- IO_TREE_ROOT_DIRTY_LOG_PAGES,
- IO_TREE_SELFTEST,
-};
-
-struct extent_io_tree {
- struct rb_root state;
- struct btrfs_fs_info *fs_info;
- void *private_data;
- u64 dirty_bytes;
- bool track_uptodate;
-
- /* Who owns this io tree, should be one of IO_TREE_* */
- u8 owner;
-
- spinlock_t lock;
- const struct extent_io_ops *ops;
-};
-
-struct extent_state {
- u64 start;
- u64 end; /* inclusive */
- struct rb_node rb_node;
-
- /* ADD NEW ELEMENTS AFTER THIS */
- wait_queue_head_t wq;
- refcount_t refs;
- unsigned state;
-
- struct io_failure_record *failrec;
-
-#ifdef CONFIG_BTRFS_DEBUG
- struct list_head leak_list;
-#endif
-};
-
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
@@ -171,7 +90,7 @@
int blocking_writers;
atomic_t blocking_readers;
- bool lock_nested;
+ bool lock_recursed;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
short log_index;
@@ -256,157 +175,17 @@
struct extent_map_tree;
typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset,
- u64 start, u64 len,
- int create);
+ struct page *page, size_t pg_offset,
+ u64 start, u64 len);
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data);
-void extent_io_tree_release(struct extent_io_tree *tree);
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, NULL);
-}
-
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num);
-int __init extent_io_init(void);
-void __cold extent_io_exit(void);
-
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end,
- u64 max_bytes, unsigned bits, int contig);
-
-void free_extent_state(struct extent_state *state);
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled,
- struct extent_state *cached_state);
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached);
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask,
- struct extent_changeset *changeset);
-
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
-}
-
-static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_NOFS, NULL);
-}
-
-static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
- u64 start, u64 end, struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_ATOMIC, NULL);
-}
-
-static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
-{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
-}
-
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits);
-
-static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
-{
- return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
-}
-
-static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, GFP_NOFS, NULL);
-}
-
-static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
-}
-
-static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
-{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, cached);
-}
-
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
- struct extent_state **cached_state);
-
-static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned int extra_bits,
- struct extent_state **cached_state)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- NULL, cached_state, GFP_NOFS);
-}
-
-static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, GFP_NOFS);
-}
-
-static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
- u64 end)
-{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
- GFP_NOFS);
-}
-
-static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
- cached_state, mask);
-}
-
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
- struct extent_state **cached_state);
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits);
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset);
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags);
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@@ -414,10 +193,9 @@
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages);
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len);
+void extent_readahead(struct readahead_control *rac);
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -426,7 +204,7 @@
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
-struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
+struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
void free_extent_buffer(struct extent_buffer *eb);
@@ -444,12 +222,7 @@
(eb->start >> PAGE_SHIFT);
}
-static inline void extent_buffer_get(struct extent_buffer *eb)
-{
- atomic_inc(&eb->refs);
-}
-
-static inline int extent_buffer_uptodate(struct extent_buffer *eb)
+static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
@@ -462,40 +235,40 @@
int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dst, unsigned long start,
unsigned long len);
-void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src);
-void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
+void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *src);
-void write_extent_buffer(struct extent_buffer *eb, const void *src,
+void write_extent_buffer(const struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
-void copy_extent_buffer_full(struct extent_buffer *dst,
- struct extent_buffer *src);
-void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+void copy_extent_buffer_full(const struct extent_buffer *dst,
+ const struct extent_buffer *src);
+void copy_extent_buffer(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
-void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len);
-void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len);
-void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+void memcpy_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len);
+void memmove_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
-int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len);
+int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long pos);
-void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
-void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
- unsigned long pos, unsigned long len);
-void clear_extent_buffer_dirty(struct extent_buffer *eb);
+void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ unsigned long start, unsigned long pos,
+ unsigned long len);
+void clear_extent_buffer_dirty(const struct extent_buffer *eb);
bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
-int extent_buffer_under_io(struct extent_buffer *eb);
-int map_private_extent_buffer(const struct extent_buffer *eb,
- unsigned long offset, unsigned long min_len,
- char **map, unsigned long *map_start,
- unsigned long *map_len);
+int extent_buffer_under_io(const struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
@@ -510,12 +283,8 @@
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
-int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num);
+int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
/*
* When IO fails, either with EIO or csum verification fails, we
@@ -537,19 +306,12 @@
};
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
- u64 end);
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret);
-bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
- struct io_failure_record *failrec, int fail_mirror);
-struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
- struct io_failure_record *failrec,
- struct page *page, int pg_offset, int icsum,
- bio_end_io_t *endio_func, void *data);
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec);
+blk_status_t btrfs_submit_read_repair(struct inode *inode,
+ struct bio *failed_bio, u64 phy_offset,
+ struct page *page, unsigned int pgoff,
+ u64 start, u64 end, int failed_mirror,
+ submit_bio_hook_t *submit_bio_hook);
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
@@ -558,4 +320,10 @@
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
+#ifdef CONFIG_BTRFS_DEBUG
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info);
+#else
+#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0)
+#endif
+
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 043eec6..bd6229f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -214,9 +214,13 @@
ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
prev->block_start != EXTENT_MAP_DELALLOC);
+ if (prev->map_lookup || next->map_lookup)
+ ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
+ test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
- prev->bdev == next->bdev &&
+ prev->map_lookup == next->map_lookup &&
((next->block_start == EXTENT_MAP_HOLE &&
prev->block_start == EXTENT_MAP_HOLE) ||
(next->block_start == EXTENT_MAP_INLINE &&
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 473f039..8e21733 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -42,15 +42,8 @@
u64 block_len;
u64 generation;
unsigned long flags;
- union {
- struct block_device *bdev;
-
- /*
- * used for chunk mappings
- * flags & EXTENT_FLAG_FS_MAPPING must be set
- */
- struct map_lookup *map_lookup;
- };
+ /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
+ struct map_lookup *map_lookup;
refcount_t refs;
unsigned int compress_type;
struct list_head list;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 61b82c6..2de1d82 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -23,6 +23,97 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
+/**
+ * @inode - the inode we want to update the disk_i_size for
+ * @new_i_size - the i_size we want to set to, 0 if we use i_size
+ *
+ * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read()
+ * returns as it is perfectly fine with a file that has holes without hole file
+ * extent items.
+ *
+ * However without NO_HOLES we need to only return the area that is contiguous
+ * from the 0 offset of the file. Otherwise we could end up adjust i_size up
+ * to an extent that has a gap in between.
+ *
+ * Finally new_i_size should only be set in the case of truncate where we're not
+ * ready to use i_size_read() as the limiter yet.
+ */
+void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ u64 start, end, i_size;
+ int ret;
+
+ i_size = new_i_size ?: i_size_read(inode);
+ if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ BTRFS_I(inode)->disk_i_size = i_size;
+ return;
+ }
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0,
+ &start, &end, EXTENT_DIRTY);
+ if (!ret && start == 0)
+ i_size = min(i_size, end + 1);
+ else
+ i_size = 0;
+ BTRFS_I(inode)->disk_i_size = i_size;
+ spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+/**
+ * @inode - the inode we're modifying
+ * @start - the start file offset of the file extent we've inserted
+ * @len - the logical length of the file extent item
+ *
+ * Call when we are inserting a new file extent where there was none before.
+ * Does not need to call this in the case where we're replacing an existing file
+ * extent, however if not sure it's fine to call this multiple times.
+ *
+ * The start and len must match the file extent item, so thus must be sectorsize
+ * aligned.
+ */
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len)
+{
+ if (len == 0)
+ return 0;
+
+ ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
+
+ if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+ return 0;
+ return set_extent_bits(&inode->file_extent_tree, start, start + len - 1,
+ EXTENT_DIRTY);
+}
+
+/**
+ * @inode - the inode we're modifying
+ * @start - the start file offset of the file extent we've inserted
+ * @len - the logical length of the file extent item
+ *
+ * Called when we drop a file extent, for example when we truncate. Doesn't
+ * need to be called for cases where we're replacing a file extent, like when
+ * we've COWed a file extent.
+ *
+ * The start and len must match the file extent item, so thus must be sectorsize
+ * aligned.
+ */
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len)
+{
+ if (len == 0)
+ return 0;
+
+ ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
+ len == (u64)-1);
+
+ if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+ return 0;
+ return clear_extent_bit(&inode->file_extent_tree, start,
+ start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
+}
+
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
u16 csum_size)
{
@@ -148,18 +239,30 @@
return ret;
}
-static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u64 logical_offset, u8 *dst, int dio)
+/**
+ * btrfs_lookup_bio_sums - Look up checksums for a bio.
+ * @inode: inode that the bio is for.
+ * @bio: bio to look up.
+ * @offset: Unless (u64)-1, look up checksums for this offset in the file.
+ * If (u64)-1, use the page offsets from the bio instead.
+ * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
+ * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
+ * NULL, the checksum buffer is allocated and returned in
+ * btrfs_io_bio(bio)->csum instead.
+ *
+ * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
+ */
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
+ u64 offset, u8 *dst)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio_vec bvec;
struct bvec_iter iter;
- struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_path *path;
+ const bool page_offsets = (offset == (u64)-1);
u8 *csum;
- u64 offset = 0;
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
@@ -175,6 +278,8 @@
nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
+ struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
GFP_NOFS);
@@ -205,18 +310,16 @@
}
disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
- if (dio)
- offset = logical_offset;
bio_for_each_segment(bvec, bio, iter) {
page_bytes_left = bvec.bv_len;
if (count)
goto next;
- if (!dio)
+ if (page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
- count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
- csum, nblocks);
+ count = btrfs_find_ordered_sum(BTRFS_I(inode), offset,
+ disk_bytenr, csum, nblocks);
if (count)
goto found;
@@ -286,18 +389,7 @@
WARN_ON_ONCE(count);
btrfs_free_path(path);
- return 0;
-}
-
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u8 *dst)
-{
- return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
-}
-
-blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
-{
- return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1);
+ return BLK_STS_OK;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -430,10 +522,10 @@
* means this bio can contains potentially discontigous bio vecs
* so the logical offset of each should be calculated separately.
*/
-blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 file_start, int contig)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
@@ -476,7 +568,18 @@
if (!ordered) {
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered); /* Logic error */
+ /*
+ * The bio range is not covered by any ordered extent,
+ * must be a code logic error.
+ */
+ if (unlikely(!ordered)) {
+ WARN(1, KERN_WARNING
+ "no ordered extent for root %llu ino %llu offset %llu\n",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), offset);
+ kvfree(sums);
+ return BLK_STS_IOERR;
+ }
}
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
@@ -484,8 +587,8 @@
- 1);
for (i = 0; i < nr_sectors; i++) {
- if (offset >= ordered->file_offset + ordered->len ||
- offset < ordered->file_offset) {
+ if (offset >= ordered->file_offset + ordered->num_bytes ||
+ offset < ordered->file_offset) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
@@ -509,13 +612,12 @@
index = 0;
}
- crypto_shash_init(shash);
data = kmap_atomic(bvec.bv_page);
- crypto_shash_update(shash, data + bvec.bv_offset
+ crypto_shash_digest(shash, data + bvec.bv_offset
+ (i * fs_info->sectorsize),
- fs_info->sectorsize);
+ fs_info->sectorsize,
+ sums->sums + index);
kunmap_atomic(data);
- crypto_shash_final(shash, (char *)(sums->sums + index));
index += csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
@@ -780,7 +882,7 @@
}
ret = PTR_ERR(item);
if (ret != -EFBIG && ret != -ENOENT)
- goto fail_unlock;
+ goto out;
if (ret == -EFBIG) {
u32 item_size;
@@ -818,14 +920,27 @@
}
/*
- * at this point, we know the tree has an item, but it isn't big
- * enough yet to put our csum in. Grow it
+ * At this point, we know the tree has a checksum item that ends at an
+ * offset matching the start of the checksum range we want to insert.
+ * We try to extend that item as much as possible and then add as many
+ * checksums to it as they fit.
+ *
+ * First check if the leaf has enough free space for at least one
+ * checksum. If it has go directly to the item extension code, otherwise
+ * release the path and do a search for insertion before the extension.
*/
+ if (btrfs_leaf_free_space(leaf) >= csum_size) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ csum_offset = (bytenr - found_key.offset) >>
+ fs_info->sb->s_blocksize_bits;
+ goto extend_csum;
+ }
+
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &file_key, path,
csum_size, 1);
if (ret < 0)
- goto fail_unlock;
+ goto out;
if (ret > 0) {
if (path->slots[0] == 0)
@@ -844,19 +959,13 @@
goto insert;
}
+extend_csum:
if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
csum_size) {
int extend_nr;
u64 tmp;
u32 diff;
- u32 free_space;
- if (btrfs_leaf_free_space(leaf) <
- sizeof(struct btrfs_item) + csum_size * 2)
- goto insert;
-
- free_space = btrfs_leaf_free_space(leaf) -
- sizeof(struct btrfs_item) - csum_size;
tmp = sums->len - total_bytes;
tmp >>= fs_info->sb->s_blocksize_bits;
WARN_ON(tmp < 1);
@@ -867,7 +976,7 @@
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
- diff = min(free_space, diff);
+ diff = min_t(u32, btrfs_leaf_free_space(leaf), diff);
diff /= csum_size;
diff *= csum_size;
@@ -898,9 +1007,9 @@
ins_size);
path->leave_spinning = 0;
if (ret < 0)
- goto fail_unlock;
+ goto out;
if (WARN_ON(ret != 0))
- goto fail_unlock;
+ goto out;
leaf = path->nodes[0];
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
@@ -930,9 +1039,6 @@
out:
btrfs_free_path(path);
return ret;
-
-fail_unlock:
- goto out;
}
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
@@ -951,21 +1057,9 @@
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
- em->bdev = fs_info->fs_devices->latest_bdev;
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
-
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- extent_end = extent_start +
- btrfs_file_extent_num_bytes(leaf, fi);
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- size_t size;
- size = btrfs_file_extent_ram_bytes(leaf, fi);
- extent_end = ALIGN(extent_start + size,
- fs_info->sectorsize);
- }
-
+ extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -1012,3 +1106,30 @@
root->root_key.objectid);
}
}
+
+/*
+ * Returns the end offset (non inclusive) of the file extent item the given path
+ * points to. If it points to an inline extent, the returned offset is rounded
+ * up to the sector size.
+ */
+u64 btrfs_file_extent_end(const struct btrfs_path *path)
+{
+ const struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ end = btrfs_file_extent_ram_bytes(leaf, fi);
+ end = ALIGN(key.offset + end, leaf->fs_info->sectorsize);
+ } else {
+ end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ }
+
+ return end;
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1279359..f59ec55 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -27,6 +27,7 @@
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
+#include "reflink.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -274,34 +275,23 @@
{
struct btrfs_root *inode_root;
struct inode *inode;
- struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
int num_defrag;
- int index;
int ret;
/* get the inode */
- key.objectid = defrag->root;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
if (IS_ERR(inode_root)) {
ret = PTR_ERR(inode_root);
goto cleanup;
}
- key.objectid = defrag->ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@@ -337,7 +327,6 @@
iput(inode);
return 0;
cleanup:
- srcu_read_unlock(&fs_info->subvol_srcu, index);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
}
@@ -463,47 +452,6 @@
}
}
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
- const u64 start,
- const u64 len,
- struct extent_state **cached_state)
-{
- u64 search_start = start;
- const u64 end = start + len - 1;
-
- while (search_start < end) {
- const u64 search_len = end - search_start + 1;
- struct extent_map *em;
- u64 em_len;
- int ret = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, search_start,
- search_len, 0);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- if (em->block_start != EXTENT_MAP_HOLE)
- goto next;
-
- em_len = em->len;
- if (em->start < search_start)
- em_len -= search_start - em->start;
- if (em_len > search_len)
- em_len = search_len;
-
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW,
- NULL, cached_state, GFP_NOFS);
-next:
- search_start = extent_map_end(em);
- free_extent_map(em);
- if (ret)
- return ret;
- }
- return 0;
-}
-
/*
* after copy_from_user, pages need to be dirtied and we need to make
* sure holes are created between the current EOF and the start of
@@ -512,18 +460,18 @@
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int err = 0;
int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
u64 end_pos = pos + write_bytes;
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
start_pos = pos & ~((u64) fs_info->sectorsize - 1);
@@ -536,28 +484,10 @@
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, cached);
- if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
- if (start_pos >= isize &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
- /*
- * There can't be any extents following eof in this case
- * so just set the delalloc new bit for the range
- * directly.
- */
- extra_bits |= EXTENT_DELALLOC_NEW;
- } else {
- err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
- start_pos,
- num_bytes, cached);
- if (err)
- return err;
- }
- }
-
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
if (err)
@@ -576,7 +506,7 @@
* at this time.
*/
if (end_pos > isize)
- i_size_write(inode, end_pos);
+ i_size_write(&inode->vfs_inode, end_pos);
return 0;
}
@@ -667,7 +597,6 @@
}
split->generation = gen;
- split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
replace_extent_mapping(em_tree, em, split, modified);
@@ -680,7 +609,6 @@
split->start = start + len;
split->len = em->start + em->len - (start + len);
- split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
split->generation = gen;
@@ -745,7 +673,7 @@
* is deleted from the tree.
*/
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
+ struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path, u64 start, u64 end,
u64 *drop_end, int drop_cache,
int replace_extent,
@@ -758,7 +686,8 @@
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct inode *vfs_inode = &inode->vfs_inode;
+ u64 ino = btrfs_ino(inode);
u64 search_start = start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
@@ -776,13 +705,12 @@
int leafs_visited = 0;
if (drop_cache)
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
- if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
+ if (start >= inode->disk_i_size && !replace_extent)
modify_tree = 0;
- update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == fs_info->tree_root);
+ update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -949,7 +877,7 @@
extent_end - end);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, end - key.offset);
+ inode_sub_bytes(vfs_inode, end - key.offset);
break;
}
@@ -969,7 +897,7 @@
start - key.offset);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, extent_end - start);
+ inode_sub_bytes(vfs_inode, extent_end - start);
if (end == extent_end)
break;
@@ -993,7 +921,7 @@
if (update_refs &&
extent_type == BTRFS_FILE_EXTENT_INLINE) {
- inode_sub_bytes(inode,
+ inode_sub_bytes(vfs_inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
@@ -1007,7 +935,7 @@
key.offset - extent_offset);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
- inode_sub_bytes(inode,
+ inode_sub_bytes(vfs_inode,
extent_end - key.offset);
}
@@ -1071,11 +999,7 @@
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
+ setup_items_for_insert(root, path, &key, &extent_item_size, 1);
*key_inserted = 1;
}
@@ -1096,8 +1020,8 @@
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
- drop_cache, 0, 0, NULL);
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
+ end, NULL, drop_cache, 0, 0, NULL);
btrfs_free_path(path);
return ret;
}
@@ -1491,9 +1415,7 @@
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
- last_pos = start_pos
- + round_up(pos + write_bytes - start_pos,
- fs_info->sectorsize) - 1;
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
@@ -1503,7 +1425,7 @@
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
- ordered->file_offset + ordered->len > start_pos &&
+ ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
unlock_extent_cached(&inode->io_tree, start_pos,
last_pos, cached_state);
@@ -1511,8 +1433,7 @@
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_start_ordered_extent(&inode->vfs_inode,
- ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
@@ -1546,8 +1467,8 @@
return ret;
}
-int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
+static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1555,33 +1476,87 @@
u64 num_bytes;
int ret;
- ret = btrfs_start_write_no_snapshotting(root);
- if (!ret)
+ if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return 0;
+
+ if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
-
- btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
- lockend, NULL);
-
num_bytes = lockend - lockstart + 1;
+
+ if (nowait) {
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
+ return -EAGAIN;
+
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ num_bytes);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ } else {
+ btrfs_lock_and_flush_ordered_range(inode, lockstart,
+ lockend, NULL);
+ }
+
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
- btrfs_end_write_no_snapshotting(root);
+ if (!nowait)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
}
-
+out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend);
return ret;
}
+static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ return check_can_nocow(inode, pos, write_bytes, true);
+}
+
+/*
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
+ *
+ * @pos: File offset
+ * @write_bytes: The length to write, will be updated to the nocow writeable
+ * range
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks.
+ *
+ * Return:
+ * >0 and update @write_bytes if we can do nocow write
+ * 0 if we can't do nocow write
+ * -EAGAIN if we can't get the needed lock or there are ordered extents
+ * for * (nowait == true) case
+ * <0 if other error happened
+ *
+ * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
+ */
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ return check_can_nocow(inode, pos, write_bytes, false);
+}
+
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
+{
+ btrfs_drew_write_unlock(&inode->root->snapshot_lock);
+}
+
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct iov_iter *i)
{
@@ -1589,7 +1564,6 @@
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
@@ -1642,13 +1616,12 @@
fs_info->sectorsize);
extent_changeset_release(data_reserved);
- ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &data_reserved, pos,
write_bytes);
if (ret < 0) {
- if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) &&
- btrfs_check_can_nocow(BTRFS_I(inode), pos,
- &write_bytes) > 0) {
+ if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+ &write_bytes) > 0) {
/*
* For nodata cow case, no need to reserve
* data space.
@@ -1673,11 +1646,11 @@
reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, pos,
write_bytes);
else
- btrfs_end_write_no_snapshotting(root);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
break;
}
@@ -1747,7 +1720,7 @@
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, __pos,
release_bytes, true);
}
@@ -1757,8 +1730,9 @@
fs_info->sectorsize);
if (copied > 0)
- ret = btrfs_dirty_pages(inode, pages, dirty_pages,
- pos, copied, &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
+ dirty_pages, pos, copied,
+ &cached_state);
/*
* If we have not locked the extent range, because the range's
@@ -1781,7 +1755,7 @@
release_bytes = 0;
if (only_release_metadata)
- btrfs_end_write_no_snapshotting(root);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos,
@@ -1799,8 +1773,6 @@
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
- btrfs_btree_balance_dirty(fs_info);
pos += copied;
num_written += copied;
@@ -1810,11 +1782,12 @@
if (release_bytes) {
if (only_release_metadata) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved,
round_down(pos, fs_info->sectorsize),
release_bytes, true);
}
@@ -1834,7 +1807,7 @@
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, from);
+ written = btrfs_direct_IO(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
@@ -1888,7 +1861,6 @@
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start_pos;
u64 end_pos;
ssize_t num_written = 0;
@@ -1925,16 +1897,11 @@
* We will allocate space in case nodatacow is not set,
* so bail
*/
- if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) ||
- btrfs_check_can_nocow(BTRFS_I(inode), pos,
- &nocow_bytes) <= 0) {
+ if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
+ <= 0) {
inode_unlock(inode);
return -EAGAIN;
}
-
- /* check_can_nocow() locks the snapshot lock on success */
- btrfs_end_write_no_snapshotting(root);
/*
* There are holes in the range or parts of the range that must
* be COWed (shared extents, RO block groups, etc), so just bail
@@ -1992,7 +1959,40 @@
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * 1. We must always clear IOCB_DSYNC in order to not deadlock
+ * in iomap, as it calls generic_write_sync() in this case.
+ * 2. If we are async, we can call iomap_dio_complete() either
+ * in
+ *
+ * 2.1. A worker thread from the last bio completed. In this
+ * case we need to mark the btrfs_dio_data that it is
+ * async in order to call generic_write_sync() properly.
+ * This is handled by setting BTRFS_DIO_SYNC_STUB in the
+ * current->journal_info.
+ * 2.2 The submitter context, because all IO completed
+ * before we exited iomap_dio_rw(). In this case we can
+ * just re-set the IOCB_DSYNC on the iocb and we'll do
+ * the sync below. If our ->end_io() gets called and
+ * current->journal_info is set, then we know we're in
+ * our current context and we will clear
+ * current->journal_info to indicate that we need to
+ * sync below.
+ */
+ if (sync) {
+ ASSERT(current->journal_info == NULL);
+ iocb->ki_flags &= ~IOCB_DSYNC;
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
num_written = __btrfs_direct_write(iocb, from);
+
+ /*
+ * As stated above, we cleared journal_info, so we need to do
+ * the sync ourselves.
+ */
+ if (sync && current->journal_info == NULL)
+ iocb->ki_flags |= IOCB_DSYNC;
+ current->journal_info = NULL;
} else {
num_written = btrfs_buffered_write(iocb, from);
if (num_written > 0)
@@ -2026,12 +2026,12 @@
filp->private_data = NULL;
/*
- * ordered_data_close is set by setattr when we are about to truncate
- * a file from a non-zero size to a zero size. This tries to
- * flush down new bytes that may have been written if the
- * application were using truncate to replace a file in place.
+ * Set by setattr when we are about to truncate a file from a non-zero
+ * size to a zero size. This tries to flush down new bytes that may
+ * have been written if the application were using truncate to replace
+ * a file in place.
*/
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
return 0;
@@ -2057,6 +2057,30 @@
return ret;
}
+static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ list_empty(&ctx->ordered_extents))
+ return true;
+
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= then the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
+ if (inode->last_trans <= fs_info->last_trans_committed &&
+ (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
+ list_empty(&ctx->ordered_extents)))
+ return true;
+
+ return false;
+}
+
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
@@ -2072,25 +2096,28 @@
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
+ u64 len;
+ bool full_sync;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
/*
- * Set the range to full if the NO_HOLES feature is not enabled.
- * This is to avoid missing file extent items representing holes after
- * replaying the log.
+ * Always set the range to a full range, otherwise we can get into
+ * several problems, from missing file extent items to represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
- start = 0;
- end = LLONG_MAX;
- }
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;
/*
* We write the dirty pages in the range and wait until they complete
@@ -2114,17 +2141,12 @@
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges, and assertion failures during
- * hole detection. Do this while holding the inode lock, to avoid races
- * with other tasks.
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
*/
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
/*
* Before we acquired the inode's lock, someone may have dirtied more
@@ -2155,20 +2177,33 @@
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
- * Also, the range length can be represented by u64, we have to do the
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
+ * For a full fsync we wait for the ordered extents to complete while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion, to avoid data loss if we fsync,
+ * the current transaction commits before the ordered extents complete
+ * and a power failure happens right after that.
*/
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
- if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and use instead the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
}
+
+ if (ret)
+ goto out_release_extents;
+
atomic_inc(&root->log_batch);
smp_mb();
- if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
+ if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2184,9 +2219,7 @@
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
/*
@@ -2203,12 +2236,11 @@
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
- ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2235,6 +2267,13 @@
goto out;
}
}
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
ret = btrfs_commit_transaction(trans);
} else {
ret = btrfs_end_transaction(trans);
@@ -2245,6 +2284,12 @@
if (!ret)
ret = err;
return ret > 0 ? -EIO : ret;
+
+out_release_extents:
+ btrfs_release_log_ctx_extents(&ctx);
+ up_write(&BTRFS_I(inode)->dio_sem);
+ inode_unlock(inode);
+ goto out;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -2380,7 +2425,6 @@
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
- hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
@@ -2413,7 +2457,7 @@
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
round_down(*start, fs_info->sectorsize),
- round_up(*len, fs_info->sectorsize), 0);
+ round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2441,7 +2485,8 @@
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ lockend);
/*
* We need to make sure we have no ordered extents in this range
@@ -2449,7 +2494,7 @@
* we need to try again.
*/
if ((!ordered ||
- (ordered->file_offset + ordered->len <= lockstart ||
+ (ordered->file_offset + ordered->num_bytes <= lockstart ||
ordered->file_offset > lockend)) &&
!filemap_range_has_page(inode->i_mapping,
lockstart, lockend)) {
@@ -2469,11 +2514,11 @@
return 0;
}
-static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path,
- struct btrfs_clone_extent_info *clone_info,
- const u64 clone_len)
+ struct btrfs_replace_extent_info *extent_info,
+ const u64 replace_len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2482,46 +2527,69 @@
struct btrfs_key key;
int slot;
struct btrfs_ref ref = { 0 };
- u64 ref_offset;
int ret;
- if (clone_len == 0)
+ if (replace_len == 0)
return 0;
- if (clone_info->disk_offset == 0 &&
+ if (extent_info->disk_offset == 0 &&
btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
key.objectid = btrfs_ino(BTRFS_I(inode));
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = clone_info->file_offset;
+ key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
if (ret)
return ret;
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, clone_info->extent_buf,
+ write_extent_buffer(leaf, extent_info->extent_buf,
btrfs_item_ptr_offset(leaf, slot),
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
- btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
+ if (extent_info->is_new_extent)
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ extent_info->file_offset, replace_len);
+ if (ret)
+ return ret;
+
/* If it's a hole, nothing more needs to be done. */
- if (clone_info->disk_offset == 0)
+ if (extent_info->disk_offset == 0)
return 0;
- inode_add_bytes(inode, clone_len);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- clone_info->disk_offset,
- clone_info->disk_len, 0);
- ref_offset = clone_info->file_offset - clone_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), ref_offset);
- ret = btrfs_inc_extent_ref(trans, &ref);
+ inode_add_bytes(inode, replace_len);
+
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
+ key.objectid = extent_info->disk_offset;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = extent_info->disk_len;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(BTRFS_I(inode)),
+ extent_info->file_offset,
+ extent_info->qgroup_reserved,
+ &key);
+ } else {
+ u64 ref_offset;
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ extent_info->disk_offset,
+ extent_info->disk_len, 0);
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), ref_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ }
+
+ extent_info->insertions++;
return ret;
}
@@ -2529,15 +2597,15 @@
/*
* The respective range must have been previously locked, as well as the inode.
* The end offset is inclusive (last byte of the range).
- * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
- * cloning.
- * When cloning, we don't want to end up in a state where we dropped extents
- * without inserting a new one, so we must abort the transaction to avoid a
- * corruption.
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
+ * the file range with an extent.
+ * When not punching a hole, we don't want to end up in a state where we dropped
+ * extents without inserting a new one, so we must abort the transaction to avoid
+ * a corruption.
*/
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2566,10 +2634,10 @@
/*
* 1 - update the inode
* 1 - removing the extents in the range
- * 1 - adding the hole extent if no_holes isn't set or if we are cloning
- * an extent
+ * 1 - adding the hole extent if no_holes isn't set or if we are
+ * replacing the range with a new extent
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
rsv_count = 3;
else
rsv_count = 2;
@@ -2588,25 +2656,28 @@
cur_offset = start;
while (cur_offset < end) {
- ret = __btrfs_drop_extents(trans, root, inode, path,
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
cur_offset, end + 1, &drop_end,
1, 0, 0, NULL);
if (ret != -ENOSPC) {
/*
- * When cloning we want to avoid transaction aborts when
- * nothing was done and we are attempting to clone parts
- * of inline extents, in such cases -EOPNOTSUPP is
- * returned by __btrfs_drop_extents() without having
- * changed anything in the file.
+ * The only time we don't want to abort is if we are
+ * attempting to clone a partial inline extent, in which
+ * case we'll get EOPNOTSUPP. However if we aren't
+ * clone we need to abort no matter what, because if we
+ * got EOPNOTSUPP via prealloc then we messed up and
+ * need to abort.
*/
- if (clone_info && ret && ret != -EOPNOTSUPP)
+ if (ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent)))
btrfs_abort_transaction(trans, ret);
break;
}
trans->block_rsv = &fs_info->trans_block_rsv;
- if (!clone_info && cur_offset < drop_end &&
+ if (!extent_info && cur_offset < drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
@@ -2620,20 +2691,38 @@
btrfs_abort_transaction(trans, ret);
break;
}
+ } else if (!extent_info && cur_offset < drop_end) {
+ /*
+ * We are past the i_size here, but since we didn't
+ * insert holes we need to clear the mapped area so we
+ * know to not set disk_i_size in this area until a new
+ * file extent is inserted here.
+ */
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ cur_offset, drop_end - cur_offset);
+ if (ret) {
+ /*
+ * We couldn't clear our area, so we could
+ * presumably adjust up and corrupt the fs, so
+ * we need to abort.
+ */
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
}
- if (clone_info && drop_end > clone_info->file_offset) {
- u64 clone_len = drop_end - clone_info->file_offset;
+ if (extent_info && drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_end - extent_info->file_offset;
- ret = btrfs_insert_clone_extent(trans, inode, path,
- clone_info, clone_len);
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- clone_info->data_len -= clone_len;
- clone_info->data_offset += clone_len;
- clone_info->file_offset += clone_len;
+ extent_info->data_len -= replace_len;
+ extent_info->data_offset += replace_len;
+ extent_info->file_offset += replace_len;
}
cur_offset = drop_end;
@@ -2657,7 +2746,7 @@
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
- if (!clone_info) {
+ if (!extent_info) {
ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
@@ -2676,7 +2765,7 @@
* than 16Mb would force the full fsync any way (when
* try_release_extent_mapping() is invoked during page cache truncation.
*/
- if (clone_info)
+ if (extent_info && !extent_info->is_new_extent)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -2702,7 +2791,7 @@
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
*/
- if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
+ if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
if (ret) {
@@ -2710,10 +2799,19 @@
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
+ } else if (!extent_info && cur_offset < drop_end) {
+ /* See the comment in the loop above for the reasoning here. */
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ cur_offset, drop_end - cur_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+
}
- if (clone_info) {
- ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
- clone_info->data_len);
+ if (extent_info) {
+ ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
+ extent_info->data_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2768,9 +2866,9 @@
goto out_only_mutex;
}
- lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
+ lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
- btrfs_inode_sectorsize(inode)) - 1;
+ btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -2855,7 +2953,7 @@
goto out;
}
- ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
+ ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
&trans);
btrfs_free_path(path);
if (ret)
@@ -2959,7 +3057,7 @@
inode->i_ctime = current_time(inode);
i_size_write(inode, end);
- btrfs_ordered_update_i_size(inode, end, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode(trans, root, inode);
ret2 = btrfs_end_transaction(trans);
@@ -2972,7 +3070,7 @@
RANGE_BOUNDARY_HOLE,
};
-static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
const u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -2980,7 +3078,7 @@
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -3005,7 +3103,7 @@
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -3013,8 +3111,8 @@
inode_dio_wait(inode);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
- alloc_start, alloc_end - alloc_start, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ alloc_end - alloc_start);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -3057,8 +3155,8 @@
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
- alloc_start, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ sectorsize);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -3095,7 +3193,8 @@
* to cover them.
*/
if (!IS_ALIGNED(offset, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) {
@@ -3111,7 +3210,7 @@
}
if (!IS_ALIGNED(offset + len, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode,
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset + len);
if (ret < 0)
goto out;
@@ -3165,7 +3264,7 @@
ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
if (ret && space_reserved)
- btrfs_free_reserved_data_space(inode, data_reserved,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
alloc_start, bytes_to_reserve);
extent_changeset_free(data_reserved);
@@ -3189,7 +3288,7 @@
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(inode);
+ int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
alloc_start = round_down(offset, blocksize);
@@ -3271,10 +3370,11 @@
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ locked_end);
if (ordered &&
- ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset + ordered->num_bytes > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
@@ -3299,7 +3399,7 @@
INIT_LIST_HEAD(&reserve_list);
while (cur_offset < alloc_end) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
- alloc_end - cur_offset, 0);
+ alloc_end - cur_offset);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
break;
@@ -3330,8 +3430,9 @@
* range, free reserved data space first, otherwise
* it'll result in false ENOSPC error.
*/
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, last_byte - cur_offset);
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, cur_offset,
+ last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -3348,7 +3449,7 @@
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
else
- btrfs_free_reserved_data_space(inode,
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, range->start,
range->len);
list_del(&range->list);
@@ -3369,35 +3470,36 @@
inode_unlock(inode);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
- btrfs_free_reserved_data_space(inode, data_reserved,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
}
-static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
+static loff_t find_desired_extent(struct inode *inode, loff_t offset,
+ int whence)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
+ loff_t i_size = inode->i_size;
u64 lockstart;
u64 lockend;
u64 start;
u64 len;
int ret = 0;
- if (inode->i_size == 0)
+ if (i_size == 0 || offset >= i_size)
return -ENXIO;
/*
- * *offset can be negative, in this case we start finding DATA/HOLE from
+ * offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
*/
- start = max_t(loff_t, 0, *offset);
+ start = max_t(loff_t, 0, offset);
lockstart = round_down(start, fs_info->sectorsize);
- lockend = round_up(i_size_read(inode),
- fs_info->sectorsize);
+ lockend = round_up(i_size, fs_info->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + fs_info->sectorsize;
lockend--;
@@ -3406,7 +3508,7 @@
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
- while (start < inode->i_size) {
+ while (start < i_size) {
em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -3429,59 +3531,71 @@
cond_resched();
}
free_extent_map(em);
- if (!ret) {
- if (whence == SEEK_DATA && start >= inode->i_size)
- ret = -ENXIO;
- else
- *offset = min_t(loff_t, start, inode->i_size);
- }
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
- return ret;
+ if (ret) {
+ offset = ret;
+ } else {
+ if (whence == SEEK_DATA && start >= i_size)
+ offset = -ENXIO;
+ else
+ offset = min_t(loff_t, start, i_size);
+ }
+
+ return offset;
}
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- int ret;
- inode_lock(inode);
switch (whence) {
- case SEEK_END:
- case SEEK_CUR:
- offset = generic_file_llseek(file, offset, whence);
- goto out;
+ default:
+ return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- if (offset >= i_size_read(inode)) {
- inode_unlock(inode);
- return -ENXIO;
- }
-
- ret = find_desired_extent(inode, &offset, whence);
- if (ret) {
- inode_unlock(inode);
- return ret;
- }
+ inode_lock_shared(inode);
+ offset = find_desired_extent(inode, offset, whence);
+ inode_unlock_shared(inode);
+ break;
}
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-out:
- inode_unlock(inode);
- return offset;
+ if (offset < 0)
+ return offset;
+
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return generic_file_open(inode, filp);
}
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ inode_lock_shared(inode);
+ ret = btrfs_direct_IO(iocb, to);
+ inode_unlock_shared(inode);
+ if (ret < 0 || !iov_iter_count(to) ||
+ iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
+ return ret;
+ }
+
+ return generic_file_buffered_read(iocb, to, ret);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
+ .splice_write = iter_file_splice_write,
.mmap = btrfs_file_mmap,
.open = btrfs_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d2d32fe..ba28070 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -21,9 +21,11 @@
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "discard.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
-#define MAX_CACHE_BYTES_PER_GIG SZ_32K
+#define MAX_CACHE_BYTES_PER_GIG SZ_64K
+#define FORCE_EXTENT_THRESHOLD SZ_1M
struct btrfs_trim_range {
u64 start;
@@ -31,6 +33,8 @@
struct list_head list;
};
+static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info);
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -78,7 +82,7 @@
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
+ inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
@@ -91,8 +95,7 @@
return inode;
}
-struct inode *lookup_free_space_inode(
- struct btrfs_block_group_cache *block_group,
+struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -107,7 +110,7 @@
return inode;
inode = __lookup_free_space_inode(fs_info->tree_root, path,
- block_group->key.objectid);
+ block_group->start);
if (IS_ERR(inode))
return inode;
@@ -190,7 +193,7 @@
}
int create_free_space_inode(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
int ret;
@@ -201,7 +204,7 @@
return ret;
return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
- ino, block_group->key.objectid);
+ ino, block_group->start);
}
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
@@ -224,7 +227,7 @@
}
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -368,10 +371,10 @@
}
}
-static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
- int uptodate)
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
struct page *page;
+ struct inode *inode = io_ctl->inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int i;
@@ -410,8 +413,6 @@
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *val;
-
io_ctl_map_page(io_ctl, 1);
/*
@@ -426,14 +427,13 @@
io_ctl->size -= sizeof(u64) * 2;
}
- val = io_ctl->cur;
- *val = cpu_to_le64(generation);
+ put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
}
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *gen;
+ u64 cache_gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -448,11 +448,11 @@
io_ctl->size -= sizeof(u64) * 2;
}
- gen = io_ctl->cur;
- if (le64_to_cpu(*gen) != generation) {
+ cache_gen = get_unaligned_le64(io_ctl->cur);
+ if (cache_gen != generation) {
btrfs_err_rl(io_ctl->fs_info,
"space cache generation (%llu) does not match inode (%llu)",
- *gen, generation);
+ cache_gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -522,8 +522,8 @@
return -ENOSPC;
entry = io_ctl->cur;
- entry->offset = cpu_to_le64(offset);
- entry->bytes = cpu_to_le64(bytes);
+ put_unaligned_le64(offset, &entry->offset);
+ put_unaligned_le64(bytes, &entry->bytes);
entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
BTRFS_FREE_SPACE_EXTENT;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
@@ -596,8 +596,8 @@
}
e = io_ctl->cur;
- entry->offset = le64_to_cpu(e->offset);
- entry->bytes = le64_to_cpu(e->bytes);
+ entry->offset = get_unaligned_le64(&e->offset);
+ entry->bytes = get_unaligned_le64(&e->bytes);
*type = e->type;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
io_ctl->size -= sizeof(struct btrfs_free_space_entry);
@@ -729,7 +729,7 @@
readahead_cache(inode);
- ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
+ ret = io_ctl_prepare_pages(&io_ctl, true);
if (ret)
goto out;
@@ -755,6 +755,16 @@
goto free_cache;
}
+ /*
+ * Sync discard ensures that the free space cache is always
+ * trimmed. So when reading this in, the state should reflect
+ * that. We also do this for async as a stop gap for lack of
+ * persistence.
+ */
+ if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
+
if (!e->bytes) {
ret = -1;
kmem_cache_free(btrfs_free_space_cachep, e);
@@ -810,12 +820,19 @@
ret = io_ctl_read_bitmap(&io_ctl, e);
if (ret)
goto free_cache;
+ e->bitmap_extents = count_bitmap_extents(ctl, e);
+ if (!btrfs_free_space_trimmed(e)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] +=
+ e->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes;
+ }
}
io_ctl_drop_pages(&io_ctl);
merge_space_tree(ctl);
ret = 1;
out:
+ btrfs_discard_update_discardable(ctl->private, ctl);
io_ctl_free(&io_ctl);
return ret;
free_cache:
@@ -824,7 +841,7 @@
goto out;
}
-int load_free_space_cache(struct btrfs_block_group_cache *block_group)
+int load_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -832,7 +849,7 @@
struct btrfs_path *path;
int ret = 0;
bool matched;
- u64 used = btrfs_block_group_used(&block_group->item);
+ u64 used = block_group->used;
/*
* If this block group has been marked to be cleared for one reason or
@@ -886,13 +903,13 @@
spin_unlock(&block_group->lock);
ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
- path, block_group->key.objectid);
+ path, block_group->start);
btrfs_free_path(path);
if (ret <= 0)
goto out;
spin_lock(&ctl->tree_lock);
- matched = (ctl->free_space == (block_group->key.offset - used -
+ matched = (ctl->free_space == (block_group->length - used -
block_group->bytes_super));
spin_unlock(&ctl->tree_lock);
@@ -900,7 +917,7 @@
__btrfs_remove_free_space_cache(ctl);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
- block_group->key.objectid);
+ block_group->start);
ret = -1;
}
out:
@@ -913,7 +930,7 @@
btrfs_warn(fs_info,
"failed to load free space cache for block group %llu, rebuilding it now",
- block_group->key.objectid);
+ block_group->start);
}
iput(inode);
@@ -923,7 +940,7 @@
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
int *entries, int *bitmaps,
struct list_head *bitmap_list)
{
@@ -1051,7 +1068,8 @@
}
static noinline_for_stack int write_pinned_extent_entries(
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
int *entries)
{
@@ -1069,11 +1087,11 @@
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
- unpin = block_group->fs_info->pinned_extents;
+ unpin = &trans->transaction->pinned_extents;
- start = block_group->key.objectid;
+ start = block_group->start;
- while (start < block_group->key.objectid + block_group->key.offset) {
+ while (start < block_group->start + block_group->length) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL);
@@ -1081,13 +1099,12 @@
return 0;
/* This pinned extent is out of our range */
- if (extent_start >= block_group->key.objectid +
- block_group->key.offset)
+ if (extent_start >= block_group->start + block_group->length)
return 0;
extent_start = max(extent_start, start);
- extent_end = min(block_group->key.objectid +
- block_group->key.offset, extent_end + 1);
+ extent_end = min(block_group->start + block_group->length,
+ extent_end + 1);
len = extent_end - extent_start;
*entries += 1;
@@ -1151,7 +1168,7 @@
static int __btrfs_wait_cache_io(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_path *path, u64 offset)
{
@@ -1173,13 +1190,10 @@
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
- if (block_group) {
-#ifdef DEBUG
- btrfs_err(root->fs_info,
- "failed to write free space cache for block group %llu",
- block_group->key.objectid);
-#endif
- }
+ if (block_group)
+ btrfs_debug(root->fs_info,
+ "failed to write free space cache for block group %llu error %d",
+ block_group->start, ret);
}
btrfs_update_inode(trans, root, inode);
@@ -1219,12 +1233,12 @@
}
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
block_group, &block_group->io_ctl,
- path, block_group->key.objectid);
+ path, block_group->start);
}
/**
@@ -1240,7 +1254,7 @@
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_trans_handle *trans)
{
@@ -1275,7 +1289,7 @@
}
/* Lock all pages first so we can lock the extent safely. */
- ret = io_ctl_prepare_pages(io_ctl, inode, 0);
+ ret = io_ctl_prepare_pages(io_ctl, false);
if (ret)
goto out_unlock;
@@ -1301,7 +1315,7 @@
* If this changes while we are working we'll get added back to
* the dirty list and redo it. No locking needed
*/
- ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
+ ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
if (ret)
goto out_nospc_locked;
@@ -1320,8 +1334,9 @@
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
- i_size_read(inode), &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
+ io_ctl->num_pages, 0, i_size_read(inode),
+ &cached_state);
if (ret)
goto out_nospc;
@@ -1351,18 +1366,6 @@
return 0;
-out:
- io_ctl->inode = NULL;
- io_ctl_free(io_ctl);
- if (ret) {
- invalidate_inode_pages2(inode->i_mapping);
- BTRFS_I(inode)->generation = 0;
- }
- btrfs_update_inode(trans, root, inode);
- if (must_iput)
- iput(inode);
- return ret;
-
out_nospc_locked:
cleanup_bitmap_list(&bitmap_list);
spin_unlock(&ctl->tree_lock);
@@ -1375,11 +1378,21 @@
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
- goto out;
+out:
+ io_ctl->inode = NULL;
+ io_ctl_free(io_ctl);
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
+ BTRFS_I(inode)->generation = 0;
+ }
+ btrfs_update_inode(trans, root, inode);
+ if (must_iput)
+ iput(inode);
+ return ret;
}
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1401,11 +1414,9 @@
ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
block_group, &block_group->io_ctl, trans);
if (ret) {
-#ifdef DEBUG
- btrfs_err(fs_info,
- "failed to write free space cache for block group %llu",
- block_group->key.objectid);
-#endif
+ btrfs_debug(fs_info,
+ "failed to write free space cache for block group %llu error %d",
+ block_group->start, ret);
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&block_group->lock);
@@ -1630,6 +1641,11 @@
{
rb_erase(&info->offset_index, &ctl->free_space_offset);
ctl->free_extents--;
+
+ if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]--;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
+ }
}
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -1650,6 +1666,11 @@
if (ret)
return ret;
+ if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]++;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
+ }
+
ctl->free_space += info->bytes;
ctl->free_extents++;
return ret;
@@ -1657,11 +1678,11 @@
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
- struct btrfs_block_group_cache *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->private;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
- u64 size = block_group->key.offset;
+ u64 size = block_group->length;
u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
@@ -1670,26 +1691,17 @@
ASSERT(ctl->total_bitmaps <= max_bitmaps);
/*
- * The goal is to keep the total amount of memory used per 1gb of space
- * at or below 32k, so we need to adjust how much memory we allow to be
- * used by extent based free space tracking
+ * We are trying to keep the total amount of memory used per 1GiB of
+ * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
+ * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
+ * bitmaps, we may end up using more memory than this.
*/
if (size < SZ_1G)
max_bytes = MAX_CACHE_BYTES_PER_GIG;
else
max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
- /*
- * we want to account for 1 more bitmap than what we have so we can make
- * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
- * we add more bitmaps.
- */
- bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
-
- if (bitmap_bytes >= max_bytes) {
- ctl->extents_thresh = 0;
- return;
- }
+ bitmap_bytes = ctl->total_bitmaps * ctl->unit;
/*
* we want the extent entry threshold to always be at most 1/2 the max
@@ -1706,17 +1718,31 @@
struct btrfs_free_space *info,
u64 offset, u64 bytes)
{
- unsigned long start, count;
+ unsigned long start, count, end;
+ int extent_delta = -1;
start = offset_to_bit(info->offset, ctl->unit, offset);
count = bytes_to_bits(bytes, ctl->unit);
- ASSERT(start + count <= BITS_PER_BITMAP);
+ end = start + count;
+ ASSERT(end <= BITS_PER_BITMAP);
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
if (info->max_extent_size > ctl->unit)
info->max_extent_size = 0;
+
+ if (start && test_bit(start - 1, info->bitmap))
+ extent_delta++;
+
+ if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
+ extent_delta++;
+
+ info->bitmap_extents += extent_delta;
+ if (!btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
+ }
}
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -1731,16 +1757,30 @@
struct btrfs_free_space *info, u64 offset,
u64 bytes)
{
- unsigned long start, count;
+ unsigned long start, count, end;
+ int extent_delta = 1;
start = offset_to_bit(info->offset, ctl->unit, offset);
count = bytes_to_bits(bytes, ctl->unit);
- ASSERT(start + count <= BITS_PER_BITMAP);
+ end = start + count;
+ ASSERT(end <= BITS_PER_BITMAP);
bitmap_set(info->bitmap, start, count);
info->bytes += bytes;
ctl->free_space += bytes;
+
+ if (start && test_bit(start - 1, info->bitmap))
+ extent_delta--;
+
+ if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
+ extent_delta--;
+
+ info->bitmap_extents += extent_delta;
+ if (!btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
+ }
}
/*
@@ -1876,11 +1916,35 @@
return NULL;
}
+static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info)
+{
+ struct btrfs_block_group *block_group = ctl->private;
+ u64 bytes = bitmap_info->bytes;
+ unsigned int rs, re;
+ int count = 0;
+
+ if (!block_group || !bytes)
+ return count;
+
+ bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0,
+ BITS_PER_BITMAP) {
+ bytes -= (rs - re) * ctl->unit;
+ count++;
+
+ if (!bytes)
+ break;
+ }
+
+ return count;
+}
+
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset)
{
info->offset = offset_to_bitmap(ctl, offset);
info->bytes = 0;
+ info->bitmap_extents = 0;
INIT_LIST_HEAD(&info->list);
link_free_space(ctl, info);
ctl->total_bitmaps++;
@@ -1891,6 +1955,18 @@
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info)
{
+ /*
+ * Normally when this is called, the bitmap is completely empty. However,
+ * if we are blowing up the free space cache for one reason or another
+ * via __btrfs_remove_free_space_cache(), then it may not be freed and
+ * we may leave stats on the table.
+ */
+ if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] -=
+ bitmap_info->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
+
+ }
unlink_free_space(ctl, bitmap_info);
kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
@@ -1977,11 +2053,24 @@
static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
- u64 bytes)
+ u64 bytes, enum btrfs_trim_state trim_state)
{
u64 bytes_to_set = 0;
u64 end;
+ /*
+ * This is a tradeoff to make bitmap trim state minimal. We mark the
+ * whole bitmap untrimmed if at any point we add untrimmed regions.
+ */
+ if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
+ if (btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] +=
+ info->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
+ }
+ info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ }
+
end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
bytes_to_set = min(end - offset, bytes);
@@ -2001,7 +2090,7 @@
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- struct btrfs_block_group_cache *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->private;
struct btrfs_fs_info *fs_info = block_group->fs_info;
bool forced = false;
@@ -2010,6 +2099,10 @@
forced = true;
#endif
+ /* This is a way to reclaim large regions from the bitmaps. */
+ if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
+ return false;
+
/*
* If we are below the extents threshold then we can add this as an
* extent, and don't have to deal with the bitmap
@@ -2022,8 +2115,8 @@
* of cache left then go ahead an dadd them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
- if (info->bytes <= fs_info->sectorsize * 4) {
- if (ctl->free_extents * 2 <= ctl->extents_thresh)
+ if (info->bytes <= fs_info->sectorsize * 8) {
+ if (ctl->free_extents * 3 <= ctl->extents_thresh)
return false;
} else {
return false;
@@ -2038,7 +2131,7 @@
* so allow those block groups to still be allowed to have a bitmap
* entry.
*/
- if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
+ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
return false;
return true;
@@ -2053,13 +2146,15 @@
struct btrfs_free_space *info)
{
struct btrfs_free_space *bitmap_info;
- struct btrfs_block_group_cache *block_group = NULL;
+ struct btrfs_block_group *block_group = NULL;
int added = 0;
u64 bytes, offset, bytes_added;
+ enum btrfs_trim_state trim_state;
int ret;
bytes = info->bytes;
offset = info->offset;
+ trim_state = info->trim_state;
if (!ctl->op->use_bitmap(ctl, info))
return 0;
@@ -2094,8 +2189,8 @@
}
if (entry->offset == offset_to_bitmap(ctl, offset)) {
- bytes_added = add_bytes_to_bitmap(ctl, entry,
- offset, bytes);
+ bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
+ bytes, trim_state);
bytes -= bytes_added;
offset += bytes_added;
}
@@ -2114,7 +2209,8 @@
goto new_bitmap;
}
- bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
+ trim_state);
bytes -= bytes_added;
offset += bytes_added;
added = 0;
@@ -2148,6 +2244,7 @@
/* allocate the bitmap */
info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
GFP_NOFS);
+ info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
spin_lock(&ctl->tree_lock);
if (!info->bitmap) {
ret = -ENOMEM;
@@ -2167,6 +2264,22 @@
return ret;
}
+/*
+ * Free space merging rules:
+ * 1) Merge trimmed areas together
+ * 2) Let untrimmed areas coalesce with trimmed areas
+ * 3) Always pull neighboring regions from bitmaps
+ *
+ * The above rules are for when we merge free space based on btrfs_trim_state.
+ * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
+ * same reason: to promote larger extent regions which makes life easier for
+ * find_free_extent(). Rule 2 enables coalescing based on the common path
+ * being returning free space from btrfs_finish_extent_commit(). So when free
+ * space is trimmed, it will prevent aggregating trimmed new region and
+ * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents
+ * and provide find_free_extent() with the largest extents possible hoping for
+ * the reuse path.
+ */
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
@@ -2175,6 +2288,7 @@
bool merged = false;
u64 offset = info->offset;
u64 bytes = info->bytes;
+ const bool is_trimmed = btrfs_free_space_trimmed(info);
/*
* first we want to see if there is free space adjacent to the range we
@@ -2188,7 +2302,9 @@
else if (!right_info)
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
- if (right_info && !right_info->bitmap) {
+ /* See try_merge_free_space() comment. */
+ if (right_info && !right_info->bitmap &&
+ (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
if (update_stat)
unlink_free_space(ctl, right_info);
else
@@ -2198,8 +2314,10 @@
merged = true;
}
+ /* See try_merge_free_space() comment. */
if (left_info && !left_info->bitmap &&
- left_info->offset + left_info->bytes == offset) {
+ left_info->offset + left_info->bytes == offset &&
+ (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
if (update_stat)
unlink_free_space(ctl, left_info);
else
@@ -2235,6 +2353,10 @@
bytes = (j - i) * ctl->unit;
info->bytes += bytes;
+ /* See try_merge_free_space() comment. */
+ if (!btrfs_free_space_trimmed(bitmap))
+ info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
if (update_stat)
bitmap_clear_bits(ctl, bitmap, end, bytes);
else
@@ -2288,6 +2410,10 @@
info->offset -= bytes;
info->bytes += bytes;
+ /* See try_merge_free_space() comment. */
+ if (!btrfs_free_space_trimmed(bitmap))
+ info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
if (update_stat)
bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
else
@@ -2337,10 +2463,13 @@
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
- u64 offset, u64 bytes)
+ u64 offset, u64 bytes,
+ enum btrfs_trim_state trim_state)
{
+ struct btrfs_block_group *block_group = ctl->private;
struct btrfs_free_space *info;
int ret = 0;
+ u64 filter_bytes = bytes;
info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
if (!info)
@@ -2348,6 +2477,7 @@
info->offset = offset;
info->bytes = bytes;
+ info->trim_state = trim_state;
RB_CLEAR_NODE(&info->offset_index);
spin_lock(&ctl->tree_lock);
@@ -2376,10 +2506,13 @@
*/
steal_from_bitmap(ctl, info, true);
+ filter_bytes = max(filter_bytes, info->bytes);
+
ret = link_free_space(ctl, info);
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
out:
+ btrfs_discard_update_discardable(block_group, ctl);
spin_unlock(&ctl->tree_lock);
if (ret) {
@@ -2387,18 +2520,47 @@
ASSERT(ret != -EEXIST);
}
+ if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
+ btrfs_discard_check_filter(block_group, filter_bytes);
+ btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
+ }
+
return ret;
}
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size)
{
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
+ if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
+ trim_state = BTRFS_TRIM_STATE_TRIMMED;
+
return __btrfs_add_free_space(block_group->fs_info,
block_group->free_space_ctl,
- bytenr, size);
+ bytenr, size, trim_state);
}
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+/*
+ * This is a subtle distinction because when adding free space back in general,
+ * we want it to be added as untrimmed for async. But in the case where we add
+ * it on loading of a block group, we want to consider it trimmed.
+ */
+int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size)
+{
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
+ if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
+ trim_state = BTRFS_TRIM_STATE_TRIMMED;
+
+ return __btrfs_add_free_space(block_group->fs_info,
+ block_group->free_space_ctl,
+ bytenr, size, trim_state);
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -2470,8 +2632,10 @@
}
spin_unlock(&ctl->tree_lock);
- ret = btrfs_add_free_space(block_group, offset + bytes,
- old_end - (offset + bytes));
+ ret = __btrfs_add_free_space(block_group->fs_info, ctl,
+ offset + bytes,
+ old_end - (offset + bytes),
+ info->trim_state);
WARN_ON(ret);
goto out;
}
@@ -2483,12 +2647,13 @@
goto again;
}
out_lock:
+ btrfs_discard_update_discardable(block_group, ctl);
spin_unlock(&ctl->tree_lock);
out:
return ret;
}
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2513,14 +2678,14 @@
"%d blocks of free space at or bigger than bytes is", count);
}
-void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
spin_lock_init(&ctl->tree_lock);
ctl->unit = fs_info->sectorsize;
- ctl->start = block_group->key.objectid;
+ ctl->start = block_group->start;
ctl->private = block_group;
ctl->op = &free_space_op;
INIT_LIST_HEAD(&ctl->trimming_ranges);
@@ -2540,9 +2705,8 @@
* pointed to by the cluster, someone else raced in and freed the
* cluster already. In that case, we just return without changing anything
*/
-static int
-__btrfs_return_cluster_to_free_space(
- struct btrfs_block_group_cache *block_group,
+static void __btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -2550,8 +2714,10 @@
struct rb_node *node;
spin_lock(&cluster->lock);
- if (cluster->block_group != block_group)
- goto out;
+ if (cluster->block_group != block_group) {
+ spin_unlock(&cluster->lock);
+ return;
+ }
cluster->block_group = NULL;
cluster->window_start = 0;
@@ -2568,18 +2734,29 @@
bitmap = (entry->bitmap != NULL);
if (!bitmap) {
+ /* Merging treats extents as if they were new */
+ if (!btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]--;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -=
+ entry->bytes;
+ }
+
try_merge_free_space(ctl, entry, false);
steal_from_bitmap(ctl, entry, false);
+
+ /* As we insert directly, update these statistics */
+ if (!btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]++;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] +=
+ entry->bytes;
+ }
}
tree_insert_offset(&ctl->free_space_offset,
entry->offset, &entry->offset_index, bitmap);
}
cluster->root = RB_ROOT;
-
-out:
spin_unlock(&cluster->lock);
btrfs_put_block_group(block_group);
- return 0;
}
static void __btrfs_remove_free_space_cache_locked(
@@ -2605,10 +2782,12 @@
{
spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache_locked(ctl);
+ if (ctl->private)
+ btrfs_discard_update_discardable(ctl->private, ctl);
spin_unlock(&ctl->tree_lock);
}
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_cluster *cluster;
@@ -2626,20 +2805,55 @@
cond_resched_lock(&ctl->tree_lock);
}
__btrfs_remove_free_space_cache_locked(ctl);
+ btrfs_discard_update_discardable(block_group, ctl);
spin_unlock(&ctl->tree_lock);
}
-u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+/**
+ * btrfs_is_free_space_trimmed - see if everything is trimmed
+ * @block_group: block_group of interest
+ *
+ * Walk @block_group's free space rb_tree to determine if everything is trimmed.
+ */
+bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+ bool ret = true;
+
+ spin_lock(&ctl->tree_lock);
+ node = rb_first(&ctl->free_space_offset);
+
+ while (node) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ if (!btrfs_free_space_trimmed(info)) {
+ ret = false;
+ break;
+ }
+
+ node = rb_next(node);
+ }
+
+ spin_unlock(&ctl->tree_lock);
+ return ret;
+}
+
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
struct btrfs_free_space *entry = NULL;
u64 bytes_search = bytes + empty_size;
u64 ret = 0;
u64 align_gap = 0;
u64 align_gap_len = 0;
+ enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
spin_lock(&ctl->tree_lock);
entry = find_free_space(ctl, &offset, &bytes_search,
@@ -2650,12 +2864,20 @@
ret = offset;
if (entry->bitmap) {
bitmap_clear_bits(ctl, entry, offset, bytes);
+
+ if (!btrfs_free_space_trimmed(entry))
+ atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
+
if (!entry->bytes)
free_bitmap(ctl, entry);
} else {
unlink_free_space(ctl, entry);
align_gap_len = offset - entry->offset;
align_gap = entry->offset;
+ align_gap_trim_state = entry->trim_state;
+
+ if (!btrfs_free_space_trimmed(entry))
+ atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
entry->offset = offset + bytes;
WARN_ON(entry->bytes < bytes + align_gap_len);
@@ -2667,11 +2889,13 @@
link_free_space(ctl, entry);
}
out:
+ btrfs_discard_update_discardable(block_group, ctl);
spin_unlock(&ctl->tree_lock);
if (align_gap_len)
__btrfs_add_free_space(block_group->fs_info, ctl,
- align_gap, align_gap_len);
+ align_gap, align_gap_len,
+ align_gap_trim_state);
return ret;
}
@@ -2683,12 +2907,11 @@
* Otherwise, it'll get a reference on the block group pointed to by the
* cluster and remove the cluster from it.
*/
-int btrfs_return_cluster_to_free_space(
- struct btrfs_block_group_cache *block_group,
+void btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl;
- int ret;
/* first, get a safe pointer to the block group */
spin_lock(&cluster->lock);
@@ -2696,29 +2919,30 @@
block_group = cluster->block_group;
if (!block_group) {
spin_unlock(&cluster->lock);
- return 0;
+ return;
}
} else if (cluster->block_group != block_group) {
/* someone else has already freed it don't redo their work */
spin_unlock(&cluster->lock);
- return 0;
+ return;
}
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
spin_unlock(&cluster->lock);
ctl = block_group->free_space_ctl;
/* now return any extents the cluster had on it */
spin_lock(&ctl->tree_lock);
- ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+ __btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&ctl->tree_lock);
+ btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
+
/* finally drop our ref */
btrfs_put_block_group(block_group);
- return ret;
}
-static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct btrfs_free_space *entry,
u64 bytes, u64 min_start,
@@ -2751,11 +2975,13 @@
* if it couldn't find anything suitably large, or a logical disk offset
* if things worked out
*/
-u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
struct btrfs_free_space *entry = NULL;
struct rb_node *node;
u64 ret = 0;
@@ -2808,8 +3034,6 @@
entry->bytes -= bytes;
}
- if (entry->bytes == 0)
- rb_erase(&entry->offset_index, &cluster->root);
break;
}
out:
@@ -2820,24 +3044,35 @@
spin_lock(&ctl->tree_lock);
+ if (!btrfs_free_space_trimmed(entry))
+ atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
+
ctl->free_space -= bytes;
+ if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
+
+ spin_lock(&cluster->lock);
if (entry->bytes == 0) {
+ rb_erase(&entry->offset_index, &cluster->root);
ctl->free_extents--;
if (entry->bitmap) {
kmem_cache_free(btrfs_free_space_bitmap_cachep,
entry->bitmap);
ctl->total_bitmaps--;
ctl->op->recalc_thresholds(ctl);
+ } else if (!btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]--;
}
kmem_cache_free(btrfs_free_space_cachep, entry);
}
+ spin_unlock(&cluster->lock);
spin_unlock(&ctl->tree_lock);
return ret;
}
-static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes,
@@ -2919,7 +3154,7 @@
* extent of cont1_bytes, and other clusters of at least min_bytes.
*/
static noinline int
-setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
+setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
u64 cont1_bytes, u64 min_bytes)
@@ -3010,7 +3245,7 @@
* that we have already failed to find extents that will work.
*/
static noinline int
-setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
+setup_cluster_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
u64 cont1_bytes, u64 min_bytes)
@@ -3060,7 +3295,7 @@
* returns zero and sets up cluster if things worked out, otherwise
* it returns -enospc
*/
-int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group,
+int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size)
{
@@ -3123,7 +3358,7 @@
list_del_init(&entry->list);
if (!ret) {
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
list_add_tail(&cluster->block_group_list,
&block_group->cluster_list);
cluster->block_group = block_group;
@@ -3151,9 +3386,10 @@
cluster->block_group = NULL;
}
-static int do_trimming(struct btrfs_block_group_cache *block_group,
+static int do_trimming(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 bytes,
u64 reserved_start, u64 reserved_bytes,
+ enum btrfs_trim_state reserved_trim_state,
struct btrfs_trim_range *trim_entry)
{
struct btrfs_space_info *space_info = block_group->space_info;
@@ -3161,6 +3397,9 @@
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int ret;
int update = 0;
+ const u64 end = start + bytes;
+ const u64 reserved_end = reserved_start + reserved_bytes;
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
u64 trimmed = 0;
spin_lock(&space_info->lock);
@@ -3174,11 +3413,20 @@
spin_unlock(&space_info->lock);
ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
- if (!ret)
+ if (!ret) {
*total_trimmed += trimmed;
+ trim_state = BTRFS_TRIM_STATE_TRIMMED;
+ }
mutex_lock(&ctl->cache_writeout_mutex);
- btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+ if (reserved_start < start)
+ __btrfs_add_free_space(fs_info, ctl, reserved_start,
+ start - reserved_start,
+ reserved_trim_state);
+ if (start + bytes < reserved_start + reserved_bytes)
+ __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
+ reserved_trim_state);
+ __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
list_del(&trim_entry->list);
mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3196,16 +3444,24 @@
return ret;
}
-static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
- u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+/*
+ * If @async is set, then we will trim 1 region and return.
+ */
+static int trim_no_bitmap(struct btrfs_block_group *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen,
+ bool async)
{
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
struct rb_node *node;
int ret = 0;
u64 extent_start;
u64 extent_bytes;
+ enum btrfs_trim_state extent_trim_state;
u64 bytes;
+ const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
while (start < end) {
struct btrfs_trim_range trim_entry;
@@ -3213,49 +3469,66 @@
mutex_lock(&ctl->cache_writeout_mutex);
spin_lock(&ctl->tree_lock);
- if (ctl->free_space < minlen) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- break;
- }
+ if (ctl->free_space < minlen)
+ goto out_unlock;
entry = tree_search_offset(ctl, start, 0, 1);
- if (!entry) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- break;
- }
+ if (!entry)
+ goto out_unlock;
- /* skip bitmaps */
- while (entry->bitmap) {
+ /* Skip bitmaps and if async, already trimmed entries */
+ while (entry->bitmap ||
+ (async && btrfs_free_space_trimmed(entry))) {
node = rb_next(&entry->offset_index);
- if (!node) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- goto out;
- }
+ if (!node)
+ goto out_unlock;
entry = rb_entry(node, struct btrfs_free_space,
offset_index);
}
- if (entry->offset >= end) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- break;
- }
+ if (entry->offset >= end)
+ goto out_unlock;
extent_start = entry->offset;
extent_bytes = entry->bytes;
- start = max(start, extent_start);
- bytes = min(extent_start + extent_bytes, end) - start;
- if (bytes < minlen) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- goto next;
- }
+ extent_trim_state = entry->trim_state;
+ if (async) {
+ start = entry->offset;
+ bytes = entry->bytes;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto next;
+ }
+ unlink_free_space(ctl, entry);
+ /*
+ * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
+ * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
+ * X when we come back around. So trim it now.
+ */
+ if (max_discard_size &&
+ bytes >= (max_discard_size +
+ BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
+ bytes = max_discard_size;
+ extent_bytes = max_discard_size;
+ entry->offset += max_discard_size;
+ entry->bytes -= max_discard_size;
+ link_free_space(ctl, entry);
+ } else {
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ }
+ } else {
+ start = max(start, extent_start);
+ bytes = min(extent_start + extent_bytes, end) - start;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto next;
+ }
- unlink_free_space(ctl, entry);
- kmem_cache_free(btrfs_free_space_cachep, entry);
+ unlink_free_space(ctl, entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ }
spin_unlock(&ctl->tree_lock);
trim_entry.start = extent_start;
@@ -3264,11 +3537,17 @@
mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
- extent_start, extent_bytes, &trim_entry);
- if (ret)
+ extent_start, extent_bytes, extent_trim_state,
+ &trim_entry);
+ if (ret) {
+ block_group->discard_cursor = start + bytes;
break;
+ }
next:
start += bytes;
+ block_group->discard_cursor = start;
+ if (async && *total_trimmed)
+ break;
if (fatal_signal_pending(current)) {
ret = -ERESTARTSYS;
@@ -3277,19 +3556,76 @@
cond_resched();
}
-out:
+
+ return ret;
+
+out_unlock:
+ block_group->discard_cursor = btrfs_block_group_end(block_group);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
return ret;
}
-static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
- u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+/*
+ * If we break out of trimming a bitmap prematurely, we should reset the
+ * trimming bit. In a rather contrieved case, it's possible to race here so
+ * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
+ *
+ * start = start of bitmap
+ * end = near end of bitmap
+ *
+ * Thread 1: Thread 2:
+ * trim_bitmaps(start)
+ * trim_bitmaps(end)
+ * end_trimming_bitmap()
+ * reset_trimming_bitmap()
+ */
+static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
{
+ struct btrfs_free_space *entry;
+
+ spin_lock(&ctl->tree_lock);
+ entry = tree_search_offset(ctl, offset, 1, 0);
+ if (entry) {
+ if (btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] +=
+ entry->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
+ }
+ entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ }
+
+ spin_unlock(&ctl->tree_lock);
+}
+
+static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *entry)
+{
+ if (btrfs_free_space_trimming_bitmap(entry)) {
+ entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
+ ctl->discardable_extents[BTRFS_STAT_CURR] -=
+ entry->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
+ }
+}
+
+/*
+ * If @async is set, then we will trim 1 region and return.
+ */
+static int trim_bitmaps(struct btrfs_block_group *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen,
+ u64 maxlen, bool async)
+{
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
int ret = 0;
int ret2;
u64 bytes;
u64 offset = offset_to_bitmap(ctl, start);
+ const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
while (offset < end) {
bool next_bitmap = false;
@@ -3299,35 +3635,84 @@
spin_lock(&ctl->tree_lock);
if (ctl->free_space < minlen) {
+ block_group->discard_cursor =
+ btrfs_block_group_end(block_group);
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
entry = tree_search_offset(ctl, offset, 1, 0);
- if (!entry) {
+ /*
+ * Bitmaps are marked trimmed lossily now to prevent constant
+ * discarding of the same bitmap (the reason why we are bound
+ * by the filters). So, retrim the block group bitmaps when we
+ * are preparing to punt to the unused_bgs list. This uses
+ * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
+ * which is the only discard index which sets minlen to 0.
+ */
+ if (!entry || (async && minlen && start == offset &&
+ btrfs_free_space_trimmed(entry))) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
+ /*
+ * Async discard bitmap trimming begins at by setting the start
+ * to be key.objectid and the offset_to_bitmap() aligns to the
+ * start of the bitmap. This lets us know we are fully
+ * scanning the bitmap rather than only some portion of it.
+ */
+ if (start == offset)
+ entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
+
bytes = minlen;
ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
if (ret2 || start >= end) {
+ /*
+ * We lossily consider a bitmap trimmed if we only skip
+ * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
+ */
+ if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
+ end_trimming_bitmap(ctl, entry);
+ else
+ entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
+ /*
+ * We already trimmed a region, but are using the locking above
+ * to reset the trim_state.
+ */
+ if (async && *total_trimmed) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto out;
+ }
+
bytes = min(bytes, end - start);
- if (bytes < minlen) {
+ if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
+ /*
+ * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
+ * If X < @minlen, we won't trim X when we come back around.
+ * So trim it now. We differ here from trimming extents as we
+ * don't keep individual state per bit.
+ */
+ if (async &&
+ max_discard_size &&
+ bytes > (max_discard_size + minlen))
+ bytes = max_discard_size;
+
bitmap_clear_bits(ctl, entry, start, bytes);
if (entry->bytes == 0)
free_bitmap(ctl, entry);
@@ -3339,19 +3724,25 @@
mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
- start, bytes, &trim_entry);
- if (ret)
+ start, bytes, 0, &trim_entry);
+ if (ret) {
+ reset_trimming_bitmap(ctl, offset);
+ block_group->discard_cursor =
+ btrfs_block_group_end(block_group);
break;
+ }
next:
if (next_bitmap) {
offset += BITS_PER_BITMAP * ctl->unit;
+ start = offset;
} else {
start += bytes;
- if (start >= offset + BITS_PER_BITMAP * ctl->unit)
- offset += BITS_PER_BITMAP * ctl->unit;
}
+ block_group->discard_cursor = start;
if (fatal_signal_pending(current)) {
+ if (start != offset)
+ reset_trimming_bitmap(ctl, offset);
ret = -ERESTARTSYS;
break;
}
@@ -3359,51 +3750,47 @@
cond_resched();
}
+ if (offset >= end)
+ block_group->discard_cursor = end;
+
+out:
return ret;
}
-void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
+int btrfs_trim_block_group(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
{
- atomic_inc(&cache->trimming);
-}
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ int ret;
+ u64 rem = 0;
-void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
-{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- bool cleanup;
+ *trimmed = 0;
spin_lock(&block_group->lock);
- cleanup = (atomic_dec_and_test(&block_group->trimming) &&
- block_group->removed);
+ if (block_group->removed) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ btrfs_freeze_block_group(block_group);
spin_unlock(&block_group->lock);
- if (cleanup) {
- mutex_lock(&fs_info->chunk_mutex);
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->key.objectid,
- 1);
- BUG_ON(!em); /* logic error, can't happen */
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- mutex_unlock(&fs_info->chunk_mutex);
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
+ if (ret)
+ goto out;
- /* once for us and once for the tree */
- free_extent_map(em);
- free_extent_map(em);
-
- /*
- * We've left one free space entry and other tasks trimming
- * this block group have left 1 entry each one. Free them.
- */
- __btrfs_remove_free_space_cache(block_group->free_space_ctl);
- }
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
+ div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
+ /* If we ended in the middle of a bitmap, reset the trimming flag */
+ if (rem)
+ reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
+out:
+ btrfs_unfreeze_block_group(block_group);
+ return ret;
}
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
- u64 *trimmed, u64 start, u64 end, u64 minlen)
+int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ bool async)
{
int ret;
@@ -3414,16 +3801,36 @@
spin_unlock(&block_group->lock);
return 0;
}
- btrfs_get_block_group_trimming(block_group);
+ btrfs_freeze_block_group(block_group);
spin_unlock(&block_group->lock);
- ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
- if (ret)
- goto out;
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
+ btrfs_unfreeze_block_group(block_group);
- ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
-out:
- btrfs_put_block_group_trimming(block_group);
+ return ret;
+}
+
+int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ u64 maxlen, bool async)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ btrfs_freeze_block_group(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
+ async);
+
+ btrfs_unfreeze_block_group(block_group);
+
return ret;
}
@@ -3583,11 +3990,9 @@
if (release_metadata)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
inode->i_size, true);
-#ifdef DEBUG
- btrfs_err(fs_info,
- "failed to write free ino cache for root %llu",
- root->root_key.objectid);
-#endif
+ btrfs_debug(fs_info,
+ "failed to write free ino cache for root %llu error %d",
+ root->root_key.objectid, ret);
}
return ret;
@@ -3600,12 +4005,13 @@
* how the free space cache loading stuff works, so you can get really weird
* configurations.
*/
-int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+int test_add_free_space_entry(struct btrfs_block_group *cache,
u64 offset, u64 bytes, bool bitmap)
{
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
struct btrfs_free_space *info = NULL, *bitmap_info;
void *map = NULL;
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
u64 bytes_added;
int ret;
@@ -3647,7 +4053,8 @@
info = NULL;
}
- bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
+ trim_state);
bytes -= bytes_added;
offset += bytes_added;
@@ -3668,7 +4075,7 @@
* just used to check the absence of space, so if there is free space in the
* range at all we will return 1.
*/
-int test_check_exists(struct btrfs_block_group_cache *cache,
+int test_check_exists(struct btrfs_block_group *cache,
u64 offset, u64 bytes)
{
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 39c32c8..e3d5e0a 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -6,6 +6,20 @@
#ifndef BTRFS_FREE_SPACE_CACHE_H
#define BTRFS_FREE_SPACE_CACHE_H
+/*
+ * This is the trim state of an extent or bitmap.
+ *
+ * BTRFS_TRIM_STATE_TRIMMING is special and used to maintain the state of a
+ * bitmap as we may need several trims to fully trim a single bitmap entry.
+ * This is reset should any free space other than trimmed space be added to the
+ * bitmap.
+ */
+enum btrfs_trim_state {
+ BTRFS_TRIM_STATE_UNTRIMMED,
+ BTRFS_TRIM_STATE_TRIMMED,
+ BTRFS_TRIM_STATE_TRIMMING,
+};
+
struct btrfs_free_space {
struct rb_node offset_index;
u64 offset;
@@ -13,8 +27,21 @@
u64 max_extent_size;
unsigned long *bitmap;
struct list_head list;
+ enum btrfs_trim_state trim_state;
+ s32 bitmap_extents;
};
+static inline bool btrfs_free_space_trimmed(struct btrfs_free_space *info)
+{
+ return (info->trim_state == BTRFS_TRIM_STATE_TRIMMED);
+}
+
+static inline bool btrfs_free_space_trimming_bitmap(
+ struct btrfs_free_space *info)
+{
+ return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING);
+}
+
struct btrfs_free_space_ctl {
spinlock_t tree_lock;
struct rb_root free_space_offset;
@@ -24,6 +51,8 @@
int total_bitmaps;
int unit;
u64 start;
+ s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
+ s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
const struct btrfs_free_space_op *op;
void *private;
struct mutex cache_writeout_mutex;
@@ -50,24 +79,23 @@
unsigned check_crcs:1;
};
-struct inode *lookup_free_space_inode(
- struct btrfs_block_group_cache *block_group,
+struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
struct btrfs_path *path);
int create_free_space_inode(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct inode *inode);
-int load_free_space_cache(struct btrfs_block_group_cache *block_group);
+int load_free_space_cache(struct btrfs_block_group *block_group);
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
struct inode *lookup_free_ino_inode(struct btrfs_root *root,
struct btrfs_path *path);
@@ -81,42 +109,50 @@
struct btrfs_path *path,
struct inode *inode);
-void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group);
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
- u64 bytenr, u64 size);
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size,
+ enum btrfs_trim_state trim_state);
+int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
+int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
- *block_group);
-u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
+bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size);
u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes);
-int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group,
+int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size);
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
-u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size);
-int btrfs_return_cluster_to_free_space(
- struct btrfs_block_group_cache *block_group,
+void btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster);
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
+int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ bool async);
+int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ u64 maxlen, bool async);
/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+int test_add_free_space_entry(struct btrfs_block_group *cache,
u64 offset, u64 bytes, bool bitmap);
-int test_check_exists(struct btrfs_block_group_cache *cache,
- u64 offset, u64 bytes);
+int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes);
#endif
#endif
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index dfabbbf..6cf2f7b 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -13,22 +13,25 @@
#include "block-group.h"
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
-void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
+void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
size_t bitmap_size;
u64 num_bitmaps, total_bitmap_size;
+ if (WARN_ON(cache->length == 0))
+ btrfs_warn(cache->fs_info, "block group %llu length is zero",
+ cache->start);
+
/*
* We convert to bitmaps when the disk space required for using extents
* exceeds that required for using bitmaps.
*/
bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
- num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1,
- bitmap_range);
+ num_bitmaps = div_u64(cache->length + bitmap_range - 1, bitmap_range);
bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
total_bitmap_size = num_bitmaps * bitmap_size;
cache->bitmap_high_thresh = div_u64(total_bitmap_size,
@@ -45,7 +48,7 @@
}
static int add_new_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_root *root = trans->fs_info->free_space_root;
@@ -54,9 +57,9 @@
struct extent_buffer *leaf;
int ret;
- key.objectid = block_group->key.objectid;
+ key.objectid = block_group->start;
key.type = BTRFS_FREE_SPACE_INFO_KEY;
- key.offset = block_group->key.offset;
+ key.offset = block_group->length;
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
if (ret)
@@ -78,7 +81,7 @@
EXPORT_FOR_TESTS
struct btrfs_free_space_info *search_free_space_info(
struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -86,16 +89,16 @@
struct btrfs_key key;
int ret;
- key.objectid = block_group->key.objectid;
+ key.objectid = block_group->start;
key.type = BTRFS_FREE_SPACE_INFO_KEY;
- key.offset = block_group->key.offset;
+ key.offset = block_group->length;
ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
if (ret < 0)
return ERR_PTR(ret);
if (ret != 0) {
btrfs_warn(fs_info, "missing free space info for %llu",
- block_group->key.objectid);
+ block_group->start);
ASSERT(0);
return ERR_PTR(-ENOENT);
}
@@ -180,7 +183,7 @@
EXPORT_FOR_TESTS
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -197,7 +200,7 @@
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ bitmap_size = free_space_bitmap_size(block_group->length,
fs_info->sectorsize);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
@@ -205,8 +208,8 @@
goto out;
}
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
@@ -224,8 +227,8 @@
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
- ASSERT(found_key.objectid == block_group->key.objectid);
- ASSERT(found_key.offset == block_group->key.offset);
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
done = 1;
break;
} else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
@@ -271,7 +274,7 @@
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -320,7 +323,7 @@
EXPORT_FOR_TESTS
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -336,7 +339,7 @@
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ bitmap_size = free_space_bitmap_size(block_group->length,
fs_info->sectorsize);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
@@ -344,8 +347,8 @@
goto out;
}
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
@@ -363,8 +366,8 @@
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
- ASSERT(found_key.objectid == block_group->key.objectid);
- ASSERT(found_key.offset == block_group->key.offset);
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
done = 1;
break;
} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
@@ -413,7 +416,7 @@
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize);
+ nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize);
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
@@ -437,7 +440,7 @@
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -453,7 +456,7 @@
}
static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
int new_extents)
{
@@ -491,7 +494,7 @@
}
EXPORT_FOR_TESTS
-int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+int free_space_test_bit(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 offset)
{
struct extent_buffer *leaf;
@@ -513,7 +516,7 @@
return !!extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
+static void free_space_set_bits(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 *start, u64 *size,
int bit)
{
@@ -581,7 +584,7 @@
* the bitmap.
*/
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size, int remove)
{
@@ -597,7 +600,7 @@
* Read the bit for the block immediately before the extent of space if
* that block is within the block group.
*/
- if (start > block_group->key.objectid) {
+ if (start > block_group->start) {
u64 prev_block = start - block_group->fs_info->sectorsize;
key.objectid = prev_block;
@@ -649,7 +652,7 @@
* Read the bit for the block immediately after the extent of space if
* that block is within the block group.
*/
- if (end < block_group->key.objectid + block_group->key.offset) {
+ if (end < block_group->start + block_group->length) {
/* The next block may be in the next bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (end >= key.objectid + key.offset) {
@@ -694,7 +697,7 @@
}
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
@@ -781,7 +784,7 @@
EXPORT_FOR_TESTS
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
struct btrfs_free_space_info *info;
@@ -812,7 +815,7 @@
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_path *path;
int ret;
@@ -846,7 +849,7 @@
}
static int add_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
@@ -880,7 +883,7 @@
new_key.offset = size;
/* Search for a neighbor on the left. */
- if (start == block_group->key.objectid)
+ if (start == block_group->start)
goto right;
key.objectid = start - 1;
key.type = (u8)-1;
@@ -900,8 +903,8 @@
found_start = key.objectid;
found_end = key.objectid + key.offset;
- ASSERT(found_start >= block_group->key.objectid &&
- found_end > block_group->key.objectid);
+ ASSERT(found_start >= block_group->start &&
+ found_end > block_group->start);
ASSERT(found_start < start && found_end <= start);
/*
@@ -920,7 +923,7 @@
right:
/* Search for a neighbor on the right. */
- if (end == block_group->key.objectid + block_group->key.offset)
+ if (end == block_group->start + block_group->length)
goto insert;
key.objectid = end;
key.type = (u8)-1;
@@ -940,8 +943,8 @@
found_start = key.objectid;
found_end = key.objectid + key.offset;
- ASSERT(found_start >= block_group->key.objectid &&
- found_end > block_group->key.objectid);
+ ASSERT(found_start >= block_group->start &&
+ found_end > block_group->start);
ASSERT((found_start < start && found_end <= start) ||
(found_start >= end && found_end > end));
@@ -974,7 +977,7 @@
EXPORT_FOR_TESTS
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
struct btrfs_free_space_info *info;
@@ -1005,7 +1008,7 @@
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_path *path;
int ret;
@@ -1043,7 +1046,7 @@
* through the normal add/remove hooks.
*/
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *extent_root = trans->fs_info->extent_root;
struct btrfs_path *path, *path2;
@@ -1075,7 +1078,7 @@
* BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
* contained in.
*/
- key.objectid = block_group->key.objectid;
+ key.objectid = block_group->start;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
@@ -1084,8 +1087,8 @@
goto out_locked;
ASSERT(ret == 0);
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
while (1) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -1109,7 +1112,7 @@
else
start += key.offset;
} else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- if (key.objectid != block_group->key.objectid)
+ if (key.objectid != block_group->start)
break;
}
@@ -1140,7 +1143,7 @@
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *free_space_root;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct rb_node *node;
int ret;
@@ -1160,7 +1163,7 @@
node = rb_first(&fs_info->block_group_cache_tree);
while (node) {
- block_group = rb_entry(node, struct btrfs_block_group_cache,
+ block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
if (ret)
@@ -1260,9 +1263,7 @@
btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
0, 1);
- free_extent_buffer(free_space_root->node);
- free_extent_buffer(free_space_root->commit_root);
- kfree(free_space_root);
+ btrfs_put_root(free_space_root);
return btrfs_commit_transaction(trans);
@@ -1273,7 +1274,7 @@
}
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
int ret;
@@ -1285,12 +1286,12 @@
return ret;
return __add_to_free_space_tree(trans, block_group, path,
- block_group->key.objectid,
- block_group->key.offset);
+ block_group->start,
+ block_group->length);
}
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path = NULL;
@@ -1320,7 +1321,7 @@
}
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_path *path;
@@ -1344,8 +1345,8 @@
goto out;
}
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
@@ -1363,8 +1364,8 @@
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
- ASSERT(found_key.objectid == block_group->key.objectid);
- ASSERT(found_key.offset == block_group->key.offset);
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
done = 1;
nr++;
path->slots[0]--;
@@ -1399,7 +1400,7 @@
struct btrfs_path *path,
u32 expected_extent_count)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
@@ -1415,7 +1416,7 @@
fs_info = block_group->fs_info;
root = fs_info->free_space_root;
- end = block_group->key.objectid + block_group->key.offset;
+ end = block_group->start + block_group->length;
while (1) {
ret = btrfs_next_item(root, path);
@@ -1462,7 +1463,7 @@
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -1480,7 +1481,7 @@
struct btrfs_path *path,
u32 expected_extent_count)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
@@ -1493,7 +1494,7 @@
fs_info = block_group->fs_info;
root = fs_info->free_space_root;
- end = block_group->key.objectid + block_group->key.offset;
+ end = block_group->start + block_group->length;
while (1) {
ret = btrfs_next_item(root, path);
@@ -1524,7 +1525,7 @@
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -1540,7 +1541,7 @@
int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
struct btrfs_path *path;
u32 extent_count, flags;
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 360d50e..dc2463e 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -16,14 +16,14 @@
#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
-void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group);
+void set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group);
+ struct btrfs_block_group *block_group);
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group);
+ struct btrfs_block_group *block_group);
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size);
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
@@ -32,21 +32,21 @@
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_free_space_info *
search_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow);
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size);
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size);
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
-int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+int free_space_test_bit(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 offset);
#endif
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 37345fb..76d2e43 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -107,7 +107,7 @@
if (last != (u64)-1 && last + 1 != key.objectid) {
__btrfs_add_free_space(fs_info, ctl, last + 1,
- key.objectid - last - 1);
+ key.objectid - last - 1, 0);
wake_up(&root->ino_cache_wait);
}
@@ -118,7 +118,7 @@
if (last < root->highest_objectid - 1) {
__btrfs_add_free_space(fs_info, ctl, last + 1,
- root->highest_objectid - last - 1);
+ root->highest_objectid - last - 1, 0);
}
spin_lock(&root->ino_cache_lock);
@@ -175,7 +175,8 @@
ret = btrfs_find_free_objectid(root, &objectid);
if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
__btrfs_add_free_space(fs_info, ctl, objectid,
- BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+ BTRFS_LAST_FREE_OBJECTID - objectid + 1,
+ 0);
wake_up(&root->ino_cache_wait);
}
@@ -221,7 +222,7 @@
return;
again:
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
- __btrfs_add_free_space(fs_info, pinned, objectid, 1);
+ __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
} else {
down_write(&fs_info->commit_root_sem);
spin_lock(&root->ino_cache_lock);
@@ -234,7 +235,7 @@
start_caching(root);
- __btrfs_add_free_space(fs_info, pinned, objectid, 1);
+ __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
up_write(&fs_info->commit_root_sem);
}
@@ -281,7 +282,7 @@
spin_unlock(rbroot_lock);
if (count)
__btrfs_add_free_space(root->fs_info, ctl,
- info->offset, count);
+ info->offset, count, 0);
kmem_cache_free(btrfs_free_space_cachep, info);
}
}
@@ -494,7 +495,8 @@
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_SIZE;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0,
+ prealloc);
if (ret)
goto out_put;
@@ -514,7 +516,7 @@
trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
+ trans->bytes_reserved, NULL);
out:
trans->block_rsv = rsv;
trans->bytes_reserved = num_bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b859ed5..1d9262a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3,9 +3,9 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
+#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -28,7 +28,9 @@
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
+#include <linux/migrate.h>
#include <linux/sched/mm.h>
+#include <linux/iomap.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
@@ -44,7 +46,6 @@
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
-#include "backref.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
@@ -52,25 +53,24 @@
#include "space-info.h"
struct btrfs_iget_args {
- struct btrfs_key *location;
+ u64 ino;
struct btrfs_root *root;
};
struct btrfs_dio_data {
u64 reserve;
- u64 unsubmitted_oe_range_start;
- u64 unsubmitted_oe_range_end;
- int overwrite;
+ loff_t length;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ bool sync;
};
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
-static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
-static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
@@ -81,17 +81,17 @@
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
-static noinline int cow_file_range(struct inode *inode,
+static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
- u64 orig_start, u64 block_start,
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type);
-static void __endio_write_update_ordered(struct inode *inode,
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate);
@@ -105,7 +105,7 @@
* to be released, which we want to happen only when finishing the ordered
* extent (btrfs_finish_ordered_io()).
*/
-static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *locked_page,
u64 offset, u64 bytes)
{
@@ -117,7 +117,7 @@
struct page *page;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
index++;
if (!page)
continue;
@@ -140,13 +140,6 @@
static int btrfs_dirty_inode(struct inode *inode);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode)
-{
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-}
-#endif
-
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
@@ -245,6 +238,15 @@
btrfs_release_path(path);
/*
+ * We align size to sectorsize for inline extents just for simplicity
+ * sake.
+ */
+ size = ALIGN(size, root->fs_info->sectorsize);
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
+ if (ret)
+ goto fail;
+
+ /*
* we're an inline extent, so nobody can
* extend the file past i_size without locking
* a page we already have locked.
@@ -266,15 +268,15 @@
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct inode *inode, u64 start,
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
u64 end, size_t compressed_size,
int compress_type,
struct page **compressed_pages)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
u64 actual_end = min(end + 1, isize);
u64 inline_len = actual_end - start;
u64 aligned_end = ALIGN(end, fs_info->sectorsize);
@@ -306,7 +308,7 @@
btrfs_free_path(path);
return PTR_ERR(trans);
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+ trans->block_rsv = &inode->block_rsv;
if (compressed_size && compressed_pages)
extent_item_size = btrfs_file_extent_calc_inline_size(
@@ -315,9 +317,9 @@
extent_item_size = btrfs_file_extent_calc_inline_size(
inline_len);
- ret = __btrfs_drop_extents(trans, root, inode, path,
- start, aligned_end, NULL,
- 1, 1, extent_item_size, &extent_inserted);
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
+ NULL, 1, 1, extent_item_size,
+ &extent_inserted);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -326,7 +328,7 @@
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
ret = insert_inline_extent(trans, path, extent_inserted,
- root, inode, start,
+ root, &inode->vfs_inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
if (ret && ret != -ENOSPC) {
@@ -337,8 +339,8 @@
goto out;
}
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
- btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -369,6 +371,7 @@
u64 end;
unsigned int write_flags;
struct list_head extents;
+ struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
atomic_t *pending;
};
@@ -403,10 +406,10 @@
/*
* Check if the inode has flags compatible with compression
*/
-static inline bool inode_can_compress(struct inode *inode)
+static inline bool inode_can_compress(struct btrfs_inode *inode)
{
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
- BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
return false;
return true;
}
@@ -415,29 +418,30 @@
* Check if the inode needs to be submitted to compression, based on mount
* options, defragmentation, properties or heuristics.
*/
-static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
+static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
+ u64 end)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (!inode_can_compress(inode)) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
- btrfs_ino(BTRFS_I(inode)));
+ btrfs_ino(inode));
return 0;
}
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
/* defrag ioctl */
- if (BTRFS_I(inode)->defrag_compress)
+ if (inode->defrag_compress)
return 1;
/* bad compression ratios */
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
if (btrfs_test_opt(fs_info, COMPRESS) ||
- BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
- BTRFS_I(inode)->prop_compress)
- return btrfs_compress_heuristic(inode, start, end);
+ inode->flags & BTRFS_INODE_COMPRESS ||
+ inode->prop_compress)
+ return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
return 0;
}
@@ -543,7 +547,7 @@
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (inode_need_compress(inode, start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -607,11 +611,12 @@
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ 0, BTRFS_COMPRESS_NONE,
+ NULL);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(inode, start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
total_compressed,
compress_type, pages);
}
@@ -633,7 +638,8 @@
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, NULL,
+ extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
+ NULL,
clear_flags,
PAGE_UNLOCK |
PAGE_CLEAR_DIRTY |
@@ -759,14 +765,14 @@
*/
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
- struct inode *inode = async_chunk->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_extent *async_extent;
u64 alloc_hint = 0;
struct btrfs_key ins;
struct extent_map *em;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct extent_io_tree *io_tree = &inode->io_tree;
int ret = 0;
again:
@@ -799,7 +805,7 @@
* all those pages down to the drive.
*/
if (!page_started && !ret)
- extent_write_locked_range(inode,
+ extent_write_locked_range(&inode->vfs_inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
@@ -829,7 +835,7 @@
* will not submit these pages down to lower
* layers.
*/
- extent_range_redirty_for_io(inode,
+ extent_range_redirty_for_io(&inode->vfs_inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1);
@@ -864,8 +870,7 @@
BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
if (ret) {
- btrfs_drop_extent_cache(BTRFS_I(inode),
- async_extent->start,
+ btrfs_drop_extent_cache(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1, 0);
goto out_free_reserve;
@@ -881,23 +886,22 @@
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK);
- if (btrfs_submit_compressed_write(inode,
- async_extent->start,
+ if (btrfs_submit_compressed_write(inode, async_extent->start,
async_extent->ram_size,
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages,
- async_chunk->write_flags)) {
+ async_chunk->write_flags,
+ async_chunk->blkcg_css)) {
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
- p->mapping = inode->i_mapping;
+ p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(p, start, end, 0);
p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end,
- NULL, 0,
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
PAGE_END_WRITEBACK |
PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
@@ -925,10 +929,10 @@
goto again;
}
-static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
u64 num_bytes)
{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 alloc_hint = 0;
@@ -970,13 +974,13 @@
* required to start IO on it. It may be clean and already done with
* IO when we return.
*/
-static noinline int cow_file_range(struct inode *inode,
+static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
@@ -990,7 +994,7 @@
bool extent_reserved = false;
int ret = 0;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(inode)) {
WARN_ON_ONCE(1);
ret = -EINVAL;
goto out_unlock;
@@ -1000,7 +1004,7 @@
num_bytes = max(blocksize, num_bytes);
ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
- inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
+ inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
if (start == 0) {
/* lets try to make an inline extent */
@@ -1029,8 +1033,7 @@
}
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
- start + num_bytes - 1, 0);
+ btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1094,7 +1097,7 @@
* skip current ordered extent.
*/
if (ret)
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
+ btrfs_drop_extent_cache(inode, start,
start + ram_size - 1, 0);
}
@@ -1110,8 +1113,7 @@
page_ops = unlock ? PAGE_UNLOCK : 0;
page_ops |= PAGE_SET_PRIVATE2;
- extent_clear_unlock_delalloc(inode, start,
- start + ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
@@ -1135,7 +1137,7 @@
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
+ btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@@ -1222,6 +1224,8 @@
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
+ if (async_chunk->blkcg_css)
+ css_put(async_chunk->blkcg_css);
/*
* Since the pointer to 'pending' is at the beginning of the array of
* async_chunk's, freeing it ensures the whole array has been freed.
@@ -1230,12 +1234,14 @@
kvfree(async_chunk->pending);
}
-static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+static int cow_file_range_async(struct btrfs_inode *inode,
+ struct writeback_control *wbc,
+ struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- unsigned int write_flags)
+ unsigned long *nr_written)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
struct async_cow *ctx;
struct async_chunk *async_chunk;
unsigned long nr_pages;
@@ -1244,10 +1250,11 @@
int i;
bool should_compress;
unsigned nofs_flag;
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
!btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
num_chunks = 1;
should_compress = false;
@@ -1285,9 +1292,9 @@
* igrab is called higher up in the call chain, take only the
* lightweight reference for the callback lifetime
*/
- ihold(inode);
+ ihold(&inode->vfs_inode);
async_chunk[i].pending = &ctx->num_chunks;
- async_chunk[i].inode = inode;
+ async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
@@ -1303,12 +1310,30 @@
* to unlock it.
*/
if (locked_page) {
+ /*
+ * Depending on the compressibility, the pages might or
+ * might not go through async. We want all of them to
+ * be accounted against wbc once. Let's do it here
+ * before the paths diverge. wbc accounting is used
+ * only for foreign writeback detection and doesn't
+ * need full accuracy. Just account the whole thing
+ * against the first page.
+ */
+ wbc_account_cgroup_owner(wbc, locked_page,
+ cur_end - start);
async_chunk[i].locked_page = locked_page;
locked_page = NULL;
} else {
async_chunk[i].locked_page = NULL;
}
+ if (blkcg_css != blkcg_root_css) {
+ css_get(blkcg_css);
+ async_chunk[i].blkcg_css = blkcg_css;
+ } else {
+ async_chunk[i].blkcg_css = NULL;
+ }
+
btrfs_init_work(&async_chunk[i].work, async_cow_start,
async_cow_submit, async_cow_free);
@@ -1346,15 +1371,15 @@
return 1;
}
-static int fallback_to_cow(struct inode *inode, struct page *locked_page,
+static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const u64 start, const u64 end,
int *page_started, unsigned long *nr_written)
{
- const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
- const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid ==
+ const bool is_space_ino = btrfs_is_free_space_inode(inode);
+ const bool is_reloc_ino = (inode->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID);
const u64 range_bytes = end + 1 - start;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
u64 count;
@@ -1394,7 +1419,7 @@
EXTENT_NORESERVE, 0);
if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count;
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_space_info *sinfo = fs_info->data_sinfo;
if (is_space_ino || is_reloc_ino)
@@ -1420,21 +1445,21 @@
* If no cow copies or snapshots exist, we write directly to the existing
* blocks on disk
*/
-static noinline int run_delalloc_nocow(struct inode *inode,
+static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
const u64 start, const u64 end,
int *page_started, int force,
unsigned long *nr_written)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
u64 cur_offset = start;
int ret;
bool check_prev = true;
- const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ const bool freespace_inode = btrfs_is_free_space_inode(inode);
+ u64 ino = btrfs_ino(inode);
bool nocow = false;
u64 disk_bytenr = 0;
@@ -1660,8 +1685,8 @@
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
- ret = fallback_to_cow(inode, locked_page, cow_start,
- found_key.offset - 1,
+ ret = fallback_to_cow(inode, locked_page,
+ cow_start, found_key.offset - 1,
page_started, nr_written);
if (ret)
goto error;
@@ -1689,8 +1714,7 @@
num_bytes,
BTRFS_ORDERED_PREALLOC);
if (ret) {
- btrfs_drop_extent_cache(BTRFS_I(inode),
- cur_offset,
+ btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + num_bytes - 1,
0);
goto error;
@@ -1766,11 +1790,11 @@
return ret;
}
-static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end)
{
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
+ !(inode->flags & BTRFS_INODE_PREALLOC))
return 0;
/*
@@ -1778,9 +1802,8 @@
* if is not zero, it means the file is defragging.
* Force cow if given extent needs to be defragged.
*/
- if (BTRFS_I(inode)->defrag_bytes &&
- test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
- EXTENT_DEFRAG, 0, NULL))
+ if (inode->defrag_bytes &&
+ test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL))
return 1;
return 0;
@@ -1790,30 +1813,27 @@
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
int ret;
int force_cow = need_force_cow(inode, start, end);
- unsigned int write_flags = wbc_to_write_flags(wbc);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+ if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
- } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ page_started, nr_written, 1);
} else {
- set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags);
- ret = cow_file_range_async(inode, locked_page, start, end,
- page_started, nr_written,
- write_flags);
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
+ ret = cow_file_range_async(inode, wbc, locked_page, start, end,
+ page_started, nr_written);
}
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
@@ -2060,9 +2080,7 @@
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
do_list && !(state->state & EXTENT_NORESERVE) &&
(*bits & EXTENT_CLEAR_DATA_RESV))
- btrfs_free_reserved_data_space_noquota(
- &inode->vfs_inode,
- state->start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
fs_info->delalloc_batch);
@@ -2136,11 +2154,8 @@
u64 bio_offset)
{
struct inode *inode = private_data;
- blk_status_t ret = 0;
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
/*
@@ -2161,9 +2176,8 @@
*
* c-3) otherwise: async submit
*/
-static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2189,7 +2203,7 @@
bio_flags);
goto out;
} else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+ ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
if (ret)
goto out;
}
@@ -2203,13 +2217,13 @@
0, inode, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
if (ret)
goto out;
}
mapit:
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
out:
if (ret) {
@@ -2223,16 +2237,15 @@
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
*/
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
- struct inode *inode, struct list_head *list)
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct list_head *list)
{
struct btrfs_ordered_sum *sum;
int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- ret = btrfs_csum_file_blocks(trans,
- BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
@@ -2240,13 +2253,71 @@
return 0;
}
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ struct extent_state **cached_state)
+{
+ u64 search_start = start;
+ const u64 end = start + len - 1;
+
+ while (search_start < end) {
+ const u64 search_len = end - search_start + 1;
+ struct extent_map *em;
+ u64 em_len;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start != EXTENT_MAP_HOLE)
+ goto next;
+
+ em_len = em->len;
+ if (em->start < search_start)
+ em_len -= search_start - em->start;
+ if (em_len > search_len)
+ em_len = search_len;
+
+ ret = set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW,
+ NULL, cached_state, GFP_NOFS);
+next:
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state)
{
WARN_ON(PAGE_ALIGNED(end));
- return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
- extra_bits, cached_state);
+
+ if (start >= i_size_read(&inode->vfs_inode) &&
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
+ /*
+ * There can't be any extents following eof in this case so just
+ * set the delalloc new bit for the range directly.
+ */
+ extra_bits |= EXTENT_DELALLOC_NEW;
+ } else {
+ int ret;
+
+ ret = btrfs_find_new_delalloc_bytes(inode, start,
+ end + 1 - start,
+ cached_state);
+ if (ret)
+ return ret;
+ }
+
+ return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
+ cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -2263,7 +2334,7 @@
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
struct page *page;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 page_start;
u64 page_end;
int ret = 0;
@@ -2271,7 +2342,7 @@
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
- inode = fixup->inode;
+ inode = BTRFS_I(fixup->inode);
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_SIZE - 1;
@@ -2308,8 +2379,7 @@
* when the page was already properly dealt with.
*/
if (!ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved,
page_start, PAGE_SIZE,
true);
@@ -2325,20 +2395,18 @@
if (ret)
goto out_page;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
- &cached_state);
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (PagePrivate2(page))
goto out_reserved;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
- PAGE_SIZE);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
- page_end, &cached_state);
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
+ &cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -2358,11 +2426,11 @@
BUG_ON(!PageDirty(page));
free_delalloc_space = false;
out_reserved:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
out_page:
if (ret) {
@@ -2385,7 +2453,7 @@
* that could need flushing space. Recursing back to fixup worker would
* deadlock.
*/
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(&inode->vfs_inode);
}
/*
@@ -2441,18 +2509,18 @@
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct inode *inode, u64 file_pos,
- u64 disk_bytenr, u64 disk_num_bytes,
- u64 num_bytes, u64 ram_bytes,
- u8 compression, u8 encryption,
- u16 other_encoding, int extent_type)
+ struct btrfs_inode *inode, u64 file_pos,
+ struct btrfs_file_extent_item *stack_fi,
+ u64 qgroup_reserved)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_file_extent_item *fi;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
- u64 qg_released;
+ u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
+ u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
+ u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
+ u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
int extent_inserted = 0;
int ret;
@@ -2471,708 +2539,52 @@
*/
ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
file_pos + num_bytes, NULL, 0,
- 1, sizeof(*fi), &extent_inserted);
+ 1, sizeof(*stack_fi), &extent_inserted);
if (ret)
goto out;
if (!extent_inserted) {
- ins.objectid = btrfs_ino(BTRFS_I(inode));
+ ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
- sizeof(*fi));
+ sizeof(*stack_fi));
if (ret)
goto out;
}
leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_set_file_extent_type(leaf, fi, extent_type);
- btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, fi, 0);
- btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
- btrfs_set_file_extent_compression(leaf, fi, compression);
- btrfs_set_file_extent_encryption(leaf, fi, encryption);
- btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+ btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
+ write_extent_buffer(leaf, stack_fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(struct btrfs_file_extent_item));
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- inode_add_bytes(inode, num_bytes);
+ inode_add_bytes(&inode->vfs_inode, num_bytes);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
- /*
- * Release the reserved range from inode dirty range map, as it is
- * already moved into delayed_ref_head
- */
- ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
- if (ret < 0)
- goto out;
- qg_released = ret;
- ret = btrfs_alloc_reserved_file_extent(trans, root,
- btrfs_ino(BTRFS_I(inode)),
- file_pos, qg_released, &ins);
-out:
- btrfs_free_path(path);
-
- return ret;
-}
-
-/* snapshot-aware defrag */
-struct sa_defrag_extent_backref {
- struct rb_node node;
- struct old_sa_defrag_extent *old;
- u64 root_id;
- u64 inum;
- u64 file_pos;
- u64 extent_offset;
- u64 num_bytes;
- u64 generation;
-};
-
-struct old_sa_defrag_extent {
- struct list_head list;
- struct new_sa_defrag_extent *new;
-
- u64 extent_offset;
- u64 bytenr;
- u64 offset;
- u64 len;
- int count;
-};
-
-struct new_sa_defrag_extent {
- struct rb_root root;
- struct list_head head;
- struct btrfs_path *path;
- struct inode *inode;
- u64 file_pos;
- u64 len;
- u64 bytenr;
- u64 disk_len;
- u8 compress_type;
-};
-
-static int backref_comp(struct sa_defrag_extent_backref *b1,
- struct sa_defrag_extent_backref *b2)
-{
- if (b1->root_id < b2->root_id)
- return -1;
- else if (b1->root_id > b2->root_id)
- return 1;
-
- if (b1->inum < b2->inum)
- return -1;
- else if (b1->inum > b2->inum)
- return 1;
-
- if (b1->file_pos < b2->file_pos)
- return -1;
- else if (b1->file_pos > b2->file_pos)
- return 1;
-
- /*
- * [------------------------------] ===> (a range of space)
- * |<--->| |<---->| =============> (fs/file tree A)
- * |<---------------------------->| ===> (fs/file tree B)
- *
- * A range of space can refer to two file extents in one tree while
- * refer to only one file extent in another tree.
- *
- * So we may process a disk offset more than one time(two extents in A)
- * and locate at the same extent(one extent in B), then insert two same
- * backrefs(both refer to the extent in B).
- */
- return 0;
-}
-
-static void backref_insert(struct rb_root *root,
- struct sa_defrag_extent_backref *backref)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct sa_defrag_extent_backref *entry;
- int ret;
-
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
-
- ret = backref_comp(backref, entry);
- if (ret < 0)
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
-
- rb_link_node(&backref->node, parent, p);
- rb_insert_color(&backref->node, root);
-}
-
-/*
- * Note the backref might has changed, and in this case we just return 0.
- */
-static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
- void *ctx)
-{
- struct btrfs_file_extent_item *extent;
- struct old_sa_defrag_extent *old = ctx;
- struct new_sa_defrag_extent *new = old->new;
- struct btrfs_path *path = new->path;
- struct btrfs_key key;
- struct btrfs_root *root;
- struct sa_defrag_extent_backref *backref;
- struct extent_buffer *leaf;
- struct inode *inode = new->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int slot;
- int ret;
- u64 extent_offset;
- u64 num_bytes;
-
- if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
- inum == btrfs_ino(BTRFS_I(inode)))
- return 0;
-
- key.objectid = root_id;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(root)) {
- if (PTR_ERR(root) == -ENOENT)
- return 0;
- WARN_ON(1);
- btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
- inum, offset, root_id);
- return PTR_ERR(root);
- }
-
- key.objectid = inum;
- key.type = BTRFS_EXTENT_DATA_KEY;
- if (offset > (u64)-1 << 32)
- key.offset = 0;
- else
- key.offset = offset;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (WARN_ON(ret < 0))
- return ret;
- ret = 0;
-
- while (1) {
- cond_resched();
-
- leaf = path->nodes[0];
- slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- goto out;
- }
- continue;
- }
-
- path->slots[0]++;
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
-
- if (key.objectid > inum)
- goto out;
-
- if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
-
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
- continue;
-
- /*
- * 'offset' refers to the exact key.offset,
- * NOT the 'offset' field in btrfs_extent_data_ref, ie.
- * (key.offset - extent_offset).
- */
- if (key.offset != offset)
- continue;
-
- extent_offset = btrfs_file_extent_offset(leaf, extent);
- num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
-
- if (extent_offset >= old->extent_offset + old->offset +
- old->len || extent_offset + num_bytes <=
- old->extent_offset + old->offset)
- continue;
- break;
- }
-
- backref = kmalloc(sizeof(*backref), GFP_NOFS);
- if (!backref) {
- ret = -ENOENT;
- goto out;
- }
-
- backref->root_id = root_id;
- backref->inum = inum;
- backref->file_pos = offset;
- backref->num_bytes = num_bytes;
- backref->extent_offset = extent_offset;
- backref->generation = btrfs_file_extent_generation(leaf, extent);
- backref->old = old;
- backref_insert(&new->root, backref);
- old->count++;
-out:
- btrfs_release_path(path);
- WARN_ON(ret);
- return ret;
-}
-
-static noinline bool record_extent_backrefs(struct btrfs_path *path,
- struct new_sa_defrag_extent *new)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
- struct old_sa_defrag_extent *old, *tmp;
- int ret;
-
- new->path = path;
-
- list_for_each_entry_safe(old, tmp, &new->head, list) {
- ret = iterate_inodes_from_logical(old->bytenr +
- old->extent_offset, fs_info,
- path, record_one_backref,
- old, false);
- if (ret < 0 && ret != -ENOENT)
- return false;
-
- /* no backref to be processed for this extent */
- if (!old->count) {
- list_del(&old->list);
- kfree(old);
- }
- }
-
- if (list_empty(&new->head))
- return false;
-
- return true;
-}
-
-static int relink_is_mergable(struct extent_buffer *leaf,
- struct btrfs_file_extent_item *fi,
- struct new_sa_defrag_extent *new)
-{
- if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
- return 0;
-
- if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
- return 0;
-
- if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
- return 0;
-
- if (btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- return 0;
-
- return 1;
-}
-
-/*
- * Note the backref might has changed, and in this case we just return 0.
- */
-static noinline int relink_extent_backref(struct btrfs_path *path,
- struct sa_defrag_extent_backref *prev,
- struct sa_defrag_extent_backref *backref)
-{
- struct btrfs_file_extent_item *extent;
- struct btrfs_file_extent_item *item;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_trans_handle *trans;
- struct btrfs_ref ref = { 0 };
- struct btrfs_root *root;
- struct btrfs_key key;
- struct extent_buffer *leaf;
- struct old_sa_defrag_extent *old = backref->old;
- struct new_sa_defrag_extent *new = old->new;
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
- struct inode *inode;
- struct extent_state *cached = NULL;
- int ret = 0;
- u64 start;
- u64 len;
- u64 lock_start;
- u64 lock_end;
- bool merge = false;
- int index;
-
- if (prev && prev->root_id == backref->root_id &&
- prev->inum == backref->inum &&
- prev->file_pos + prev->num_bytes == backref->file_pos)
- merge = true;
-
- /* step 1: get root */
- key.objectid = backref->root_id;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- if (PTR_ERR(root) == -ENOENT)
- return 0;
- return PTR_ERR(root);
- }
-
- if (btrfs_root_readonly(root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- return 0;
- }
-
- /* step 2: get inode */
- key.objectid = backref->inum;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
-
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
- if (IS_ERR(inode)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- return 0;
- }
-
- srcu_read_unlock(&fs_info->subvol_srcu, index);
-
- /* step 3: relink backref */
- lock_start = backref->file_pos;
- lock_end = backref->file_pos + backref->num_bytes - 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- &cached);
-
- ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- goto out_unlock;
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_unlock;
- }
-
- key.objectid = backref->inum;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = backref->file_pos;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- goto out_free_path;
- } else if (ret > 0) {
- ret = 0;
- goto out_free_path;
- }
-
- extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_generation(path->nodes[0], extent) !=
- backref->generation)
- goto out_free_path;
-
- btrfs_release_path(path);
-
- start = backref->file_pos;
- if (backref->extent_offset < old->extent_offset + old->offset)
- start += old->extent_offset + old->offset -
- backref->extent_offset;
-
- len = min(backref->extent_offset + backref->num_bytes,
- old->extent_offset + old->offset + old->len);
- len -= max(backref->extent_offset, old->extent_offset + old->offset);
-
- ret = btrfs_drop_extents(trans, root, inode, start,
- start + len, 1);
+ ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
- goto out_free_path;
-again:
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
-
- path->leave_spinning = 1;
- if (merge) {
- struct btrfs_file_extent_item *fi;
- u64 extent_len;
- struct btrfs_key found_key;
-
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0)
- goto out_free_path;
-
- path->slots[0]--;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- extent_len = btrfs_file_extent_num_bytes(leaf, fi);
-
- if (extent_len + found_key.offset == start &&
- relink_is_mergable(leaf, fi, new)) {
- btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_len + len);
- btrfs_mark_buffer_dirty(leaf);
- inode_add_bytes(inode, len);
-
- ret = 1;
- goto out_free_path;
- } else {
- merge = false;
- btrfs_release_path(path);
- goto again;
- }
- }
-
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- sizeof(*extent));
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_free_path;
- }
-
- leaf = path->nodes[0];
- item = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
- btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
- btrfs_set_file_extent_num_bytes(leaf, item, len);
- btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
- btrfs_set_file_extent_generation(leaf, item, trans->transid);
- btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_compression(leaf, item, new->compress_type);
- btrfs_set_file_extent_encryption(leaf, item, 0);
- btrfs_set_file_extent_other_encoding(leaf, item, 0);
-
- btrfs_mark_buffer_dirty(leaf);
- inode_add_bytes(inode, len);
- btrfs_release_path(path);
-
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr,
- new->disk_len, 0);
- btrfs_init_data_ref(&ref, backref->root_id, backref->inum,
- new->file_pos); /* start - extent_offset */
- ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_free_path;
- }
-
- ret = 1;
-out_free_path:
- btrfs_release_path(path);
- path->leave_spinning = 0;
- btrfs_end_transaction(trans);
-out_unlock:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- &cached);
- iput(inode);
- return ret;
-}
-
-static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
-{
- struct old_sa_defrag_extent *old, *tmp;
-
- if (!new)
- return;
-
- list_for_each_entry_safe(old, tmp, &new->head, list) {
- kfree(old);
- }
- kfree(new);
-}
-
-static void relink_file_extents(struct new_sa_defrag_extent *new)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
- struct btrfs_path *path;
- struct sa_defrag_extent_backref *backref;
- struct sa_defrag_extent_backref *prev = NULL;
- struct rb_node *node;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return;
-
- if (!record_extent_backrefs(path, new)) {
- btrfs_free_path(path);
goto out;
- }
- btrfs_release_path(path);
- while (1) {
- node = rb_first(&new->root);
- if (!node)
- break;
- rb_erase(node, &new->root);
-
- backref = rb_entry(node, struct sa_defrag_extent_backref, node);
-
- ret = relink_extent_backref(path, prev, backref);
- WARN_ON(ret < 0);
-
- kfree(prev);
-
- if (ret == 1)
- prev = backref;
- else
- prev = NULL;
- cond_resched();
- }
- kfree(prev);
-
- btrfs_free_path(path);
+ ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
+ file_pos, qgroup_reserved, &ins);
out:
- free_sa_defrag_extent(new);
-
- atomic_dec(&fs_info->defrag_running);
- wake_up(&fs_info->transaction_wait);
-}
-
-static struct new_sa_defrag_extent *
-record_old_file_extents(struct inode *inode,
- struct btrfs_ordered_extent *ordered)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
- struct btrfs_key key;
- struct old_sa_defrag_extent *old;
- struct new_sa_defrag_extent *new;
- int ret;
-
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new)
- return NULL;
-
- new->inode = inode;
- new->file_pos = ordered->file_offset;
- new->len = ordered->len;
- new->bytenr = ordered->start;
- new->disk_len = ordered->disk_len;
- new->compress_type = ordered->compress_type;
- new->root = RB_ROOT;
- INIT_LIST_HEAD(&new->head);
-
- path = btrfs_alloc_path();
- if (!path)
- goto out_kfree;
-
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = new->file_pos;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out_free_path;
- if (ret > 0 && path->slots[0] > 0)
- path->slots[0]--;
-
- /* find out all the old extents for the file range */
- while (1) {
- struct btrfs_file_extent_item *extent;
- struct extent_buffer *l;
- int slot;
- u64 num_bytes;
- u64 offset;
- u64 end;
- u64 disk_bytenr;
- u64 extent_offset;
-
- l = path->nodes[0];
- slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out_free_path;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.objectid != btrfs_ino(BTRFS_I(inode)))
- break;
- if (key.type != BTRFS_EXTENT_DATA_KEY)
- break;
- if (key.offset >= new->file_pos + new->len)
- break;
-
- extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
-
- num_bytes = btrfs_file_extent_num_bytes(l, extent);
- if (key.offset + num_bytes < new->file_pos)
- goto next;
-
- disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
- if (!disk_bytenr)
- goto next;
-
- extent_offset = btrfs_file_extent_offset(l, extent);
-
- old = kmalloc(sizeof(*old), GFP_NOFS);
- if (!old)
- goto out_free_path;
-
- offset = max(new->file_pos, key.offset);
- end = min(new->file_pos + new->len, key.offset + num_bytes);
-
- old->bytenr = disk_bytenr;
- old->extent_offset = extent_offset;
- old->offset = offset - key.offset;
- old->len = end - offset;
- old->new = new;
- old->count = 0;
- list_add_tail(&old->list, &new->head);
-next:
- path->slots[0]++;
- cond_resched();
- }
-
btrfs_free_path(path);
- atomic_inc(&fs_info->defrag_running);
- return new;
-
-out_free_path:
- btrfs_free_path(path);
-out_kfree:
- free_sa_defrag_extent(new);
- return NULL;
+ return ret;
}
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
@@ -3184,7 +2596,33 @@
btrfs_put_block_group(cache);
}
-/* as ordered data IO finishes, this gets called so we can finish
+static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *oe)
+{
+ struct btrfs_file_extent_item stack_fi;
+ u64 logical_len;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
+ oe->disk_num_bytes);
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
+ logical_len = oe->truncated_len;
+ else
+ logical_len = oe->num_bytes;
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
+ btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ oe->file_offset, &stack_fi,
+ oe->qgroup_rsv);
+}
+
+/*
+ * As ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
@@ -3196,32 +2634,33 @@
struct btrfs_trans_handle *trans = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
- struct new_sa_defrag_extent *new = NULL;
+ u64 start, end;
int compress_type = 0;
int ret = 0;
- u64 logical_len = ordered_extent->len;
- bool nolock;
+ u64 logical_len = ordered_extent->num_bytes;
+ bool freespace_inode;
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
bool clear_reserved_extent = true;
+ unsigned int clear_bits;
+
+ start = ordered_extent->file_offset;
+ end = start + ordered_extent->num_bytes - 1;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
clear_new_delalloc_bytes = true;
- nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
+ freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
goto out;
}
- btrfs_free_io_failure_record(BTRFS_I(inode),
- ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1);
+ btrfs_free_io_failure_record(BTRFS_I(inode), start, end);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -3234,16 +2673,9 @@
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
- /*
- * For mwrite(mmap + memset to write) case, we still reserve
- * space for NOCOW range.
- * As NOCOW won't cause a new delayed ref, just free the space
- */
- btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
- ordered_extent->len);
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3259,26 +2691,10 @@
}
range_locked = true;
- lock_extent_bits(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset + ordered_extent->len - 1,
- &cached_state);
+ lock_extent_bits(io_tree, start, end, &cached_state);
- ret = test_range_bit(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset + ordered_extent->len - 1,
- EXTENT_DEFRAG, 0, cached_state);
- if (ret) {
- u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
- if (0 && last_snapshot >= BTRFS_I(inode)->generation)
- /* the inode is shared */
- new = record_old_file_extents(inode, ordered_extent);
-
- clear_extent_bit(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset + ordered_extent->len - 1,
- EXTENT_DEFRAG, 0, 0, &cached_state);
- }
-
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3293,43 +2709,35 @@
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
- btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
- ordered_extent->len);
ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_reserved_file_extent(trans, inode,
- ordered_extent->file_offset,
- ordered_extent->start,
- ordered_extent->disk_len,
- logical_len, logical_len,
- compress_type, 0, 0,
- BTRFS_FILE_EXTENT_REG);
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
- ordered_extent->start,
- ordered_extent->disk_len);
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
}
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered_extent->file_offset, ordered_extent->len,
- trans->transid);
+ ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = add_pending_csums(trans, inode, &ordered_extent->list);
+ ret = add_pending_csums(trans, &ordered_extent->list);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
@@ -3337,27 +2745,20 @@
}
ret = 0;
out:
- if (range_locked || clear_new_delalloc_bytes) {
- unsigned int clear_bits = 0;
-
- if (range_locked)
- clear_bits |= EXTENT_LOCKED;
- if (clear_new_delalloc_bytes)
- clear_bits |= EXTENT_DELALLOC_NEW;
- clear_extent_bit(&BTRFS_I(inode)->io_tree,
- ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1,
- clear_bits,
- (clear_bits & EXTENT_LOCKED) ? 1 : 0,
- 0, &cached_state);
- }
+ clear_bits = EXTENT_DEFRAG;
+ if (range_locked)
+ clear_bits |= EXTENT_LOCKED;
+ if (clear_new_delalloc_bytes)
+ clear_bits |= EXTENT_DELALLOC_NEW;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits,
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
+ &cached_state);
if (trans)
btrfs_end_transaction(trans);
if (ret || truncated) {
- u64 start, end;
+ u64 unwritten_start = start;
/*
* If we failed to finish this ordered extent for any reason we
@@ -3372,14 +2773,11 @@
mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
if (truncated)
- start = ordered_extent->file_offset + logical_len;
- else
- start = ordered_extent->file_offset;
- end = ordered_extent->file_offset + ordered_extent->len - 1;
- clear_extent_uptodate(io_tree, start, end, NULL);
+ unwritten_start += logical_len;
+ clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
/* Drop the cache for the part of the extent we didn't write. */
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
+ btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0);
/*
* If the ordered extent had an IOERR or something else went
@@ -3394,28 +2792,27 @@
if ((ret || !logical_len) &&
clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
- !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+ /*
+ * Discard the range before returning it back to the
+ * free space pool
+ */
+ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
+ btrfs_discard_extent(fs_info,
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes,
+ NULL);
btrfs_free_reserved_extent(fs_info,
- ordered_extent->start,
- ordered_extent->disk_len, 1);
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes, 1);
+ }
}
-
/*
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
- btrfs_remove_ordered_extent(inode, ordered_extent);
-
- /* for snapshot-aware defrag */
- if (new) {
- if (ret) {
- free_sa_defrag_extent(new);
- atomic_dec(&fs_info->defrag_running);
- } else {
- relink_file_extents(new);
- }
- }
+ btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
@@ -3435,8 +2832,8 @@
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
@@ -3447,7 +2844,7 @@
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+ if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
@@ -3456,10 +2853,9 @@
btrfs_queue_work(wq, &ordered_extent->work);
}
-static int __readpage_endio_check(struct inode *inode,
- struct btrfs_io_bio *io_bio,
- int icsum, struct page *page,
- int pgoff, u64 start, size_t len)
+static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+ int icsum, struct page *page, int pgoff, u64 start,
+ size_t len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -3473,9 +2869,7 @@
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
- crypto_shash_update(shash, kaddr + pgoff, len);
- crypto_shash_final(shash, csum);
+ crypto_shash_digest(shash, kaddr + pgoff, len, csum);
if (memcmp(csum, csum_expected, csum_size))
goto zeroit;
@@ -3485,6 +2879,9 @@
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
io_bio->mirror_num);
+ if (io_bio->device)
+ btrfs_dev_stat_inc_and_print(io_bio->device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
kunmap_atomic(kaddr);
@@ -3496,9 +2893,8 @@
* if there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
*/
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror)
{
size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
@@ -3520,8 +2916,8 @@
}
phy_offset >>= inode->i_sb->s_blocksize_bits;
- return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
- start, (size_t)(end - start + 1));
+ return check_data_csum(inode, io_bio, phy_offset, page, offset, start,
+ (size_t)(end - start + 1));
}
/*
@@ -3712,14 +3108,13 @@
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, last_objectid, root);
ret = PTR_ERR_OR_ZERO(inode);
if (ret && ret != -ENOENT)
goto out;
if (ret == -ENOENT && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
/*
@@ -3731,18 +3126,16 @@
* orphan must not get deleted.
* find_dead_roots already ran before us, so if this
* is a snapshot deletion, we should find the root
- * in the dead_roots list
+ * in the fs_roots radix tree.
*/
- spin_lock(&fs_info->trans_lock);
- list_for_each_entry(dead_root, &fs_info->dead_roots,
- root_list) {
- if (dead_root->root_key.objectid ==
- found_key.objectid) {
- is_dead_root = 1;
- break;
- }
- }
- spin_unlock(&fs_info->trans_lock);
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)found_key.objectid);
+ if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
+ is_dead_root = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
if (is_dead_root) {
/* prevent this orphan from being found again */
key.offset = found_key.objectid - 1;
@@ -3933,6 +3326,8 @@
i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
+ btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+ round_up(i_size_read(inode), fs_info->sectorsize));
inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
@@ -4004,6 +3399,14 @@
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ /*
+ * Same logic as for last_unlink_trans. We don't persist the generation
+ * of the last transaction where this inode was used for a reflink
+ * operation, so after eviction and reloading the inode we must be
+ * pessimistic and assume the last transaction that modified the inode.
+ */
+ BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+
path->slots[0]++;
if (inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
@@ -4051,7 +3454,6 @@
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
break;
@@ -4086,43 +3488,40 @@
btrfs_init_map_token(&token, leaf);
- btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
- btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
- btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
- &token);
- btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
- btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+ btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+ btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+ btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+ btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
- btrfs_set_token_timespec_sec(leaf, &item->atime,
- inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->atime,
- inode->i_atime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(&token, &item->atime,
+ inode->i_atime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->atime,
+ inode->i_atime.tv_nsec);
- btrfs_set_token_timespec_sec(leaf, &item->mtime,
- inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->mtime,
- inode->i_mtime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(&token, &item->mtime,
+ inode->i_mtime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->mtime,
+ inode->i_mtime.tv_nsec);
- btrfs_set_token_timespec_sec(leaf, &item->ctime,
- inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->ctime,
- inode->i_ctime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(&token, &item->ctime,
+ inode->i_ctime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->ctime,
+ inode->i_ctime.tv_nsec);
- btrfs_set_token_timespec_sec(leaf, &item->otime,
- BTRFS_I(inode)->i_otime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(&token, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec);
- btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
- &token);
- btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
- &token);
- btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
- &token);
- btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
- btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
- btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
- btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+ btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+ btrfs_set_token_inode_generation(&token, item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+ btrfs_set_token_inode_transid(&token, item, trans->transid);
+ btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ btrfs_set_token_inode_block_group(&token, item, 0);
}
/*
@@ -4155,7 +3554,7 @@
fill_inode_item(trans, leaf, inode_item, inode);
btrfs_mark_buffer_dirty(leaf);
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
ret = 0;
failed:
btrfs_free_path(path);
@@ -4185,7 +3584,7 @@
ret = btrfs_delayed_update_inode(trans, root, inode);
if (!ret)
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
return ret;
}
@@ -4710,7 +4109,7 @@
err = ret;
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
if (err) {
@@ -4839,11 +4238,12 @@
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
/*
- * for non-free space inodes and ref cows, we want to back off from
- * time to time
+ * For non-free space inodes and non-shareable roots, we want to back
+ * off from time to time. This means all inodes in subvolume roots,
+ * reloc roots, and data reloc roots.
*/
if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
- test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
be_nice = true;
path = btrfs_alloc_path();
@@ -4851,20 +4251,19 @@
return -ENOMEM;
path->reada = READA_BACK;
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
&cached_state);
- /*
- * We want to drop from the next block forward in case this new size is
- * not block aligned since we will be keeping the last block of the
- * extent just the way it is.
- */
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == fs_info->tree_root)
+ /*
+ * We want to drop from the next block forward in case this
+ * new size is not block aligned since we will be keeping the
+ * last block of the extent just the way it is.
+ */
btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
fs_info->sectorsize),
(u64)-1, 0);
+ }
/*
* This function is also used to drop the items in the log tree before
@@ -4906,6 +4305,8 @@
}
while (1) {
+ u64 clear_start = 0, clear_len = 0;
+
fi = NULL;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -4956,6 +4357,8 @@
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
+
+ clear_start = found_key.offset;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
if (!del_item) {
u64 orig_num_bytes =
@@ -4963,11 +4366,12 @@
extent_num_bytes = ALIGN(new_size -
found_key.offset,
fs_info->sectorsize);
+ clear_start = ALIGN(new_size, fs_info->sectorsize);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_num_bytes);
num_dec = (orig_num_bytes -
extent_num_bytes);
- if (test_bit(BTRFS_ROOT_REF_COWS,
+ if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state) &&
extent_start != 0)
inode_sub_bytes(inode, num_dec);
@@ -4983,11 +4387,12 @@
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
if (extent_start != 0) {
found_extent = 1;
- if (test_bit(BTRFS_ROOT_REF_COWS,
+ if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state))
inode_sub_bytes(inode, num_dec);
}
}
+ clear_len = num_dec;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
/*
* we can't truncate inline items that have had
@@ -5009,12 +4414,33 @@
*/
ret = NEED_TRUNCATE_BLOCK;
break;
+ } else {
+ /*
+ * Inline extents are special, we just treat
+ * them as a full sector worth in the file
+ * extent tree just for simplicity sake.
+ */
+ clear_len = fs_info->sectorsize;
}
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
inode_sub_bytes(inode, item_end + 1 - new_size);
}
delete:
+ /*
+ * We use btrfs_truncate_inode_items() to clean up log trees for
+ * multiple fsyncs, and in this case we don't want to clear the
+ * file extent range because it's just the log.
+ */
+ if (root == BTRFS_I(inode)->root) {
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ clear_start, clear_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+
if (del_item)
last_size = found_key.offset;
else
@@ -5038,8 +4464,7 @@
should_throttle = false;
if (found_extent &&
- (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == fs_info->tree_root)) {
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_ref ref = { 0 };
bytes_deleted += extent_num_bytes;
@@ -5116,7 +4541,7 @@
ASSERT(last_size >= new_size);
if (!ret && last_size > new_size)
last_size = new_size;
- btrfs_ordered_update_i_size(inode, last_size, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, last_size);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
(u64)-1, &cached_state);
}
@@ -5164,14 +4589,11 @@
block_start = round_down(from, blocksize);
block_end = block_start + blocksize - 1;
-
- ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
- blocksize);
+ ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved,
+ block_start, blocksize);
if (ret < 0) {
- if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) &&
- btrfs_check_can_nocow(BTRFS_I(inode), block_start,
- &write_bytes) > 0) {
+ if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start,
+ &write_bytes) > 0) {
/* For nocow case, no need to reserve data space */
only_release_metadata = true;
} else {
@@ -5181,14 +4603,14 @@
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
if (ret < 0) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode, data_reserved,
- block_start, blocksize);
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, block_start, blocksize);
goto out;
}
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
block_start, blocksize, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
ret = -ENOMEM;
@@ -5213,13 +4635,13 @@
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
set_page_extent_mapped(page);
- ordered = btrfs_lookup_ordered_extent(inode, block_start);
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start);
if (ordered) {
unlock_extent_cached(io_tree, block_start, block_end,
&cached_state);
unlock_page(page);
put_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -5228,7 +4650,7 @@
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state);
- ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0,
&cached_state);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
@@ -5264,7 +4686,7 @@
btrfs_delalloc_release_metadata(BTRFS_I(inode),
blocksize, true);
else
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
block_start, blocksize, true);
}
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
@@ -5272,7 +4694,7 @@
put_page(page);
out:
if (only_release_metadata)
- btrfs_end_write_no_snapshotting(BTRFS_I(inode)->root);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
extent_changeset_free(data_reserved);
return ret;
}
@@ -5354,12 +4776,12 @@
if (size <= hole_start)
return 0;
- btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start,
+ btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,
block_end - 1, &cached_state);
cur_offset = hole_start;
while (1) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
- block_end - cur_offset, 0);
+ block_end - cur_offset);
if (IS_ERR(em)) {
err = PTR_ERR(em);
em = NULL;
@@ -5367,14 +4789,21 @@
}
last_byte = min(extent_map_end(em), block_end);
last_byte = ALIGN(last_byte, fs_info->sectorsize);
+ hole_size = last_byte - cur_offset;
+
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
- hole_size = last_byte - cur_offset;
err = maybe_insert_hole(root, inode, cur_offset,
hole_size);
if (err)
break;
+
+ err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ cur_offset, hole_size);
+ if (err)
+ break;
+
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
@@ -5391,7 +4820,6 @@
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = fs_info->generation;
@@ -5407,6 +4835,11 @@
hole_size - 1, 0);
}
free_extent_map(hole_em);
+ } else {
+ err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ cur_offset, hole_size);
+ if (err)
+ break;
}
next:
free_extent_map(em);
@@ -5450,42 +4883,39 @@
* truncation, it must capture all writes that happened before
* this truncation.
*/
- btrfs_wait_for_snapshot_creation(root);
+ btrfs_drew_write_lock(&root->snapshot_lock);
ret = btrfs_cont_expand(inode, oldsize, newsize);
if (ret) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
return ret;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
return PTR_ERR(trans);
}
i_size_write(inode, newsize);
- btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
pagecache_isize_extended(inode, oldsize, newsize);
ret = btrfs_update_inode(trans, root, inode);
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
/*
* We're truncating a file that used to have good data down to
- * zero. Make sure it gets into the ordered flush list so that
- * any new writes get down to disk quickly.
+ * zero. Make sure any new writes to the file get on disk
+ * on close.
*/
if (newsize == 0)
- set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags);
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -5579,8 +5009,8 @@
/*
* Keep looping until we have no more ranges in the io tree.
- * We can have ongoing bios started by readpages (called from readahead)
- * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ * We can have ongoing bios started by readahead that have
+ * their endio callback (extent_io.c:end_bio_extent_readpage)
* still in progress (unlocked the pages in the bio but did not yet
* unlocked the ranges in the io tree). Therefore this means some
* ranges can still be locked and eviction started because before
@@ -5619,7 +5049,8 @@
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
if (state_flags & EXTENT_DELALLOC)
- btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
+ btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
+ end - start + 1);
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DELALLOC |
@@ -5877,7 +5308,7 @@
btrfs_release_path(path);
- new_root = btrfs_read_fs_root_no_name(fs_info, location);
+ new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
if (IS_ERR(new_root)) {
err = PTR_ERR(new_root);
goto out;
@@ -5929,15 +5360,15 @@
spin_unlock(&root->inode_lock);
}
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
int empty = 0;
spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ if (!RB_EMPTY_NODE(&inode->rb_node)) {
+ rb_erase(&inode->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&inode->rb_node);
empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
@@ -5955,29 +5386,32 @@
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->location->objectid;
- memcpy(&BTRFS_I(inode)->location, args->location,
- sizeof(*args->location));
- BTRFS_I(inode)->root = args->root;
+
+ inode->i_ino = args->ino;
+ BTRFS_I(inode)->location.objectid = args->ino;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.offset = 0;
+ BTRFS_I(inode)->root = btrfs_grab_root(args->root);
+ BUG_ON(args->root && !BTRFS_I(inode)->root);
return 0;
}
static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->location->objectid == BTRFS_I(inode)->location.objectid &&
+
+ return args->ino == BTRFS_I(inode)->location.objectid &&
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(struct super_block *s,
- struct btrfs_key *location,
+static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
- unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+ unsigned long hashval = btrfs_inode_hash(ino, root);
- args.location = location;
+ args.ino = ino;
args.root = root;
inode = iget5_locked(s, hashval, btrfs_find_actor,
@@ -5986,16 +5420,18 @@
return inode;
}
-/* Get an inode object given its location and corresponding root.
- * Returns in *is_new if the inode was read from disk
+/*
+ * Get an inode object given its inode number and corresponding root.
+ * Path can be preallocated to prevent recursing back to iget through
+ * allocator. NULL is also valid but may require an additional allocation
+ * later.
*/
-struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new,
- struct btrfs_path *path)
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
+ struct btrfs_root *root, struct btrfs_path *path)
{
struct inode *inode;
- inode = btrfs_iget_locked(s, location, root);
+ inode = btrfs_iget_locked(s, ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -6006,8 +5442,6 @@
if (!ret) {
inode_tree_add(inode);
unlock_new_inode(inode);
- if (new)
- *new = 1;
} else {
iget_failed(inode);
/*
@@ -6024,10 +5458,9 @@
return inode;
}
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new)
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(s, location, root, new, NULL);
+ return btrfs_iget_path(s, ino, root, NULL);
}
static struct inode *new_simple_dir(struct super_block *s,
@@ -6039,12 +5472,16 @@
if (!inode)
return ERR_PTR(-ENOMEM);
- BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->root = btrfs_grab_root(root);
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
- inode->i_op = &btrfs_dir_ro_inode_operations;
+ /*
+ * We only need lookup, the rest is read-only and there's no inode
+ * associated with the dentry
+ */
+ inode->i_op = &simple_dir_inode_operations;
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
@@ -6082,7 +5519,6 @@
struct btrfs_root *sub_root = root;
struct btrfs_key location;
u8 di_type = 0;
- int index;
int ret = 0;
if (dentry->d_name.len > BTRFS_NAME_LEN)
@@ -6093,7 +5529,7 @@
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, &location, root, NULL);
+ inode = btrfs_iget(dir->i_sb, location.objectid, root);
if (IS_ERR(inode))
return inode;
@@ -6109,7 +5545,6 @@
return inode;
}
- index = srcu_read_lock(&fs_info->subvol_srcu);
ret = fixup_tree_root_location(fs_info, dir, dentry,
&location, &sub_root);
if (ret < 0) {
@@ -6118,9 +5553,10 @@
else
inode = new_simple_dir(dir->i_sb, &location, sub_root);
} else {
- inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
+ inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ if (root != sub_root)
+ btrfs_put_root(sub_root);
if (!IS_ERR(inode) && root != sub_root) {
down_read(&fs_info->cleanup_work_sem);
@@ -6498,7 +5934,8 @@
static int btrfs_insert_inode_locked(struct inode *inode)
{
struct btrfs_iget_args args;
- args.location = &BTRFS_I(inode)->location;
+
+ args.ino = BTRFS_I(inode)->location.objectid;
args.root = BTRFS_I(inode)->root;
return insert_inode_locked4(inode,
@@ -6601,7 +6038,7 @@
*/
BTRFS_I(inode)->index_cnt = 2;
BTRFS_I(inode)->dir_index = *index;
- BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->root = btrfs_grab_root(root);
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
@@ -6688,7 +6125,7 @@
inode_tree_add(inode);
trace_btrfs_inode_new(inode);
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
btrfs_update_root_times(trans, root);
@@ -6929,7 +6366,6 @@
if (err)
goto out_unlock;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
d_instantiate_new(dentry, inode);
out_unlock:
@@ -7129,22 +6565,30 @@
return ret;
}
-/*
- * a bit scary, this does extent mapping from logical file offset to the disk.
- * the ugly parts come from merging extents from the disk with the in-ram
- * representation. This gets more complex because of the data=ordered code,
- * where the in-ram extents might be locked pending data=ordered completion.
+/**
+ * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
+ * @inode: file to search in
+ * @page: page to read extent data into if the extent is inline
+ * @pg_offset: offset into @page to copy to
+ * @start: file offset
+ * @len: length of range starting at @start
*
- * This also copies inline extents directly into the page.
+ * This returns the first &struct extent_map which overlaps with the given
+ * range, reading it from the B-tree and caching it if necessary. Note that
+ * there may be more extents which overlap the given range after the returned
+ * extent_map.
+ *
+ * If @page is not NULL and the extent is inline, this also reads the extent
+ * data directly into the page and marks the extent up to date in the io_tree.
+ *
+ * Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ struct page *page, size_t pg_offset,
+ u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
- int err = 0;
+ int ret = 0;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -7157,12 +6601,9 @@
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_io_tree *io_tree = &inode->io_tree;
- const bool new_inline = !page || create;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
- if (em)
- em->bdev = fs_info->fs_devices->latest_bdev;
read_unlock(&em_tree->lock);
if (em) {
@@ -7175,10 +6616,9 @@
}
em = alloc_extent_map();
if (!em) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
- em->bdev = fs_info->fs_devices->latest_bdev;
em->start = EXTENT_MAP_HOLE;
em->orig_start = EXTENT_MAP_HOLE;
em->len = (u64)-1;
@@ -7186,7 +6626,7 @@
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -7199,14 +6639,16 @@
*/
path->leave_spinning = 1;
+ path->recurse = btrfs_is_free_space_inode(inode);
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
- err = ret;
goto out;
} else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
+ ret = 0;
}
leaf = path->nodes[0];
@@ -7227,28 +6669,20 @@
extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
+ extent_end = btrfs_file_extent_end(path);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
if (!S_ISREG(inode->vfs_inode.i_mode)) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
btrfs_ino(inode));
goto out;
}
- extent_end = extent_start +
- btrfs_file_extent_num_bytes(leaf, item);
-
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- size_t size;
-
- size = btrfs_file_extent_ram_bytes(leaf, item);
- extent_end = ALIGN(extent_start + size,
- fs_info->sectorsize);
-
trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
path->slots[0],
extent_start);
@@ -7258,12 +6692,11 @@
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- } else if (ret > 0) {
+ else if (ret > 0)
goto not_found;
- }
+
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -7283,8 +6716,7 @@
goto insert;
}
- btrfs_extent_item_to_extent_map(inode, path, item,
- new_inline, em);
+ btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -7296,7 +6728,7 @@
size_t extent_offset;
size_t copy_size;
- if (new_inline)
+ if (!page)
goto out;
size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -7315,10 +6747,8 @@
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -7342,29 +6772,28 @@
em->len = len;
em->block_start = EXTENT_MAP_HOLE;
insert:
+ ret = 0;
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
- err = -EIO;
+ ret = -EIO;
goto out;
}
- err = 0;
write_lock(&em_tree->lock);
- err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- if (err) {
+ if (ret) {
free_extent_map(em);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
- BUG_ON(!em); /* Error is always set */
return em;
}
@@ -7379,7 +6808,7 @@
u64 delalloc_end;
int err = 0;
- em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ em = btrfs_get_extent(inode, NULL, 0, start, len);
if (IS_ERR(em))
return em;
/*
@@ -7437,7 +6866,6 @@
err = -ENOMEM;
goto out;
}
- em->bdev = NULL;
ASSERT(hole_em);
/*
@@ -7496,7 +6924,7 @@
return em;
}
-static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const u64 start,
const u64 len,
const u64 orig_start,
@@ -7510,21 +6938,19 @@
int ret;
if (type != BTRFS_ORDERED_NOCOW) {
- em = create_io_em(inode, start, len, orig_start,
- block_start, block_len, orig_block_len,
- ram_bytes,
+ em = create_io_em(inode, start, len, orig_start, block_start,
+ block_len, orig_block_len, ram_bytes,
BTRFS_COMPRESS_NONE, /* compress_type */
type);
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
- len, block_len, type);
+ ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
+ block_len, type);
if (ret) {
if (em) {
free_extent_map(em);
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
- start + len - 1, 0);
+ btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
}
em = ERR_PTR(ret);
}
@@ -7533,11 +6959,11 @@
return em;
}
-static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
u64 start, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_map *em;
struct btrfs_key ins;
u64 alloc_hint;
@@ -7554,15 +6980,34 @@
ins.offset, BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid,
- ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
+ 1);
return em;
}
/*
- * returns 1 when the nocow is safe, < 1 on error, 0 if the
- * block must be cow'd
+ * Check if we can do nocow write into the range [@offset, @offset + @len)
+ *
+ * @offset: File offset
+ * @len: The length to write, will be updated to the nocow writeable
+ * range
+ * @orig_start: (optional) Return the original file offset of the file extent
+ * @orig_len: (optional) Return the original on-disk length of the file extent
+ * @ram_bytes: (optional) Return the ram_bytes of the file extent
+ * @strict: if true, omit optimizations that might force us into unnecessary
+ * cow. e.g., don't trust generation number.
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks for (nowait == false) case.
+ *
+ * Return:
+ * >0 and update @len if we can do nocow write
+ * 0 if we can't do nocow write
+ * <0 if error happened
+ *
+ * NOTE: This only checks the file extents, caller is responsible to wait for
+ * any ordered extents.
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
@@ -7711,7 +7156,7 @@
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, int writing)
+ struct extent_state **cached_state, bool writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
@@ -7760,7 +7205,7 @@
*/
if (writing ||
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
else
ret = -ENOTBLK;
btrfs_put_ordered_extent(ordered);
@@ -7770,11 +7215,11 @@
* for it to complete) and then invalidate the pages for
* this range (through invalidate_inode_pages2_range()),
* but that can lead us to a deadlock with a concurrent
- * call to readpages() (a buffered read or a defrag call
+ * call to readahead (a buffered read or a defrag call
* triggered a readahead) on a page lock due to an
* ordered dio extent we created before but did not have
* yet a corresponding bio submitted (whence it can not
- * complete), which makes readpages() wait for that
+ * complete), which makes readahead wait for that
* ordered extent to complete while holding a lock on
* that page.
*/
@@ -7791,15 +7236,14 @@
}
/* The callers of this must take lock_extent() */
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
- u64 orig_start, u64 block_start,
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
@@ -7807,7 +7251,7 @@
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
- em_tree = &BTRFS_I(inode)->extent_tree;
+ em_tree = &inode->extent_tree;
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7817,7 +7261,6 @@
em->len = len;
em->block_len = block_len;
em->block_start = block_start;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
@@ -7830,8 +7273,8 @@
}
do {
- btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
- em->start + em->len - 1, 0);
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
@@ -7851,28 +7294,7 @@
}
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
- struct buffer_head *bh_result,
- struct inode *inode,
- u64 start, u64 len)
-{
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return -ENOENT;
-
- len = min(len, em->len - (start - em->start));
-
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
- set_buffer_mapped(bh_result);
-
- return 0;
-}
-
static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
@@ -7908,7 +7330,7 @@
btrfs_inc_nocow_writers(fs_info, block_start)) {
struct extent_map *em2;
- em2 = btrfs_create_dio_extent(inode, start, len,
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
@@ -7927,16 +7349,14 @@
* use the existing or preallocated extent, so does not
* need to adjust btrfs_space_info's bytes_may_use.
*/
- btrfs_free_reserved_data_space_noquota(inode, start,
- len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
goto skip_cow;
}
}
/* this will cow the extent */
- len = bh_result->b_size;
free_extent_map(em);
- *map = em = btrfs_new_extent_direct(inode, start, len);
+ *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -7945,69 +7365,93 @@
len = min(len, em->len - (start - em->start));
skip_cow:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
- set_buffer_mapped(bh_result);
-
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
-
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
+ if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
out:
return ret;
}
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
- u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
- u64 len = bh_result->b_size;
+ const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
+ u64 len = length;
+ bool unlock_extents = false;
+ bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
- if (!create)
+ /*
+ * We used current->journal_info here to see if we were sync, but
+ * there's a lot of tests in the enospc machinery to not do flushing if
+ * we have a journal_info set, so we need to clear this out and re-set
+ * it in iomap_end.
+ */
+ ASSERT(current->journal_info == NULL ||
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
+ current->journal_info = NULL;
+
+ if (!write)
len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
lockend = start + len - 1;
- if (current->journal_info) {
- /*
- * Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transaction doesn't get
- * confused.
- */
- dio_data = current->journal_info;
- current->journal_info = NULL;
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so we
+ * need to flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
}
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ if (!dio_data)
+ return -ENOMEM;
+
+ dio_data->sync = sync;
+ dio_data->length = length;
+ if (write) {
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, dio_data->reserve);
+ if (ret) {
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ return ret;
+ }
+ }
+ iomap->private = dio_data;
+
+
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- create)) {
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
ret = -ENOTBLK;
goto err;
}
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto unlock_err;
@@ -8034,36 +7478,48 @@
goto unlock_err;
}
- if (create) {
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- dio_data, start, len);
+ len = min(len, em->len - (start - em->start));
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
if (ret < 0)
goto unlock_err;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
} else {
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- start, len);
- /* Can be negative only if we read from a hole */
- if (ret < 0) {
- ret = 0;
- free_extent_map(em);
- goto unlock_err;
- }
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + bh_result->b_size;
- if (lockstart < lockend) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- } else {
- free_extent_state(cached_state);
- }
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
}
+ if (unlock_extents)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->length = len;
+
free_extent_map(em);
return 0;
@@ -8072,382 +7528,175 @@
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data)
- current->journal_info = dio_data;
+ if (dio_data) {
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, start,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ }
return ret;
}
-static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
- struct bio *bio,
- int mirror_num)
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
{
+ int ret = 0;
+ struct btrfs_dio_data *dio_data = iomap->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ goto out;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ __endio_write_update_ordered(BTRFS_I(inode), pos,
+ length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1);
+ ret = -ENOTBLK;
+ }
+
+ if (write) {
+ if (dio_data->reserve)
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, pos,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+out:
+ /*
+ * We're all done, we can re-set the current->journal_info now safely
+ * for our endio.
+ */
+ if (dio_data->sync) {
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
+ kfree(dio_data);
+ iomap->private = NULL;
+
+ return ret;
+}
+
+static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
+{
+ /*
+ * This implies a barrier so that stores to dio_bio->bi_status before
+ * this and loads of dio_bio->bi_status after this are fully ordered.
+ */
+ if (!refcount_dec_and_test(&dip->refs))
+ return;
+
+ if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
+ __endio_write_update_ordered(BTRFS_I(dip->inode),
+ dip->logical_offset,
+ dip->bytes,
+ !dip->dio_bio->bi_status);
+ } else {
+ unlock_extent(&BTRFS_I(dip->inode)->io_tree,
+ dip->logical_offset,
+ dip->logical_offset + dip->bytes - 1);
+ }
+
+ bio_endio(dip->dio_bio);
+ kfree(dip);
+}
+
+static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ unsigned long bio_flags)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
if (ret)
return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
-
+ refcount_inc(&dip->refs);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (ret)
+ refcount_dec(&dip->refs);
return ret;
}
-static int btrfs_check_dio_repairable(struct inode *inode,
- struct bio *failed_bio,
- struct io_failure_record *failrec,
- int failed_mirror)
+static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
+ struct btrfs_io_bio *io_bio,
+ const bool uptodate)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int num_copies;
-
- num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
- if (num_copies == 1) {
- /*
- * we only have a single copy of the data, so don't bother with
- * all the retry and error correction code that follows. no
- * matter what the error is, it is very likely to persist.
- */
- btrfs_debug(fs_info,
- "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return 0;
- }
-
- failrec->failed_mirror = failed_mirror;
- failrec->this_mirror++;
- if (failrec->this_mirror == failed_mirror)
- failrec->this_mirror++;
-
- if (failrec->this_mirror > num_copies) {
- btrfs_debug(fs_info,
- "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return 0;
- }
-
- return 1;
-}
-
-static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- bio_end_io_t *repair_endio, void *repair_arg)
-{
- struct io_failure_record *failrec;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct bio *bio;
- int isector;
- unsigned int read_mode = 0;
- int segs;
- int ret;
- blk_status_t status;
- struct bio_vec bvec;
-
- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
-
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
- if (ret)
- return errno_to_blk_status(ret);
-
- ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
- failed_mirror);
- if (!ret) {
- free_io_failure(failure_tree, io_tree, failrec);
- return BLK_STS_IOERR;
- }
-
- segs = bio_segments(failed_bio);
- bio_get_first_bvec(failed_bio, &bvec);
- if (segs > 1 ||
- (bvec.bv_len > btrfs_inode_sectorsize(inode)))
- read_mode |= REQ_FAILFAST_DEV;
-
- isector = start - btrfs_io_bio(failed_bio)->logical;
- isector >>= inode->i_sb->s_blocksize_bits;
- bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- pgoff, isector, repair_endio, repair_arg);
- bio->bi_opf = REQ_OP_READ | read_mode;
-
- btrfs_debug(BTRFS_I(inode)->root->fs_info,
- "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
- read_mode, failrec->this_mirror, failrec->in_validation);
-
- status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
- if (status) {
- free_io_failure(failure_tree, io_tree, failrec);
- bio_put(bio);
- }
-
- return status;
-}
-
-struct btrfs_retry_complete {
- struct completion done;
- struct inode *inode;
- u64 start;
- int uptodate;
-};
-
-static void btrfs_retry_endio_nocsum(struct bio *bio)
-{
- struct btrfs_retry_complete *done = bio->bi_private;
- struct inode *inode = done->inode;
- struct bio_vec *bvec;
- struct extent_io_tree *io_tree, *failure_tree;
- struct bvec_iter_all iter_all;
-
- if (bio->bi_status)
- goto end;
-
- ASSERT(bio->bi_vcnt == 1);
- io_tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
-
- done->uptodate = 1;
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all)
- clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
- io_tree, done->start, bvec->bv_page,
- btrfs_ino(BTRFS_I(inode)), 0);
-end:
- complete(&done->done);
- bio_put(bio);
-}
-
-static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
- struct btrfs_io_bio *io_bio)
-{
- struct btrfs_fs_info *fs_info;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- struct btrfs_retry_complete done;
- u64 start;
- unsigned int pgoff;
- u32 sectorsize;
- int nr_sectors;
- blk_status_t ret;
+ u64 start = io_bio->logical;
+ int icsum = 0;
blk_status_t err = BLK_STS_OK;
- fs_info = BTRFS_I(inode)->root->fs_info;
- sectorsize = fs_info->sectorsize;
+ __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
+ unsigned int i, nr_sectors, pgoff;
- start = io_bio->logical;
- done.inode = inode;
- io_bio->bio.bi_iter = io_bio->iter;
-
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
pgoff = bvec.bv_offset;
-
-next_block_or_try_again:
- done.uptodate = 0;
- done.start = start;
- init_completion(&done.done);
-
- ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
- pgoff, start, start + sectorsize - 1,
- io_bio->mirror_num,
- btrfs_retry_endio_nocsum, &done);
- if (ret) {
- err = ret;
- goto next;
- }
-
- wait_for_completion_io(&done.done);
-
- if (!done.uptodate) {
- /* We might have another mirror, so try again */
- goto next_block_or_try_again;
- }
-
-next:
- start += sectorsize;
-
- nr_sectors--;
- if (nr_sectors) {
- pgoff += sectorsize;
+ for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
- goto next_block_or_try_again;
+ if (uptodate &&
+ (!csum || !check_data_csum(inode, io_bio, icsum,
+ bvec.bv_page, pgoff,
+ start, sectorsize))) {
+ clean_io_failure(fs_info, failure_tree, io_tree,
+ start, bvec.bv_page,
+ btrfs_ino(BTRFS_I(inode)),
+ pgoff);
+ } else {
+ blk_status_t status;
+
+ status = btrfs_submit_read_repair(inode,
+ &io_bio->bio,
+ start - io_bio->logical,
+ bvec.bv_page, pgoff,
+ start,
+ start + sectorsize - 1,
+ io_bio->mirror_num,
+ submit_dio_repair_bio);
+ if (status)
+ err = status;
+ }
+ start += sectorsize;
+ icsum++;
+ pgoff += sectorsize;
}
}
-
return err;
}
-static void btrfs_retry_endio(struct bio *bio)
-{
- struct btrfs_retry_complete *done = bio->bi_private;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct extent_io_tree *io_tree, *failure_tree;
- struct inode *inode = done->inode;
- struct bio_vec *bvec;
- int uptodate;
- int ret;
- int i = 0;
- struct bvec_iter_all iter_all;
-
- if (bio->bi_status)
- goto end;
-
- uptodate = 1;
-
- ASSERT(bio->bi_vcnt == 1);
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
-
- io_tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
-
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
- bvec->bv_offset, done->start,
- bvec->bv_len);
- if (!ret)
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
- failure_tree, io_tree, done->start,
- bvec->bv_page,
- btrfs_ino(BTRFS_I(inode)),
- bvec->bv_offset);
- else
- uptodate = 0;
- i++;
- }
-
- done->uptodate = uptodate;
-end:
- complete(&done->done);
- bio_put(bio);
-}
-
-static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, blk_status_t err)
-{
- struct btrfs_fs_info *fs_info;
- struct bio_vec bvec;
- struct bvec_iter iter;
- struct btrfs_retry_complete done;
- u64 start;
- u64 offset = 0;
- u32 sectorsize;
- int nr_sectors;
- unsigned int pgoff;
- int csum_pos;
- bool uptodate = (err == 0);
- int ret;
- blk_status_t status;
-
- fs_info = BTRFS_I(inode)->root->fs_info;
- sectorsize = fs_info->sectorsize;
-
- err = BLK_STS_OK;
- start = io_bio->logical;
- done.inode = inode;
- io_bio->bio.bi_iter = io_bio->iter;
-
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
-
- pgoff = bvec.bv_offset;
-next_block:
- if (uptodate) {
- csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
- ret = __readpage_endio_check(inode, io_bio, csum_pos,
- bvec.bv_page, pgoff, start, sectorsize);
- if (likely(!ret))
- goto next;
- }
-try_again:
- done.uptodate = 0;
- done.start = start;
- init_completion(&done.done);
-
- status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
- pgoff, start, start + sectorsize - 1,
- io_bio->mirror_num, btrfs_retry_endio,
- &done);
- if (status) {
- err = status;
- goto next;
- }
-
- wait_for_completion_io(&done.done);
-
- if (!done.uptodate) {
- /* We might have another mirror, so try again */
- goto try_again;
- }
-next:
- offset += sectorsize;
- start += sectorsize;
-
- ASSERT(nr_sectors);
-
- nr_sectors--;
- if (nr_sectors) {
- pgoff += sectorsize;
- ASSERT(pgoff < PAGE_SIZE);
- goto next_block;
- }
- }
-
- return err;
-}
-
-static blk_status_t btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, blk_status_t err)
-{
- bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-
- if (skip_csum) {
- if (unlikely(err))
- return __btrfs_correct_data_nocsum(inode, io_bio);
- else
- return BLK_STS_OK;
- } else {
- return __btrfs_subio_endio_read(inode, io_bio, err);
- }
-}
-
-static void btrfs_endio_direct_read(struct bio *bio)
-{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct inode *inode = dip->inode;
- struct bio *dio_bio;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- blk_status_t err = bio->bi_status;
-
- if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
- err = btrfs_subio_endio_read(inode, io_bio, err);
-
- unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
- dip->logical_offset + dip->bytes - 1);
- dio_bio = dip->dio_bio;
-
- kfree(dip);
-
- dio_bio->bi_status = err;
- dio_end_io(dio_bio);
- btrfs_io_bio_free_csum(io_bio);
- bio_put(bio);
-}
-
-static void __endio_write_update_ordered(struct inode *inode,
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_extent *ordered = NULL;
struct btrfs_workqueue *wq;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
u64 last_offset;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+ if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
@@ -8455,9 +7704,9 @@
while (ordered_offset < offset + bytes) {
last_offset = ordered_offset;
if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate)) {
+ &ordered_offset,
+ ordered_bytes,
+ uptodate)) {
btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
NULL);
btrfs_queue_work(wq, &ordered->work);
@@ -8479,29 +7728,12 @@
}
}
-static void btrfs_endio_direct_write(struct bio *bio)
-{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct bio *dio_bio = dip->dio_bio;
-
- __endio_write_update_ordered(dip->inode, dip->logical_offset,
- dip->bytes, !bio->bi_status);
-
- kfree(dip);
-
- dio_bio->bi_status = bio->bi_status;
- dio_end_io(dio_bio);
- bio_put(bio);
-}
-
static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
struct bio *bio, u64 offset)
{
struct inode *inode = private_data;
- blk_status_t ret;
- ret = btrfs_csum_one_bio(inode, bio, offset, 1);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -8517,64 +7749,16 @@
(unsigned long long)bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
- if (dip->subio_endio)
- err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
-
- if (err) {
- /*
- * We want to perceive the errors flag being set before
- * decrementing the reference count. We don't need a barrier
- * since atomic operations with a return value are fully
- * ordered as per atomic_t.txt
- */
- dip->errors = 1;
+ if (bio_op(bio) == REQ_OP_READ) {
+ err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
+ !err);
}
- /* if there are more bios still pending for this dio, just exit */
- if (!atomic_dec_and_test(&dip->pending_bios))
- goto out;
+ if (err)
+ dip->dio_bio->bi_status = err;
- if (dip->errors) {
- bio_io_error(dip->orig_bio);
- } else {
- dip->dio_bio->bi_status = BLK_STS_OK;
- bio_endio(dip->orig_bio);
- }
-out:
bio_put(bio);
-}
-
-static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
- struct btrfs_dio_private *dip,
- struct bio *bio,
- u64 file_offset)
-{
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
- u16 csum_size;
- blk_status_t ret;
-
- /*
- * We load all the csum data we need when we submit
- * the first bio to reduce the csum tree search and
- * contention.
- */
- if (dip->logical_offset == file_offset) {
- ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
- file_offset);
- if (ret)
- return ret;
- }
-
- if (bio == dip->orig_bio)
- return 0;
-
- file_offset -= dip->logical_offset;
- file_offset >>= inode->i_sb->s_blocksize_bits;
- csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy);
- io_bio->csum = orig_io_bio->csum + csum_size * file_offset;
-
- return 0;
+ btrfs_dio_private_put(dip);
}
static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
@@ -8608,17 +7792,19 @@
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
if (ret)
goto err;
} else {
- ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
- file_offset);
- if (ret)
- goto err;
+ u64 csum_offset;
+
+ csum_offset = file_offset - dip->logical_offset;
+ csum_offset >>= inode->i_sb->s_blocksize_bits;
+ csum_offset *= btrfs_super_csum_size(fs_info->super_copy);
+ btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0, 0);
+ ret = btrfs_map_bio(fs_info, bio, 0);
err:
return ret;
}
@@ -8632,54 +7818,43 @@
loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
+ size_t dip_size;
struct btrfs_dio_private *dip;
- struct bio *bio;
- dip = kzalloc(sizeof(*dip), GFP_NOFS);
+ dip_size = sizeof(*dip);
+ if (!write && csum) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ size_t nblocks;
+
+ nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
+ dip_size += csum_size * nblocks;
+ }
+
+ dip = kzalloc(dip_size, GFP_NOFS);
if (!dip)
return NULL;
- bio = btrfs_bio_clone(dio_bio);
- bio->bi_private = dip;
- btrfs_io_bio(bio)->logical = file_offset;
-
- dip->private = dio_bio->bi_private;
dip->inode = inode;
dip->logical_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
- dip->orig_bio = bio;
dip->dio_bio = dio_bio;
- atomic_set(&dip->pending_bios, 1);
-
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- /*
- * Setting range start and end to the same value means that
- * no cleanup will happen in btrfs_direct_IO
- */
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
-
- bio->bi_end_io = btrfs_endio_direct_write;
- } else {
- bio->bi_end_io = btrfs_endio_direct_read;
- dip->subio_endio = btrfs_subio_endio_read;
- }
+ refcount_set(&dip->refs, 1);
return dip;
}
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ struct bio *dio_bio, loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
+ BTRFS_BLOCK_GROUP_RAID56_MASK);
struct btrfs_dio_private *dip;
struct bio *bio;
- struct bio *orig_bio;
u64 start_sector;
int async_submit = 0;
u64 submit_len;
@@ -8688,6 +7863,7 @@
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iomap->private;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -8696,93 +7872,89 @@
file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_status = BLK_STS_RESOURCE;
- dio_end_io(dio_bio);
- return;
+ bio_endio(dio_bio);
+ return BLK_QC_T_NONE;
}
- orig_bio = dip->orig_bio;
- start_sector = orig_bio->bi_iter.bi_sector;
- submit_len = orig_bio->bi_iter.bi_size;
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
- start_sector << 9, submit_len, &geom);
- if (ret)
- goto out_err;
-
- if (geom.len >= submit_len) {
- bio = orig_bio;
- dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
- goto submit;
+ if (!write && csum) {
+ /*
+ * Load the csums up front to reduce csum tree searches and
+ * contention when submitting bios.
+ */
+ status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset,
+ dip->csums);
+ if (status != BLK_STS_OK)
+ goto out_err;
}
- /* async crcs make it difficult to collect full stripe writes. */
- if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
- async_submit = 0;
- else
- async_submit = 1;
+ start_sector = dio_bio->bi_iter.bi_sector;
+ submit_len = dio_bio->bi_iter.bi_size;
- /* bio split */
- ASSERT(geom.len <= INT_MAX);
do {
+ ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio),
+ start_sector << 9, submit_len,
+ &geom);
+ if (ret) {
+ status = errno_to_blk_status(ret);
+ goto out_err;
+ }
+ ASSERT(geom.len <= INT_MAX);
+
clone_len = min_t(int, submit_len, geom.len);
/*
* This will never fail as it's passing GPF_NOFS and
* the allocation is backed by btrfs_bioset.
*/
- bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
- clone_len);
+ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
ASSERT(submit_len >= clone_len);
submit_len -= clone_len;
- if (submit_len == 0)
- break;
/*
* Increase the count before we submit the bio so we know
* the end IO handler won't happen before we increase the
* count. Otherwise, the dip might get freed before we're
* done setting it up.
+ *
+ * We transfer the initial reference to the last bio, so we
+ * don't need to increment the reference count for the last one.
*/
- atomic_inc(&dip->pending_bios);
+ if (submit_len > 0) {
+ refcount_inc(&dip->refs);
+ /*
+ * If we are submitting more than one bio, submit them
+ * all asynchronously. The exception is RAID 5 or 6, as
+ * asynchronous checksums make it difficult to collect
+ * full stripe writes.
+ */
+ if (!raid56)
+ async_submit = 1;
+ }
status = btrfs_submit_dio_bio(bio, inode, file_offset,
async_submit);
if (status) {
bio_put(bio);
- atomic_dec(&dip->pending_bios);
+ if (submit_len > 0)
+ refcount_dec(&dip->refs);
goto out_err;
}
+ dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
-
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
- start_sector << 9, submit_len, &geom);
- if (ret)
- goto out_err;
} while (submit_len > 0);
+ return BLK_QC_T_NONE;
-submit:
- status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
- if (!status)
- return;
-
- if (bio != orig_bio)
- bio_put(bio);
out_err:
- dip->errors = 1;
- /*
- * Before atomic variable goto zero, we must make sure dip->errors is
- * perceived to be set. This ordering is ensured by the fact that an
- * atomic operations with a return value are fully ordered as per
- * atomic_t.txt
- */
- if (atomic_dec_and_test(&dip->pending_bios))
- bio_io_error(dip->orig_bio);
+ dip->dio_bio->bi_status = status;
+ btrfs_dio_private_put(dip);
+ return BLK_QC_T_NONE;
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
@@ -8818,37 +7990,63 @@
return retval;
}
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned flags)
+{
+ /*
+ * Now if we're still in the context of our submitter we know we can't
+ * safely run generic_write_sync(), so clear our flag here so that the
+ * caller knows to follow up with a sync.
+ */
+ if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
+ current->journal_info = NULL;
+ return error;
+ }
+
+ if (error)
+ return error;
+
+ if (size) {
+ iocb->ki_flags |= IOCB_DSYNC;
+ return generic_write_sync(iocb, size);
+ }
+
+ return 0;
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_submit_direct,
+};
+
+static const struct iomap_dio_ops btrfs_sync_dops = {
+ .submit_io = btrfs_submit_direct,
+ .end_io = btrfs_maybe_fsync_end_io,
+};
+
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_data dio_data = { 0 };
struct extent_changeset *data_reserved = NULL;
loff_t offset = iocb->ki_pos;
size_t count = 0;
- int flags = 0;
- bool wakeup = true;
bool relock = false;
ssize_t ret;
- if (check_direct_IO(fs_info, iter, offset))
+ if (check_direct_IO(fs_info, iter, offset)) {
+ ASSERT(current->journal_info == NULL ||
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
+ current->journal_info = NULL;
return 0;
+ }
- inode_dio_begin(inode);
-
- /*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so
- * we need to flush the dirty pages again to make absolutely sure
- * that any outstanding dirty pages are on disk.
- */
count = iov_iter_count(iter);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset,
- offset + count - 1);
-
if (iov_iter_rw(iter) == WRITE) {
/*
* If the write DIO is beyond the EOF, we need update
@@ -8856,65 +8054,29 @@
* not unlock the i_mutex at this case.
*/
if (offset + count <= inode->i_size) {
- dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- offset, count);
- if (ret)
- goto out;
-
- /*
- * We need to know how many extents we reserved so that we can
- * do the accounting properly if we go over the number we
- * originally calculated. Abuse current->journal_info for this.
- */
- dio_data.reserve = round_up(count,
- fs_info->sectorsize);
- dio_data.unsubmitted_oe_range_start = (u64)offset;
- dio_data.unsubmitted_oe_range_end = (u64)offset;
- current->journal_info = &dio_data;
down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
}
- ret = __blockdev_direct_IO(iocb, inode,
- fs_info->fs_devices->latest_bdev,
- iter, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (iov_iter_rw(iter) == WRITE) {
+ /*
+ * We have are actually a sync iocb, so we need our fancy endio to know
+ * if we need to sync.
+ */
+ if (current->journal_info)
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_sync_dops, is_sync_kiocb(iocb));
+ else
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_dio_ops, is_sync_kiocb(iocb));
+
+ if (ret == -ENOTBLK)
+ ret = 0;
+
+ if (iov_iter_rw(iter) == WRITE)
up_read(&BTRFS_I(inode)->dio_sem);
- current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED) {
- if (dio_data.reserve)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, dio_data.reserve, true);
- /*
- * On error we might have left some ordered extents
- * without submitting corresponding bios for them, so
- * cleanup them up to avoid other tasks getting them
- * and waiting for them to complete forever.
- */
- if (dio_data.unsubmitted_oe_range_start <
- dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(inode,
- dio_data.unsubmitted_oe_range_start,
- dio_data.unsubmitted_oe_range_end -
- dio_data.unsubmitted_oe_range_start,
- false);
- } else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
- }
-out:
- if (wakeup)
- inode_dio_end(inode);
+
if (relock)
inode_lock(inode);
@@ -8922,25 +8084,33 @@
return ret;
}
-#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
-
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+ u64 start, u64 len)
{
int ret;
- ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+ ret = fiemap_prep(inode, fieinfo, start, &len, 0);
if (ret)
return ret;
- return extent_fiemap(inode, fieinfo, start, len);
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
int btrfs_readpage(struct file *file, struct page *page)
{
- struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btrfs_get_extent, 0);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ unsigned long bio_flags = 0;
+ struct bio *bio = NULL;
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
+ if (bio)
+ ret = submit_one_bio(bio, 0, bio_flags);
+ return ret;
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -8974,21 +8144,16 @@
return extent_writepages(mapping, wbc);
}
-static int
-btrfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void btrfs_readahead(struct readahead_control *rac)
{
- return extent_readpages(mapping, pages, nr_pages);
+ extent_readahead(rac);
}
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
- if (ret == 1) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- }
+ if (ret == 1)
+ detach_page_private(page);
return ret;
}
@@ -8999,18 +8164,45 @@
return __btrfs_releasepage(page, gfp_flags);
}
+#ifdef CONFIG_MIGRATION
+static int btrfs_migratepage(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ int ret;
+
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
+ if (ret != MIGRATEPAGE_SUCCESS)
+ return ret;
+
+ if (page_has_private(page))
+ attach_page_private(newpage, detach_page_private(page));
+
+ if (PagePrivate2(page)) {
+ ClearPagePrivate2(page);
+ SetPagePrivate2(newpage);
+ }
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- struct inode *inode = page->mapping->host;
- struct extent_io_tree *tree;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
u64 start;
u64 end;
- int inode_evicting = inode->i_state & I_FREEING;
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
* we have the page locked, so new writeback can't start,
@@ -9021,20 +8213,32 @@
*/
wait_on_page_writeback(page);
- tree = &BTRFS_I(inode)->io_tree;
- if (offset) {
+ /*
+ * For subpage case, we have call sites like
+ * btrfs_punch_hole_lock_range() which passes range not aligned to
+ * sectorsize.
+ * If the range doesn't cover the full page, we don't need to and
+ * shouldn't clear page extent mapped, as page->private can still
+ * record subpage dirty bits for other part of the range.
+ *
+ * For cases that can invalidate the full even the range doesn't
+ * cover the full page, like invalidating the last page, we're
+ * still safe to wait for ordered extent to finish.
+ */
+ if (!(offset == 0 && length == PAGE_SIZE)) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
-again:
+
start = page_start;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
- page_end - start + 1);
+again:
+ ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
- end = min(page_end, ordered->file_offset + ordered->len - 1);
+ end = min(page_end,
+ ordered->file_offset + ordered->num_bytes - 1);
/*
* IO on this page will never be started, so we need
* to account for any ordered extents now
@@ -9052,7 +8256,7 @@
struct btrfs_ordered_inode_tree *tree;
u64 new_len;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -9102,11 +8306,7 @@
}
ClearPageChecked(page);
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- }
+ detach_page_private(page);
}
/*
@@ -9159,8 +8359,8 @@
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
- reserved_space);
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ page_start, reserved_space);
if (!ret2) {
ret2 = file_update_time(vmf->vma->vm_file);
reserved = 1;
@@ -9197,7 +8397,7 @@
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -9207,9 +8407,9 @@
fs_info->sectorsize);
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(inode, data_reserved,
- page_start, PAGE_SIZE - reserved_space,
- true);
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, page_start,
+ PAGE_SIZE - reserved_space, true);
}
}
@@ -9224,7 +8424,7 @@
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 0, 0, &cached_state);
- ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
&cached_state);
if (ret2) {
unlock_extent_cached(io_tree, page_start, page_end,
@@ -9232,7 +8432,6 @@
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
- ret2 = 0;
/* page is wholly or partially inside EOF */
if (page_start + PAGE_SIZE > size)
@@ -9254,18 +8453,16 @@
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
- if (!ret2) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return VM_FAULT_LOCKED;
- }
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return VM_FAULT_LOCKED;
out_unlock:
unlock_page(page);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
sb_end_pagefault(inode->i_sb);
@@ -9371,7 +8568,7 @@
break;
}
- btrfs_block_rsv_release(fs_info, rsv, -1);
+ btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
@@ -9396,7 +8593,7 @@
ret = PTR_ERR(trans);
goto out;
}
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
}
if (trans) {
@@ -9479,6 +8676,7 @@
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+ ei->last_reflink_trans = 0;
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
@@ -9500,11 +8698,12 @@
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
extent_io_tree_init(fs_info, &ei->io_failure_tree,
IO_TREE_INODE_IO_FAILURE, inode);
+ extent_io_tree_init(fs_info, &ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT, inode);
ei->io_tree.track_uptodate = true;
ei->io_failure_tree.track_uptodate = true;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
- mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
@@ -9527,21 +8726,21 @@
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
-void btrfs_destroy_inode(struct inode *inode)
+void btrfs_destroy_inode(struct inode *vfs_inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
- WARN_ON(!hlist_empty(&inode->i_dentry));
- WARN_ON(inode->i_data.nrpages);
- WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
- WARN_ON(BTRFS_I(inode)->block_rsv.size);
- WARN_ON(BTRFS_I(inode)->outstanding_extents);
- WARN_ON(BTRFS_I(inode)->delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->csum_bytes);
- WARN_ON(BTRFS_I(inode)->defrag_bytes);
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
+ WARN_ON(vfs_inode->i_data.nrpages);
+ WARN_ON(inode->block_rsv.reserved);
+ WARN_ON(inode->block_rsv.size);
+ WARN_ON(inode->outstanding_extents);
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -9556,17 +8755,19 @@
if (!ordered)
break;
else {
- btrfs_err(fs_info,
+ btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
- ordered->file_offset, ordered->len);
+ ordered->file_offset, ordered->num_bytes);
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
}
- btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
+ btrfs_put_root(inode->root);
}
int btrfs_drop_inode(struct inode *inode)
@@ -9699,6 +8900,7 @@
int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
+ bool need_abort = false;
/*
* For non-subvolumes allow exchange only within one subvolume, in the
@@ -9761,6 +8963,7 @@
old_idx);
if (ret)
goto out_fail;
+ need_abort = true;
}
/* And now for the dest. */
@@ -9776,8 +8979,11 @@
new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
- if (ret)
+ if (ret) {
+ if (need_abort)
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
+ }
}
/* Update inode version and ctime/mtime. */
@@ -10222,7 +9428,9 @@
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root,
+ struct writeback_control *wbc, bool snapshot,
+ bool in_reclaim_context)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -10230,6 +9438,7 @@
struct list_head works;
struct list_head splice;
int ret = 0;
+ bool full_flush = wbc->nr_to_write == LONG_MAX;
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
@@ -10243,6 +9452,11 @@
list_move_tail(&binode->delalloc_inodes,
&root->delalloc_inodes);
+
+ if (in_reclaim_context &&
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+ continue;
+
inode = igrab(&binode->vfs_inode);
if (!inode) {
cond_resched_lock(&root->delalloc_lock);
@@ -10253,18 +9467,26 @@
if (snapshot)
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
&binode->runtime_flags);
- work = btrfs_alloc_delalloc_work(inode);
- if (!work) {
- iput(inode);
- ret = -ENOMEM;
- goto out;
+ if (full_flush) {
+ work = btrfs_alloc_delalloc_work(inode);
+ if (!work) {
+ iput(inode);
+ ret = -ENOMEM;
+ goto out;
+ }
+ list_add_tail(&work->list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ } else {
+ ret = sync_inode(inode, wbc);
+ if (!ret &&
+ test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = sync_inode(inode, wbc);
+ btrfs_add_delayed_iput(inode);
+ if (ret || wbc->nr_to_write <= 0)
+ goto out;
}
- list_add_tail(&work->list, &works);
- btrfs_queue_work(root->fs_info->flush_workers,
- &work->work);
- ret++;
- if (nr != -1 && ret >= nr)
- goto out;
cond_resched();
spin_lock(&root->delalloc_lock);
}
@@ -10288,20 +9510,29 @@
int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
+ struct writeback_control wbc = {
+ .nr_to_write = LONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = start_delalloc_inodes(root, -1, true);
- if (ret > 0)
- ret = 0;
- return ret;
+ return start_delalloc_inodes(root, &wbc, true, false);
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ bool in_reclaim_context)
{
+ struct writeback_control wbc = {
+ .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
struct btrfs_root *root;
struct list_head splice;
int ret;
@@ -10315,23 +9546,25 @@
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
while (!list_empty(&splice) && nr) {
+ /*
+ * Reset nr_to_write here so we know that we're doing a full
+ * flush.
+ */
+ if (nr == U64_MAX)
+ wbc.nr_to_write = LONG_MAX;
+
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
- root = btrfs_grab_fs_root(root);
+ root = btrfs_grab_root(root);
BUG_ON(!root);
list_move_tail(&root->delalloc_root,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, nr, false);
- btrfs_put_fs_root(root);
- if (ret < 0)
+ ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
+ btrfs_put_root(root);
+ if (ret < 0 || wbc.nr_to_write <= 0)
goto out;
-
- if (nr != -1) {
- nr -= ret;
- WARN_ON(nr < 0);
- }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
@@ -10402,7 +9635,6 @@
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
@@ -10467,6 +9699,65 @@
return err;
}
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ struct btrfs_trans_handle *trans_in,
+ struct inode *inode, struct btrfs_key *ins,
+ u64 file_offset)
+{
+ struct btrfs_file_extent_item stack_fi;
+ struct btrfs_replace_extent_info extent_info;
+ struct btrfs_trans_handle *trans = trans_in;
+ struct btrfs_path *path;
+ u64 start = ins->objectid;
+ u64 len = ins->offset;
+ int ret;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (trans) {
+ ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
+ file_offset, &stack_fi, ret);
+ if (ret)
+ return ERR_PTR(ret);
+ return trans;
+ }
+
+ extent_info.disk_offset = start;
+ extent_info.disk_len = len;
+ extent_info.data_offset = 0;
+ extent_info.data_len = len;
+ extent_info.file_offset = file_offset;
+ extent_info.extent_buf = (char *)&stack_fi;
+ extent_info.is_new_extent = true;
+ extent_info.qgroup_reserved = ret;
+ extent_info.insertions = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
+ file_offset + len - 1, &extent_info,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return trans;
+}
+
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
@@ -10489,14 +9780,6 @@
if (trans)
own_trans = false;
while (num_bytes > 0) {
- if (own_trans) {
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- }
-
cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
@@ -10508,11 +9791,8 @@
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
min_size, 0, *alloc_hint, &ins, 1, 0);
- if (ret) {
- if (own_trans)
- btrfs_end_transaction(trans);
+ if (ret)
break;
- }
/*
* We've reserved this space, and thus converted it from
@@ -10522,20 +9802,20 @@
* clear_offset by our extent size.
*/
clear_offset += ins.offset;
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- ret = insert_reserved_file_extent(trans, inode,
- cur_offset, ins.objectid,
- ins.offset, ins.offset,
- ins.offset, 0, 0, 0,
- BTRFS_FILE_EXTENT_PREALLOC);
- if (ret) {
+ trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+ /*
+ * Now that we inserted the prealloc extent we can finally
+ * decrement the number of reservations in the block group.
+ * If we did it before, we could race with relocation and have
+ * relocation miss the reserved extent, making it fail later.
+ */
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
- btrfs_abort_transaction(trans, ret);
- if (own_trans)
- btrfs_end_transaction(trans);
break;
}
@@ -10556,7 +9836,6 @@
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
@@ -10587,7 +9866,7 @@
else
i_size = cur_offset;
i_size_write(inode, i_size);
- btrfs_ordered_update_i_size(inode, i_size, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
}
ret = btrfs_update_inode(trans, root, inode);
@@ -10599,11 +9878,13 @@
break;
}
- if (own_trans)
+ if (own_trans) {
btrfs_end_transaction(trans);
+ trans = NULL;
+ }
}
if (clear_offset < end)
- btrfs_free_reserved_data_space(inode, NULL, clear_offset,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
end - clear_offset + 1);
return ret;
}
@@ -10679,7 +9960,6 @@
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
if (ret)
@@ -10747,6 +10027,7 @@
sp->ptr = ptr;
sp->inode = inode;
sp->is_block_group = is_block_group;
+ sp->bg_extent_count = 1;
spin_lock(&fs_info->swapfile_pins_lock);
p = &fs_info->swapfile_pins.rb_node;
@@ -10760,6 +10041,8 @@
(sp->ptr == entry->ptr && sp->inode > entry->inode)) {
p = &(*p)->rb_right;
} else {
+ if (is_block_group)
+ entry->bg_extent_count++;
spin_unlock(&fs_info->swapfile_pins_lock);
kfree(sp);
return 1;
@@ -10785,8 +10068,11 @@
sp = rb_entry(node, struct btrfs_swapfile_pin, node);
if (sp->inode == inode) {
rb_erase(&sp->node, &fs_info->swapfile_pins);
- if (sp->is_block_group)
+ if (sp->is_block_group) {
+ btrfs_dec_block_group_swap_extents(sp->ptr,
+ sp->bg_extent_count);
btrfs_put_block_group(sp->ptr);
+ }
kfree(sp);
}
node = next;
@@ -10808,9 +10094,19 @@
struct btrfs_swap_info *bsi)
{
unsigned long nr_pages;
+ unsigned long max_pages;
u64 first_ppage, first_ppage_reported, next_ppage;
int ret;
+ /*
+ * Our swapfile may have had its size extended after the swap header was
+ * written. In that case activating the swapfile should not go beyond
+ * the max size set in the swap header.
+ */
+ if (bsi->nr_pages >= sis->max)
+ return 0;
+
+ max_pages = sis->max - bsi->nr_pages;
first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
PAGE_SIZE) >> PAGE_SHIFT;
@@ -10818,6 +10114,7 @@
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
first_ppage_reported = first_ppage;
if (bsi->start == 0)
@@ -10847,7 +10144,8 @@
sector_t *span)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
@@ -10886,25 +10184,39 @@
/*
* Balance or device remove/replace/resize can move stuff around from
- * under us. The EXCL_OP flag makes sure they aren't running/won't run
- * concurrently while we are mapping the swap extents, and
- * fs_info->swapfile_pins prevents them from running while the swap file
- * is active and moving the extents. Note that this also prevents a
- * concurrent device add which isn't actually necessary, but it's not
+ * under us. The exclop protection makes sure they aren't running/won't
+ * run concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap
+ * file is active and moving the extents. Note that this also prevents
+ * a concurrent device add which isn't actually necessary, but it's not
* really worth the trouble to allow it.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
}
+
+ /*
+ * Prevent snapshot creation while we are activating the swap file.
+ * We do not want to race with snapshot creation. If snapshot creation
+ * already started before we bumped nr_swapfiles from 0 to 1 and
+ * completes before the first write into the swap file after it is
+ * activated, than that write would fallback to COW.
+ */
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because snapshot creation is in progress");
+ return -EINVAL;
+ }
/*
* Snapshots can create extents which require COW even if NODATACOW is
* set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents.
*/
- atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles);
+ atomic_inc(&root->nr_swapfiles);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
@@ -10912,10 +10224,10 @@
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
u64 len = isize - start;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -11001,6 +10313,17 @@
goto out;
}
+ if (!btrfs_inc_block_group_swap_extents(bg)) {
+ btrfs_warn(fs_info,
+ "block group for swapfile at %llu is read-only%s",
+ bg->start,
+ atomic_read(&fs_info->scrubs_running) ?
+ " (scrub running)" : "");
+ btrfs_put_block_group(bg);
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = btrfs_add_swapfile_pin(inode, bg, true);
if (ret) {
btrfs_put_block_group(bg);
@@ -11039,7 +10362,9 @@
if (ret)
btrfs_swap_deactivate(file);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+
+ btrfs_exclop_finish(fs_info);
if (ret)
return ret;
@@ -11083,11 +10408,6 @@
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
};
-static const struct inode_operations btrfs_dir_ro_inode_operations = {
- .lookup = btrfs_lookup,
- .permission = btrfs_permission,
- .update_time = btrfs_update_time,
-};
static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
@@ -11102,12 +10422,6 @@
.fsync = btrfs_sync_file,
};
-static const struct extent_io_ops btrfs_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btrfs_submit_bio_hook,
- .readpage_end_io_hook = btrfs_readpage_end_io_hook,
-};
-
/*
* btrfs doesn't support the bmap operation because swapfiles
* use bmap to make a mapping of extents in the file. They assume
@@ -11124,10 +10438,13 @@
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
- .readpages = btrfs_readpages,
- .direct_IO = btrfs_direct_IO,
+ .readahead = btrfs_readahead,
+ .direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
+#ifdef CONFIG_MIGRATION
+ .migratepage = btrfs_migratepage,
+#endif
.set_page_dirty = btrfs_set_page_dirty,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e9d3eb7..b5e9bfe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -28,6 +28,7 @@
#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
+#include "export.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
@@ -86,10 +87,6 @@
struct btrfs_ioctl_send_args_32)
#endif
-static int btrfs_clone(struct inode *src, struct inode *inode,
- u64 off, u64 olen, u64 olen_aligned, u64 destoff,
- int no_time_update);
-
/* Mask out flags that are inappropriate for the given type of inode. */
static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
unsigned int flags)
@@ -381,6 +378,18 @@
return 0;
}
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type);
+}
+
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
+
/*
* Set the xflags from the internal inode flags. The remaining items of fsxattr
* are zeroed.
@@ -493,10 +502,9 @@
return put_user(inode->i_generation, arg);
}
-static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
+static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_device *device;
struct request_queue *q;
struct fstrim_range range;
@@ -555,7 +563,7 @@
return 0;
}
-int btrfs_is_empty_uuid(u8 *uuid)
+int __pure btrfs_is_empty_uuid(u8 *uuid)
{
int i;
@@ -569,7 +577,6 @@
static noinline int create_subvol(struct inode *dir,
struct dentry *dentry,
const char *name, int namelen,
- u64 *async_transid,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -585,10 +592,10 @@
struct inode *inode;
int ret;
int err;
+ dev_t anon_dev = 0;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
- uuid_le new_uuid;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
@@ -598,6 +605,10 @@
if (ret)
goto fail_free;
+ ret = get_anon_bdev(&anon_dev);
+ if (ret < 0)
+ goto fail_free;
+
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
* screwed up since it assumes subvolume qgroup's level to be 0.
@@ -619,7 +630,7 @@
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
goto fail_free;
}
trans->block_rsv = &block_rsv;
@@ -629,7 +640,8 @@
if (ret)
goto fail;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -658,8 +670,7 @@
btrfs_set_root_generation_v2(root_item,
btrfs_root_generation(root_item));
- uuid_le_gen(&new_uuid);
- memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(root_item->uuid);
btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
root_item->ctime = root_item->otime;
@@ -693,16 +704,20 @@
leaf = NULL;
key.offset = (u64)-1;
- new_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) {
+ free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
goto fail;
}
+ /* Freeing will be done in btrfs_put_root() of new_root */
+ anon_dev = 0;
btrfs_record_root_in_trans(trans, new_root);
ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
+ btrfs_put_root(new_root);
if (ret) {
/* We potentially lose an unused inode item here */
btrfs_abort_transaction(trans, ret);
@@ -752,16 +767,9 @@
kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
- if (async_transid) {
- *async_transid = trans->transid;
- err = btrfs_commit_transaction_async(trans, 1);
- if (err)
- err = btrfs_commit_transaction(trans);
- } else {
- err = btrfs_commit_transaction(trans);
- }
+ err = btrfs_commit_transaction(trans);
if (err && !ret)
ret = err;
@@ -774,13 +782,14 @@
return ret;
fail_free:
+ if (anon_dev)
+ free_anon_bdev(anon_dev);
kfree(root_item);
return ret;
}
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
- struct dentry *dentry,
- u64 *async_transid, bool readonly,
+ struct dentry *dentry, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -788,9 +797,8 @@
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
int ret;
- bool snapshot_force_cow = false;
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
if (atomic_read(&root->nr_swapfiles)) {
@@ -803,6 +811,9 @@
if (!pending_snapshot)
return -ENOMEM;
+ ret = get_anon_bdev(&pending_snapshot->anon_dev);
+ if (ret < 0)
+ goto free_pending;
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
GFP_KERNEL);
pending_snapshot->path = btrfs_alloc_path();
@@ -811,31 +822,6 @@
goto free_pending;
}
- /*
- * Force new buffered writes to reserve space even when NOCOW is
- * possible. This is to avoid later writeback (running dealloc) to
- * fallback to COW mode and unexpectedly fail with ENOSPC.
- */
- atomic_inc(&root->will_be_snapshotted);
- smp_mb__after_atomic();
- /* wait for no snapshot writes */
- wait_event(root->subv_writers->wait,
- percpu_counter_sum(&root->subv_writers->counter) == 0);
-
- ret = btrfs_start_delalloc_snapshot(root);
- if (ret)
- goto dec_and_free;
-
- /*
- * All previous writes have started writeback in NOCOW mode, so now
- * we force future writes to fallback to COW mode during snapshot
- * creation.
- */
- atomic_inc(&root->snapshot_force_cow);
- snapshot_force_cow = true;
-
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
-
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
@@ -850,7 +836,7 @@
&pending_snapshot->block_rsv, 8,
false);
if (ret)
- goto dec_and_free;
+ goto free_pending;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -868,14 +854,8 @@
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
spin_unlock(&fs_info->trans_lock);
- if (async_transid) {
- *async_transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans, 1);
- if (ret)
- ret = btrfs_commit_transaction(trans);
- } else {
- ret = btrfs_commit_transaction(trans);
- }
+
+ ret = btrfs_commit_transaction(trans);
if (ret)
goto fail;
@@ -895,14 +875,16 @@
d_instantiate(dentry, inode);
ret = 0;
+ pending_snapshot->anon_dev = 0;
fail:
- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
-dec_and_free:
- if (snapshot_force_cow)
- atomic_dec(&root->snapshot_force_cow);
- if (atomic_dec_and_test(&root->will_be_snapshotted))
- wake_up_var(&root->will_be_snapshotted);
+ /* Prevent double freeing of anon_dev */
+ if (ret && pending_snapshot->snap)
+ pending_snapshot->snap->anon_dev = 0;
+ btrfs_put_root(pending_snapshot->snap);
+ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
free_pending:
+ if (pending_snapshot->anon_dev)
+ free_anon_bdev(pending_snapshot->anon_dev);
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
kfree(pending_snapshot);
@@ -980,7 +962,7 @@
static noinline int btrfs_mksubvol(const struct path *parent,
const char *name, int namelen,
struct btrfs_root *snap_src,
- u64 *async_transid, bool readonly,
+ bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
struct inode *dir = d_inode(parent->dentry);
@@ -1016,13 +998,11 @@
if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
goto out_up_read;
- if (snap_src) {
- error = create_snapshot(snap_src, dir, dentry,
- async_transid, readonly, inherit);
- } else {
- error = create_subvol(dir, dentry, name, namelen,
- async_transid, inherit);
- }
+ if (snap_src)
+ error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
+ else
+ error = create_subvol(dir, dentry, name, namelen, inherit);
+
if (!error)
fsnotify_mkdir(dir, dentry);
out_up_read:
@@ -1034,6 +1014,45 @@
return error;
}
+static noinline int btrfs_mksnapshot(const struct path *parent,
+ const char *name, int namelen,
+ struct btrfs_root *root,
+ bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ int ret;
+ bool snapshot_force_cow = false;
+
+ /*
+ * Force new buffered writes to reserve space even when NOCOW is
+ * possible. This is to avoid later writeback (running dealloc) to
+ * fallback to COW mode and unexpectedly fail with ENOSPC.
+ */
+ btrfs_drew_read_lock(&root->snapshot_lock);
+
+ ret = btrfs_start_delalloc_snapshot(root);
+ if (ret)
+ goto out;
+
+ /*
+ * All previous writes have started writeback in NOCOW mode, so now
+ * we force future writes to fallback to COW mode during snapshot
+ * creation.
+ */
+ atomic_inc(&root->snapshot_force_cow);
+ snapshot_force_cow = true;
+
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+ ret = btrfs_mksubvol(parent, name, namelen,
+ root, readonly, inherit);
+out:
+ if (snapshot_force_cow)
+ atomic_dec(&root->snapshot_force_cow);
+ btrfs_drew_read_unlock(&root->snapshot_lock);
+ return ret;
+}
+
/*
* When we're defragging a range, we don't want to kick it off again
* if it is really just waiting for delalloc to send it down.
@@ -1155,7 +1174,7 @@
/* get the big lock and read metadata off disk */
lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
@@ -1284,7 +1303,7 @@
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
start, page_cnt << PAGE_SHIFT);
if (ret)
return ret;
@@ -1305,7 +1324,7 @@
while (1) {
lock_extent_bits(tree, page_start, page_end,
&cached_state);
- ordered = btrfs_lookup_ordered_extent(inode,
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
page_start);
unlock_extent_cached(tree, page_start, page_end,
&cached_state);
@@ -1313,7 +1332,7 @@
break;
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
lock_page(page);
/*
@@ -1384,7 +1403,7 @@
struct extent_map *em;
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
- page_end - search_start, 0);
+ page_end - search_start);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_unlock_range;
@@ -1407,7 +1426,7 @@
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start, (page_cnt - i_done) << PAGE_SHIFT, true);
}
@@ -1438,7 +1457,7 @@
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start, page_cnt << PAGE_SHIFT, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
@@ -1478,7 +1497,7 @@
return -EINVAL;
if (do_compress) {
- if (range->compress_type > BTRFS_COMPRESS_TYPES)
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
return -EINVAL;
if (range->compress_type)
compress_type = range->compress_type;
@@ -1681,7 +1700,7 @@
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
@@ -1775,9 +1794,6 @@
new_size = round_down(new_size, fs_info->sectorsize);
- btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
- rcu_str_deref(device->name), new_size);
-
if (new_size > old_size) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -1790,17 +1806,22 @@
ret = btrfs_shrink_device(device, new_size);
} /* equal, nothing need to do */
+ if (ret == 0 && new_size != old_size)
+ btrfs_info_in_rcu(fs_info,
+ "resize device %s (devid %llu) from %llu to %llu",
+ rcu_str_deref(device->name), device->devid,
+ old_size, new_size);
out_free:
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
mnt_drop_write_file(file);
return ret;
}
-static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
+static noinline int __btrfs_ioctl_snap_create(struct file *file,
const char *name, unsigned long fd, int subvol,
- u64 *transid, bool readonly,
+ bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
int namelen;
@@ -1827,7 +1848,7 @@
if (subvol) {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
- NULL, transid, readonly, inherit);
+ NULL, readonly, inherit);
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
@@ -1848,9 +1869,9 @@
*/
ret = -EPERM;
} else {
- ret = btrfs_mksubvol(&file->f_path, name, namelen,
+ ret = btrfs_mksnapshot(&file->f_path, name, namelen,
BTRFS_I(src_inode)->root,
- transid, readonly, inherit);
+ readonly, inherit);
}
fdput(src);
}
@@ -1874,9 +1895,8 @@
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
- vol_args->fd, subvol,
- NULL, false, NULL);
+ ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
+ subvol, false, NULL);
kfree(vol_args);
return ret;
@@ -1887,8 +1907,6 @@
{
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
- u64 transid = 0;
- u64 *ptr = NULL;
bool readonly = false;
struct btrfs_qgroup_inherit *inherit = NULL;
@@ -1900,22 +1918,11 @@
return PTR_ERR(vol_args);
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (vol_args->flags &
- ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
- BTRFS_SUBVOL_QGROUP_INHERIT)) {
+ if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto free_args;
}
- if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) {
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- btrfs_warn(fs_info,
-"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7");
-
- ptr = &transid;
- }
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
@@ -1947,18 +1954,10 @@
}
}
- ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
- vol_args->fd, subvol, ptr,
- readonly, inherit);
+ ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
+ subvol, readonly, inherit);
if (ret)
goto free_inherit;
-
- if (ptr && copy_to_user(arg +
- offsetof(struct btrfs_ioctl_vol_args_v2,
- transid),
- ptr, sizeof(*ptr)))
- ret = -EFAULT;
-
free_inherit:
kfree(inherit);
free_args:
@@ -2017,11 +2016,6 @@
goto out_drop_write;
}
- if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
- ret = -EINVAL;
- goto out_drop_write;
- }
-
if (flags & ~BTRFS_SUBVOL_RDONLY) {
ret = -EOPNOTSUPP;
goto out_drop_write;
@@ -2177,7 +2171,7 @@
* problem. Otherwise we'll fault and then copy the buffer in
* properly this next time through
*/
- if (probe_user_write(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
ret = 0;
goto out;
}
@@ -2264,12 +2258,9 @@
if (sk->tree_id == 0) {
/* search the root of the inode that was passed */
- root = BTRFS_I(inode)->root;
+ root = btrfs_grab_root(BTRFS_I(inode)->root);
} else {
- key.objectid = sk->tree_id;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(info, &key);
+ root = btrfs_get_fs_root(info, sk->tree_id, true);
if (IS_ERR(root)) {
btrfs_free_path(path);
return PTR_ERR(root);
@@ -2303,6 +2294,7 @@
ret = 0;
err:
sk->nr_items = num_found;
+ btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}
@@ -2406,12 +2398,10 @@
ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
- key.objectid = tree_id;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(info, &key);
+ root = btrfs_get_fs_root(info, tree_id, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
+ root = NULL;
goto out;
}
@@ -2462,6 +2452,7 @@
name[total_len] = '\0';
ret = 0;
out:
+ btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}
@@ -2478,7 +2469,7 @@
unsigned long item_len;
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
- struct btrfs_root *root;
+ struct btrfs_root *root = NULL;
struct btrfs_path *path;
struct btrfs_key key, key2;
struct extent_buffer *leaf;
@@ -2500,10 +2491,7 @@
if (dirid != upper_limit.objectid) {
ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
- key.objectid = treeid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(fs_info, &key);
+ root = btrfs_get_fs_root(fs_info, treeid, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out;
@@ -2515,15 +2503,15 @@
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- goto out;
+ goto out_put;
} else if (ret > 0) {
ret = btrfs_previous_item(root, path, dirid,
BTRFS_INODE_REF_KEY);
if (ret < 0) {
- goto out;
+ goto out_put;
} else if (ret > 0) {
ret = -ENOENT;
- goto out;
+ goto out_put;
}
}
@@ -2537,7 +2525,7 @@
total_len += len + 1;
if (ptr < args->path) {
ret = -ENAMETOOLONG;
- goto out;
+ goto out_put;
}
*(ptr + len) = '/';
@@ -2548,10 +2536,10 @@
ret = btrfs_previous_item(root, path, dirid,
BTRFS_INODE_ITEM_KEY);
if (ret < 0) {
- goto out;
+ goto out_put;
} else if (ret > 0) {
ret = -ENOENT;
- goto out;
+ goto out_put;
}
leaf = path->nodes[0];
@@ -2559,26 +2547,26 @@
btrfs_item_key_to_cpu(leaf, &key2, slot);
if (key2.objectid != dirid) {
ret = -ENOENT;
- goto out;
+ goto out_put;
}
- temp_inode = btrfs_iget(sb, &key2, root, NULL);
+ temp_inode = btrfs_iget(sb, key2.objectid, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
- goto out;
+ goto out_put;
}
ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
iput(temp_inode);
if (ret) {
ret = -EACCES;
- goto out;
+ goto out_put;
}
if (key.offset == upper_limit.objectid)
break;
if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
ret = -EACCES;
- goto out;
+ goto out_put;
}
btrfs_release_path(path);
@@ -2589,15 +2577,16 @@
memmove(args->path, ptr, total_len);
args->path[total_len] = '\0';
+ btrfs_put_root(root);
+ root = NULL;
btrfs_release_path(path);
}
/* Get the bottom subvolume's name from ROOT_REF */
- root = fs_info->tree_root;
key.objectid = treeid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = args->treeid;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (ret > 0) {
@@ -2624,6 +2613,8 @@
read_extent_buffer(leaf, args->name, item_off, item_len);
args->name[item_len] = 0;
+out_put:
+ btrfs_put_root(root);
out:
btrfs_free_path(path);
return ret;
@@ -2746,12 +2737,10 @@
/* Get root_item of inode's subvolume */
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(fs_info, &key);
+ root = btrfs_get_fs_root(fs_info, key.objectid, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
- goto out;
+ goto out_free;
}
root_item = &root->root_item;
@@ -2784,16 +2773,14 @@
if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
/* Search root tree for ROOT_BACKREF of this subvolume */
- root = fs_info->tree_root;
-
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(fs_info->tree_root, path);
if (ret < 0) {
goto out;
} else if (ret > 0) {
@@ -2828,8 +2815,10 @@
ret = -EFAULT;
out:
+ btrfs_put_root(root);
+out_free:
btrfs_free_path(path);
- kzfree(subvol_info);
+ kfree(subvol_info);
return ret;
}
@@ -2931,7 +2920,8 @@
}
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
- void __user *arg)
+ void __user *arg,
+ bool destroy_v2)
{
struct dentry *parent = file->f_path.dentry;
struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
@@ -2940,34 +2930,120 @@
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
- struct btrfs_ioctl_vol_args *vol_args;
- int namelen;
+ struct btrfs_ioctl_vol_args *vol_args = NULL;
+ struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+ char *subvol_name, *subvol_name_ptr = NULL;
+ int subvol_namelen;
int err = 0;
+ bool destroy_parent = false;
- if (!S_ISDIR(dir->i_mode))
- return -ENOTDIR;
+ if (destroy_v2) {
+ vol_args2 = memdup_user(arg, sizeof(*vol_args2));
+ if (IS_ERR(vol_args2))
+ return PTR_ERR(vol_args2);
- vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- namelen = strlen(vol_args->name);
- if (strchr(vol_args->name, '/') ||
- strncmp(vol_args->name, "..", namelen) == 0) {
- err = -EINVAL;
- goto out;
+ /*
+ * If SPEC_BY_ID is not set, we are looking for the subvolume by
+ * name, same as v1 currently does.
+ */
+ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
+ vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
+ subvol_name = vol_args2->name;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+ } else {
+ if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+
+ dentry = btrfs_get_dentry(fs_info->sb,
+ BTRFS_FIRST_FREE_OBJECTID,
+ vol_args2->subvolid, 0, 0);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_drop_write;
+ }
+
+ /*
+ * Change the default parent since the subvolume being
+ * deleted can be outside of the current mount point.
+ */
+ parent = btrfs_get_parent(dentry);
+
+ /*
+ * At this point dentry->d_name can point to '/' if the
+ * subvolume we want to destroy is outsite of the
+ * current mount point, so we need to release the
+ * current dentry and execute the lookup to return a new
+ * one with ->d_name pointing to the
+ * <mount point>/subvol_name.
+ */
+ dput(dentry);
+ if (IS_ERR(parent)) {
+ err = PTR_ERR(parent);
+ goto out_drop_write;
+ }
+ dir = d_inode(parent);
+
+ /*
+ * If v2 was used with SPEC_BY_ID, a new parent was
+ * allocated since the subvolume can be outside of the
+ * current mount point. Later on we need to release this
+ * new parent dentry.
+ */
+ destroy_parent = true;
+
+ subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
+ fs_info, vol_args2->subvolid);
+ if (IS_ERR(subvol_name_ptr)) {
+ err = PTR_ERR(subvol_name_ptr);
+ goto free_parent;
+ }
+ /* subvol_name_ptr is already NULL termined */
+ subvol_name = (char *)kbasename(subvol_name_ptr);
+ }
+ } else {
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
+ subvol_name = vol_args->name;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
}
- err = mnt_want_write_file(file);
- if (err)
- goto out;
+ subvol_namelen = strlen(subvol_name);
+ if (strchr(subvol_name, '/') ||
+ strncmp(subvol_name, "..", subvol_namelen) == 0) {
+ err = -EINVAL;
+ goto free_subvol_name;
+ }
+
+ if (!S_ISDIR(dir->i_mode)) {
+ err = -ENOTDIR;
+ goto free_subvol_name;
+ }
err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (err == -EINTR)
- goto out_drop_write;
- dentry = lookup_one_len(vol_args->name, parent, namelen);
+ goto free_subvol_name;
+ dentry = lookup_one_len(subvol_name, parent, subvol_namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_unlock_dir;
@@ -3027,18 +3103,22 @@
inode_lock(inode);
err = btrfs_delete_subvolume(dir, dentry);
inode_unlock(inode);
- if (!err) {
- fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
- }
+ if (!err)
+ d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
out_unlock_dir:
inode_unlock(dir);
+free_subvol_name:
+ kfree(subvol_name_ptr);
+free_parent:
+ if (destroy_parent)
+ dput(parent);
out_drop_write:
mnt_drop_write_file(file);
out:
+ kfree(vol_args2);
kfree(vol_args);
return err;
}
@@ -3123,7 +3203,7 @@
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
vol_args = memdup_user(arg, sizeof(*vol_args));
@@ -3140,7 +3220,7 @@
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -3164,13 +3244,12 @@
goto err_drop;
}
- /* Check for compatibility reject unknown flags */
- if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
+ if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
@@ -3181,7 +3260,7 @@
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -3212,7 +3291,7 @@
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -3230,7 +3309,7 @@
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out_drop_write:
mnt_drop_write_file(file);
@@ -3243,11 +3322,15 @@
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u64 flags_in;
int ret = 0;
- fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
- if (!fi_args)
- return -ENOMEM;
+ fi_args = memdup_user(arg, sizeof(*fi_args));
+ if (IS_ERR(fi_args))
+ return PTR_ERR(fi_args);
+
+ flags_in = fi_args->flags;
+ memset(fi_args, 0, sizeof(*fi_args));
rcu_read_lock();
fi_args->num_devices = fs_devices->num_devices;
@@ -3263,6 +3346,23 @@
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
+ if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
+ fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
+ fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
+ fi_args->generation = fs_info->generation;
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
+ memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
+ sizeof(fi_args->metadata_uuid));
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
+ }
+
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
@@ -3315,735 +3415,6 @@
return ret;
}
-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-}
-
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- if (inode1 < inode2) {
- swap(inode1, inode2);
- swap(loff1, loff2);
- } else if (inode1 == inode2 && loff2 < loff1) {
- swap(loff1, loff2);
- }
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-}
-
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
- struct inode *dst, u64 dst_loff)
-{
- const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
- int ret;
-
- /*
- * Lock destination range to serialize with concurrent readpages() and
- * source range to serialize with relocation.
- */
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
- ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
-
- return ret;
-}
-
-#define BTRFS_MAX_DEDUPE_LEN SZ_16M
-
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
- struct inode *dst, u64 dst_loff)
-{
- int ret;
- u64 i, tail_len, chunk_count;
- struct btrfs_root *root_dst = BTRFS_I(dst)->root;
-
- spin_lock(&root_dst->root_item_lock);
- if (root_dst->send_in_progress) {
- btrfs_warn_rl(root_dst->fs_info,
-"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
- root_dst->root_key.objectid,
- root_dst->send_in_progress);
- spin_unlock(&root_dst->root_item_lock);
- return -EAGAIN;
- }
- root_dst->dedupe_in_progress++;
- spin_unlock(&root_dst->root_item_lock);
-
- tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
- chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
-
- for (i = 0; i < chunk_count; i++) {
- ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff);
- if (ret)
- goto out;
-
- loff += BTRFS_MAX_DEDUPE_LEN;
- dst_loff += BTRFS_MAX_DEDUPE_LEN;
- }
-
- if (tail_len > 0)
- ret = btrfs_extent_same_range(src, loff, tail_len, dst,
- dst_loff);
-out:
- spin_lock(&root_dst->root_item_lock);
- root_dst->dedupe_in_progress--;
- spin_unlock(&root_dst->root_item_lock);
-
- return ret;
-}
-
-static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
- struct inode *inode,
- u64 endoff,
- const u64 destoff,
- const u64 olen,
- int no_time_update)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret;
-
- inode_inc_iversion(inode);
- if (!no_time_update)
- inode->i_mtime = inode->i_ctime = current_time(inode);
- /*
- * We round up to the block size at eof when determining which
- * extents to clone above, but shouldn't round up the file size.
- */
- if (endoff > destoff + olen)
- endoff = destoff + olen;
- if (endoff > inode->i_size)
- btrfs_i_size_write(BTRFS_I(inode), endoff);
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- goto out;
- }
- ret = btrfs_end_transaction(trans);
-out:
- return ret;
-}
-
-/*
- * Make sure we do not end up inserting an inline extent into a file that has
- * already other (non-inline) extents. If a file has an inline extent it can
- * not have any other extents and the (single) inline extent must start at the
- * file offset 0. Failing to respect these rules will lead to file corruption,
- * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
- *
- * We can have extents that have been already written to disk or we can have
- * dirty ranges still in delalloc, in which case the extent maps and items are
- * created only when we run delalloc, and the delalloc ranges might fall outside
- * the range we are currently locking in the inode's io tree. So we check the
- * inode's i_size because of that (i_size updates are done while holding the
- * i_mutex, which we are holding here).
- * We also check to see if the inode has a size not greater than "datal" but has
- * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
- * protected against such concurrent fallocate calls by the i_mutex).
- *
- * If the file has no extents but a size greater than datal, do not allow the
- * copy because we would need turn the inline extent into a non-inline one (even
- * with NO_HOLES enabled). If we find our destination inode only has one inline
- * extent, just overwrite it with the source inline extent if its size is less
- * than the source extent's size, or we could copy the source inline extent's
- * data into the destination inode's inline extent if the later is greater then
- * the former.
- */
-static int clone_copy_inline_extent(struct inode *dst,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_key *new_key,
- const u64 drop_start,
- const u64 datal,
- const u64 skip,
- const u64 size,
- char *inline_data)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
- struct btrfs_root *root = BTRFS_I(dst)->root;
- const u64 aligned_end = ALIGN(new_key->offset + datal,
- fs_info->sectorsize);
- int ret;
- struct btrfs_key key;
-
- if (new_key->offset > 0)
- return -EOPNOTSUPP;
-
- key.objectid = btrfs_ino(BTRFS_I(dst));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- goto copy_inline_extent;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
- key.type == BTRFS_EXTENT_DATA_KEY) {
- ASSERT(key.offset > 0);
- return -EOPNOTSUPP;
- }
- } else if (i_size_read(dst) <= datal) {
- struct btrfs_file_extent_item *ei;
- u64 ext_len;
-
- /*
- * If the file size is <= datal, make sure there are no other
- * extents following (can happen do to an fallocate call with
- * the flag FALLOC_FL_KEEP_SIZE).
- */
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- /*
- * If it's an inline extent, it can not have other extents
- * following it.
- */
- if (btrfs_file_extent_type(path->nodes[0], ei) ==
- BTRFS_FILE_EXTENT_INLINE)
- goto copy_inline_extent;
-
- ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- if (ext_len > aligned_end)
- return -EOPNOTSUPP;
-
- ret = btrfs_next_item(root, path);
- if (ret < 0) {
- return ret;
- } else if (ret == 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key,
- path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
- key.type == BTRFS_EXTENT_DATA_KEY)
- return -EOPNOTSUPP;
- }
- }
-
-copy_inline_extent:
- /*
- * We have no extent items, or we have an extent at offset 0 which may
- * or may not be inlined. All these cases are dealt the same way.
- */
- if (i_size_read(dst) > datal) {
- /*
- * If the destination inode has an inline extent...
- * This would require copying the data from the source inline
- * extent into the beginning of the destination's inline extent.
- * But this is really complex, both extents can be compressed
- * or just one of them, which would require decompressing and
- * re-compressing data (which could increase the new compressed
- * size, not allowing the compressed data to fit anymore in an
- * inline extent).
- * So just don't support this case for now (it should be rare,
- * we are not really saving space when cloning inline extents).
- */
- return -EOPNOTSUPP;
- }
-
- btrfs_release_path(path);
- ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
- if (ret)
- return ret;
- ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret)
- return ret;
-
- if (skip) {
- const u32 start = btrfs_file_extent_calc_inline_size(0);
-
- memmove(inline_data + start, inline_data + start + skip, datal);
- }
-
- write_extent_buffer(path->nodes[0], inline_data,
- btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]),
- size);
- inode_add_bytes(dst, datal);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
-
- return 0;
-}
-
-/**
- * btrfs_clone() - clone a range from inode file to another
- *
- * @src: Inode to clone from
- * @inode: Inode to clone to
- * @off: Offset within source to start clone from
- * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen
- * @destoff: Offset within @inode to start clone
- * @no_time_update: Whether to update mtime/ctime on the target inode
- */
-static int btrfs_clone(struct inode *src, struct inode *inode,
- const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff, int no_time_update)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path = NULL;
- struct extent_buffer *leaf;
- struct btrfs_trans_handle *trans;
- char *buf = NULL;
- struct btrfs_key key;
- u32 nritems;
- int slot;
- int ret;
- const u64 len = olen_aligned;
- u64 last_dest_end = destoff;
-
- ret = -ENOMEM;
- buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
- if (!buf)
- return ret;
-
- path = btrfs_alloc_path();
- if (!path) {
- kvfree(buf);
- return ret;
- }
-
- path->reada = READA_FORWARD;
- /* clone data */
- key.objectid = btrfs_ino(BTRFS_I(src));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = off;
-
- while (1) {
- u64 next_key_min_offset = key.offset + 1;
- struct btrfs_file_extent_item *extent;
- int type;
- u32 size;
- struct btrfs_key new_key;
- u64 disko = 0, diskl = 0;
- u64 datao = 0, datal = 0;
- u8 comp;
- u64 drop_start;
-
- /*
- * note the key will change type as we walk through the
- * tree.
- */
- path->leave_spinning = 1;
- ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
- 0, 0);
- if (ret < 0)
- goto out;
- /*
- * First search, if no extent item that starts at offset off was
- * found but the previous item is an extent item, it's possible
- * it might overlap our target range, therefore process it.
- */
- if (key.offset == off && ret > 0 && path->slots[0] > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key,
- path->slots[0] - 1);
- if (key.type == BTRFS_EXTENT_DATA_KEY)
- path->slots[0]--;
- }
-
- nritems = btrfs_header_nritems(path->nodes[0]);
-process_slot:
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- nritems = btrfs_header_nritems(path->nodes[0]);
- }
- leaf = path->nodes[0];
- slot = path->slots[0];
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.type > BTRFS_EXTENT_DATA_KEY ||
- key.objectid != btrfs_ino(BTRFS_I(src)))
- break;
-
- ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
-
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
- comp = btrfs_file_extent_compression(leaf, extent);
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- disko = btrfs_file_extent_disk_bytenr(leaf, extent);
- diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
- datao = btrfs_file_extent_offset(leaf, extent);
- datal = btrfs_file_extent_num_bytes(leaf, extent);
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- /* Take upper bound, may be compressed */
- datal = btrfs_file_extent_ram_bytes(leaf, extent);
- }
-
- /*
- * The first search might have left us at an extent item that
- * ends before our target range's start, can happen if we have
- * holes and NO_HOLES feature enabled.
- */
- if (key.offset + datal <= off) {
- path->slots[0]++;
- goto process_slot;
- } else if (key.offset >= off + len) {
- break;
- }
- next_key_min_offset = key.offset + datal;
- size = btrfs_item_size_nr(leaf, slot);
- read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
- size);
-
- btrfs_release_path(path);
- path->leave_spinning = 0;
-
- memcpy(&new_key, &key, sizeof(new_key));
- new_key.objectid = btrfs_ino(BTRFS_I(inode));
- if (off <= key.offset)
- new_key.offset = key.offset + destoff - off;
- else
- new_key.offset = destoff;
-
- /*
- * Deal with a hole that doesn't have an extent item that
- * represents it (NO_HOLES feature enabled).
- * This hole is either in the middle of the cloning range or at
- * the beginning (fully overlaps it or partially overlaps it).
- */
- if (new_key.offset != last_dest_end)
- drop_start = last_dest_end;
- else
- drop_start = new_key.offset;
-
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- struct btrfs_clone_extent_info clone_info;
-
- /*
- * a | --- range to clone ---| b
- * | ------------- extent ------------- |
- */
-
- /* Subtract range b */
- if (key.offset + datal > off + len)
- datal = off + len - key.offset;
-
- /* Subtract range a */
- if (off > key.offset) {
- datao += off - key.offset;
- datal -= off - key.offset;
- }
-
- clone_info.disk_offset = disko;
- clone_info.disk_len = diskl;
- clone_info.data_offset = datao;
- clone_info.data_len = datal;
- clone_info.file_offset = new_key.offset;
- clone_info.extent_buf = buf;
- clone_info.item_size = size;
- ret = btrfs_punch_hole_range(inode, path,
- drop_start,
- new_key.offset + datal - 1,
- &clone_info, &trans);
- if (ret)
- goto out;
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 skip = 0;
- u64 trim = 0;
-
- if (off > key.offset) {
- skip = off - key.offset;
- new_key.offset += skip;
- }
-
- if (key.offset + datal > off + len)
- trim = key.offset + datal - (off + len);
-
- if (comp && (skip || trim)) {
- ret = -EINVAL;
- goto out;
- }
- size -= skip + trim;
- datal -= skip + trim;
-
- /*
- * If our extent is inline, we know we will drop or
- * adjust at most 1 extent item in the destination root.
- *
- * 1 - adjusting old extent (we may have to split it)
- * 1 - add new extent
- * 1 - inode update
- */
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
-
- ret = clone_copy_inline_extent(inode, trans, path,
- &new_key, drop_start,
- datal, skip, size, buf);
- if (ret) {
- if (ret != -EOPNOTSUPP)
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- goto out;
- }
- }
-
- btrfs_release_path(path);
-
- last_dest_end = ALIGN(new_key.offset + datal,
- fs_info->sectorsize);
- ret = clone_finish_inode_update(trans, inode, last_dest_end,
- destoff, olen, no_time_update);
- if (ret)
- goto out;
- if (new_key.offset + datal >= destoff + len)
- break;
-
- btrfs_release_path(path);
- key.offset = next_key_min_offset;
-
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
-
- cond_resched();
- }
- ret = 0;
-
- if (last_dest_end < destoff + len) {
- /*
- * We have an implicit hole that fully or partially overlaps our
- * cloning range at its end. This means that we either have the
- * NO_HOLES feature enabled or the implicit hole happened due to
- * mixing buffered and direct IO writes against this file.
- */
- btrfs_release_path(path);
- path->leave_spinning = 0;
-
- ret = btrfs_punch_hole_range(inode, path,
- last_dest_end, destoff + len - 1,
- NULL, &trans);
- if (ret)
- goto out;
-
- ret = clone_finish_inode_update(trans, inode, destoff + len,
- destoff, olen, no_time_update);
- }
-
-out:
- btrfs_free_path(path);
- kvfree(buf);
- return ret;
-}
-
-static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
- u64 off, u64 olen, u64 destoff)
-{
- struct inode *inode = file_inode(file);
- struct inode *src = file_inode(file_src);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int ret;
- u64 len = olen;
- u64 bs = fs_info->sb->s_blocksize;
-
- /*
- * TODO:
- * - split compressed inline extents. annoying: we need to
- * decompress into destination's address_space (the file offset
- * may change, so source mapping won't do), then recompress (or
- * otherwise reinsert) a subrange.
- *
- * - split destination inode's inline extents. The inline extents can
- * be either compressed or non-compressed.
- */
-
- /*
- * VFS's generic_remap_file_range_prep() protects us from cloning the
- * eof block into the middle of a file, which would result in corruption
- * if the file size is not blocksize aligned. So we don't need to check
- * for that case here.
- */
- if (off + len == src->i_size)
- len = ALIGN(src->i_size, bs) - off;
-
- if (destoff > inode->i_size) {
- const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
-
- ret = btrfs_cont_expand(inode, inode->i_size, destoff);
- if (ret)
- return ret;
- /*
- * We may have truncated the last block if the inode's size is
- * not sector size aligned, so we need to wait for writeback to
- * complete before proceeding further, otherwise we can race
- * with cloning and attempt to increment a reference to an
- * extent that no longer exists (writeback completed right after
- * we found the previous extent covering eof and before we
- * attempted to increment its reference count).
- */
- ret = btrfs_wait_ordered_range(inode, wb_start,
- destoff - wb_start);
- if (ret)
- return ret;
- }
-
- /*
- * Lock destination range to serialize with concurrent readpages() and
- * source range to serialize with relocation.
- */
- btrfs_double_extent_lock(src, off, inode, destoff, len);
- ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- btrfs_double_extent_unlock(src, off, inode, destoff, len);
- /*
- * Truncate page cache pages so that future reads will see the cloned
- * data immediately and not the previous data.
- */
- truncate_inode_pages_range(&inode->i_data,
- round_down(destoff, PAGE_SIZE),
- round_up(destoff + len, PAGE_SIZE) - 1);
-
- return ret;
-}
-
-static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
- bool same_inode = inode_out == inode_in;
- u64 wb_len;
- int ret;
-
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
-
- if (btrfs_root_readonly(root_out))
- return -EROFS;
-
- if (file_in->f_path.mnt != file_out->f_path.mnt ||
- inode_in->i_sb != inode_out->i_sb)
- return -EXDEV;
- }
-
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
- return -EINVAL;
- }
-
- /*
- * Now that the inodes are locked, we need to start writeback ourselves
- * and can not rely on the writeback from the VFS's generic helper
- * generic_remap_file_range_prep() because:
- *
- * 1) For compression we must call filemap_fdatawrite_range() range
- * twice (btrfs_fdatawrite_range() does it for us), and the generic
- * helper only calls it once;
- *
- * 2) filemap_fdatawrite_range(), called by the generic helper only
- * waits for the writeback to complete, i.e. for IO to be done, and
- * not for the ordered extents to complete. We need to wait for them
- * to complete so that new file extent items are in the fs tree.
- */
- if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
- wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
- else
- wb_len = ALIGN(*len, bs);
-
- /*
- * Since we don't lock ranges, wait for ongoing lockless dio writes (as
- * any in progress could create its ordered extents after we wait for
- * existing ordered extents below).
- */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- /*
- * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
- *
- * Btrfs' back references do not have a block level granularity, they
- * work at the whole extent level.
- * NOCOW buffered write without data space reserved may not be able
- * to fall back to CoW due to lack of data space, thus could cause
- * data loss.
- *
- * Here we take a shortcut by flushing the whole inode, so that all
- * nocow write should reach disk as nocow before we increase the
- * reference of the extent. We could do better by only flushing NOCOW
- * data, but that needs extra accounting.
- *
- * Also we don't need to check ASYNC_EXTENT, as async extent will be
- * CoWed anyway, not affecting nocow part.
- */
- ret = filemap_flush(inode_in->i_mapping);
- if (ret < 0)
- return ret;
-
- ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
- wb_len);
- if (ret < 0)
- return ret;
- ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
- wb_len);
- if (ret < 0)
- return ret;
-
- return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
-}
-
-loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
- struct file *dst_file, loff_t destoff, loff_t len,
- unsigned int remap_flags)
-{
- struct inode *src_inode = file_inode(src_file);
- struct inode *dst_inode = file_inode(dst_file);
- bool same_inode = dst_inode == src_inode;
- int ret;
-
- if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
- return -EINVAL;
-
- if (same_inode)
- inode_lock(src_inode);
- else
- lock_two_nondirectories(src_inode, dst_inode);
-
- ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
- &len, remap_flags);
- if (ret < 0 || len == 0)
- goto out_unlock;
-
- if (remap_flags & REMAP_FILE_DEDUP)
- ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
- else
- ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
-
-out_unlock:
- if (same_inode)
- inode_unlock(src_inode);
- else
- unlock_two_nondirectories(src_inode, dst_inode);
-
- return ret < 0 ? ret : len;
-}
-
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
@@ -4052,8 +3423,7 @@
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
struct btrfs_trans_handle *trans;
- struct btrfs_path *path;
- struct btrfs_key location;
+ struct btrfs_path *path = NULL;
struct btrfs_disk_key disk_key;
u64 objectid = 0;
u64 dir_id;
@@ -4074,53 +3444,51 @@
if (!objectid)
objectid = BTRFS_FS_TREE_OBJECTID;
- location.objectid = objectid;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
-
- new_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ new_root = btrfs_get_fs_root(fs_info, objectid, true);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
goto out;
}
if (!is_fstree(new_root->root_key.objectid)) {
ret = -ENOENT;
- goto out;
+ goto out_free;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out;
+ goto out_free;
}
path->leave_spinning = 1;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- btrfs_free_path(path);
ret = PTR_ERR(trans);
- goto out;
+ goto out_free;
}
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
dir_id, "default", 7, 1);
if (IS_ERR_OR_NULL(di)) {
- btrfs_free_path(path);
+ btrfs_release_path(path);
btrfs_end_transaction(trans);
btrfs_err(fs_info,
"Umm, you don't have the default diritem, this isn't going to work");
ret = -ENOENT;
- goto out;
+ goto out_free;
}
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_free_path(path);
+ btrfs_release_path(path);
btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
btrfs_end_transaction(trans);
+out_free:
+ btrfs_put_root(new_root);
+ btrfs_free_path(path);
out:
mnt_drop_write_file(file);
return ret;
@@ -4129,16 +3497,15 @@
static void get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
space->total_bytes = 0;
space->used_bytes = 0;
space->flags = 0;
list_for_each_entry(block_group, groups_list, list) {
space->flags = block_group->flags;
- space->total_bytes += block_group->key.offset;
- space->used_bytes +=
- btrfs_block_group_used(&block_group->item);
+ space->total_bytes += block_group->length;
+ space->used_bytes += block_group->used;
}
}
@@ -4172,15 +3539,12 @@
struct btrfs_space_info *tmp;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -4228,15 +3592,12 @@
break;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -4446,11 +3807,11 @@
ret = -EROFS;
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -4661,7 +4022,7 @@
return ret;
again:
- if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
@@ -4707,7 +4068,6 @@
}
locked:
- BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
@@ -4762,10 +4122,10 @@
do_balance:
/*
- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
- * btrfs_balance. bctl is freed in reset_balance_state, or, if
- * restriper was paused all the way until unmount, in free_fs_info.
- * The flag should be cleared after reset_balance_state.
+ * Ownership of bctl and exclusive operation goes to btrfs_balance.
+ * bctl is freed in reset_balance_state, or, if restriper was paused
+ * all the way until unmount, in free_fs_info. The flag should be
+ * cleared after reset_balance_state.
*/
need_unlock = false;
@@ -4784,7 +4144,7 @@
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
return ret;
@@ -5061,10 +4421,9 @@
return ret;
}
-static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
+static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_quota_rescan_args *qsa;
int ret = 0;
@@ -5087,11 +4446,9 @@
return ret;
}
-static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -5263,10 +4620,9 @@
return ret;
}
-static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
size_t len;
int ret;
char label[BTRFS_LABEL_SIZE];
@@ -5350,10 +4706,9 @@
return 0;
}
-static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
+static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_ioctl_feature_flags features;
@@ -5554,11 +4909,11 @@
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
case FS_IOC_GETFSLABEL:
- return btrfs_ioctl_get_fslabel(file, argp);
+ return btrfs_ioctl_get_fslabel(fs_info, argp);
case FS_IOC_SETFSLABEL:
return btrfs_ioctl_set_fslabel(file, argp);
case FITRIM:
- return btrfs_ioctl_fitrim(file, argp);
+ return btrfs_ioctl_fitrim(fs_info, argp);
case BTRFS_IOC_SNAP_CREATE:
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SNAP_CREATE_V2:
@@ -5568,7 +4923,9 @@
case BTRFS_IOC_SUBVOL_CREATE_V2:
return btrfs_ioctl_snap_create_v2(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
- return btrfs_ioctl_snap_destroy(file, argp);
+ return btrfs_ioctl_snap_destroy(file, argp, false);
+ case BTRFS_IOC_SNAP_DESTROY_V2:
+ return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
return btrfs_ioctl_subvol_getflags(file, argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
@@ -5610,7 +4967,7 @@
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
@@ -5663,15 +5020,15 @@
case BTRFS_IOC_QUOTA_RESCAN:
return btrfs_ioctl_quota_rescan(file, argp);
case BTRFS_IOC_QUOTA_RESCAN_STATUS:
- return btrfs_ioctl_quota_rescan_status(file, argp);
+ return btrfs_ioctl_quota_rescan_status(fs_info, argp);
case BTRFS_IOC_QUOTA_RESCAN_WAIT:
- return btrfs_ioctl_quota_rescan_wait(file, argp);
+ return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(fs_info, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
return btrfs_ioctl_get_supported_features(argp);
case BTRFS_IOC_GET_FEATURES:
- return btrfs_ioctl_get_features(file, argp);
+ return btrfs_ioctl_get_features(fs_info, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
case FS_IOC_FSGETXATTR:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 7f9a578..66e02eb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -13,65 +13,164 @@
#include "extent_io.h"
#include "locking.h"
+/*
+ * Extent buffer locking
+ * =====================
+ *
+ * The locks use a custom scheme that allows to do more operations than are
+ * available fromt current locking primitives. The building blocks are still
+ * rwlock and wait queues.
+ *
+ * Required semantics:
+ *
+ * - reader/writer exclusion
+ * - writer/writer exclusion
+ * - reader/reader sharing
+ * - spinning lock semantics
+ * - blocking lock semantics
+ * - try-lock semantics for readers and writers
+ * - one level nesting, allowing read lock to be taken by the same thread that
+ * already has write lock
+ *
+ * The extent buffer locks (also called tree locks) manage access to eb data
+ * related to the storage in the b-tree (keys, items, but not the individual
+ * members of eb).
+ * We want concurrency of many readers and safe updates. The underlying locking
+ * is done by read-write spinlock and the blocking part is implemented using
+ * counters and wait queues.
+ *
+ * spinning semantics - the low-level rwlock is held so all other threads that
+ * want to take it are spinning on it.
+ *
+ * blocking semantics - the low-level rwlock is not held but the counter
+ * denotes how many times the blocking lock was held;
+ * sleeping is possible
+ *
+ * Write lock always allows only one thread to access the data.
+ *
+ *
+ * Debugging
+ * ---------
+ *
+ * There are additional state counters that are asserted in various contexts,
+ * removed from non-debug build to reduce extent_buffer size and for
+ * performance reasons.
+ *
+ *
+ * Lock recursion
+ * --------------
+ *
+ * A write operation on a tree might indirectly start a look up on the same
+ * tree. This can happen when btrfs_cow_block locks the tree and needs to
+ * lookup free extents.
+ *
+ * btrfs_cow_block
+ * ..
+ * alloc_tree_block_no_bg_flush
+ * btrfs_alloc_tree_block
+ * btrfs_reserve_extent
+ * ..
+ * load_free_space_cache
+ * ..
+ * btrfs_lookup_file_extent
+ * btrfs_search_slot
+ *
+ *
+ * Locking pattern - spinning
+ * --------------------------
+ *
+ * The simple locking scenario, the +--+ denotes the spinning section.
+ *
+ * +- btrfs_tree_lock
+ * | - extent_buffer::rwlock is held
+ * | - no heavy operations should happen, eg. IO, memory allocations, large
+ * | structure traversals
+ * +- btrfs_tree_unock
+*
+*
+ * Locking pattern - blocking
+ * --------------------------
+ *
+ * The blocking write uses the following scheme. The +--+ denotes the spinning
+ * section.
+ *
+ * +- btrfs_tree_lock
+ * |
+ * +- btrfs_set_lock_blocking_write
+ *
+ * - allowed: IO, memory allocations, etc.
+ *
+ * -- btrfs_tree_unlock - note, no explicit unblocking necessary
+ *
+ *
+ * Blocking read is similar.
+ *
+ * +- btrfs_tree_read_lock
+ * |
+ * +- btrfs_set_lock_blocking_read
+ *
+ * - heavy operations allowed
+ *
+ * +- btrfs_tree_read_unlock_blocking
+ * |
+ * +- btrfs_tree_read_unlock
+ *
+ */
+
#ifdef CONFIG_BTRFS_DEBUG
-static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
+static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
{
WARN_ON(eb->spinning_writers);
eb->spinning_writers++;
}
-static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
+static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
{
WARN_ON(eb->spinning_writers != 1);
eb->spinning_writers--;
}
-static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
+static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
{
WARN_ON(eb->spinning_writers);
}
-static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
+static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
{
atomic_inc(&eb->spinning_readers);
}
-static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
+static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
{
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
atomic_dec(&eb->spinning_readers);
}
-static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
{
atomic_inc(&eb->read_locks);
}
-static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
{
atomic_dec(&eb->read_locks);
}
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
{
BUG_ON(!atomic_read(&eb->read_locks));
}
-static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
{
eb->write_locks++;
}
-static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
{
eb->write_locks--;
}
-void btrfs_assert_tree_locked(struct extent_buffer *eb)
-{
- BUG_ON(!eb->write_locks);
-}
-
#else
static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
@@ -81,11 +180,19 @@
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
-void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
#endif
+/*
+ * Mark already held read lock as blocking. Can be nested in write lock by the
+ * same thread.
+ *
+ * Use when there are potentially long operations ahead so other thread waiting
+ * on the lock will not actively spin but sleep instead.
+ *
+ * The rwlock is released and blocking reader counter is increased.
+ */
void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
trace_btrfs_set_lock_blocking_read(eb);
@@ -94,7 +201,7 @@
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
btrfs_assert_tree_read_locked(eb);
atomic_inc(&eb->blocking_readers);
@@ -102,6 +209,14 @@
read_unlock(&eb->lock);
}
+/*
+ * Mark already held write lock as blocking.
+ *
+ * Use when there are potentially long operations ahead so other threads
+ * waiting on the lock will not actively spin but sleep instead.
+ *
+ * The rwlock is released and blocking writers is set.
+ */
void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
{
trace_btrfs_set_lock_blocking_write(eb);
@@ -110,21 +225,27 @@
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
if (eb->blocking_writers == 0) {
btrfs_assert_spinning_writers_put(eb);
btrfs_assert_tree_locked(eb);
- eb->blocking_writers++;
+ WRITE_ONCE(eb->blocking_writers, 1);
write_unlock(&eb->lock);
}
}
/*
- * take a spinning read lock. This will wait for any blocking
- * writers
+ * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
+ * Can be nested in write lock by the same thread.
+ *
+ * Use when the locked section does only lightweight actions and busy waiting
+ * would be cheaper than making other threads do the wait/wake loop.
+ *
+ * The rwlock is held upon exit.
*/
-void btrfs_tree_read_lock(struct extent_buffer *eb)
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse)
{
u64 start_ns = 0;
@@ -134,23 +255,25 @@
read_lock(&eb->lock);
BUG_ON(eb->blocking_writers == 0 &&
current->pid == eb->lock_owner);
- if (eb->blocking_writers && current->pid == eb->lock_owner) {
- /*
- * This extent is already write-locked by our thread. We allow
- * an additional read lock to be added because it's for the same
- * thread. btrfs_find_all_roots() depends on this as it may be
- * called on a partly (write-)locked tree.
- */
- BUG_ON(eb->lock_nested);
- eb->lock_nested = true;
- read_unlock(&eb->lock);
- trace_btrfs_tree_read_lock(eb, start_ns);
- return;
- }
if (eb->blocking_writers) {
+ if (current->pid == eb->lock_owner) {
+ /*
+ * This extent is already write-locked by our thread.
+ * We allow an additional read lock to be added because
+ * it's for the same thread. btrfs_find_all_roots()
+ * depends on this as it may be called on a partly
+ * (write-)locked tree.
+ */
+ WARN_ON(!recurse);
+ BUG_ON(eb->lock_recursed);
+ eb->lock_recursed = true;
+ read_unlock(&eb->lock);
+ trace_btrfs_tree_read_lock(eb, start_ns);
+ return;
+ }
read_unlock(&eb->lock);
wait_event(eb->write_lock_wq,
- eb->blocking_writers == 0);
+ READ_ONCE(eb->blocking_writers) == 0);
goto again;
}
btrfs_assert_tree_read_locks_get(eb);
@@ -158,18 +281,25 @@
trace_btrfs_tree_read_lock(eb, start_ns);
}
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
+}
+
/*
- * take a spinning read lock.
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers
+ * Lock extent buffer for read, optimistically expecting that there are no
+ * contending blocking writers. If there are, don't wait.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
{
- if (eb->blocking_writers)
+ if (READ_ONCE(eb->blocking_writers))
return 0;
read_lock(&eb->lock);
- if (eb->blocking_writers) {
+ /* Refetch value after lock */
+ if (READ_ONCE(eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
}
@@ -180,18 +310,20 @@
}
/*
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers
+ * Try-lock for read. Don't block or wait for contending writers.
+ *
+ * Retrun 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
- if (eb->blocking_writers)
+ if (READ_ONCE(eb->blocking_writers))
return 0;
if (!read_trylock(&eb->lock))
return 0;
- if (eb->blocking_writers) {
+ /* Refetch value after lock */
+ if (READ_ONCE(eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
}
@@ -202,16 +334,19 @@
}
/*
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers or readers
+ * Try-lock for write. May block until the lock is uncontended, but does not
+ * wait until it is free.
+ *
+ * Retrun 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
- if (eb->blocking_writers || atomic_read(&eb->blocking_readers))
+ if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
return 0;
write_lock(&eb->lock);
- if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) {
+ /* Refetch value after lock */
+ if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
return 0;
}
@@ -223,7 +358,10 @@
}
/*
- * drop a spinning read lock
+ * Release read lock. Must be used only if the lock is in spinning mode. If
+ * the read lock is nested, must pair with read lock before the write unlock.
+ *
+ * The rwlock is not held upon exit.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
@@ -231,11 +369,11 @@
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -245,7 +383,11 @@
}
/*
- * drop a blocking read lock
+ * Release read lock, previously set to blocking by a pairing call to
+ * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
+ * thread.
+ *
+ * State of rwlock is unchanged, last reader wakes waiting threads.
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
@@ -253,11 +395,11 @@
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -269,10 +411,13 @@
}
/*
- * take a spinning write lock. This will wait for both
- * blocking readers or writers
+ * Lock for write. Wait for all blocking and spinning readers and writers. This
+ * starts context where reader lock could be nested by the same thread.
+ *
+ * The rwlock is held for write upon exit.
*/
-void btrfs_tree_lock(struct extent_buffer *eb)
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+ __acquires(&eb->lock)
{
u64 start_ns = 0;
@@ -282,9 +427,11 @@
WARN_ON(eb->lock_owner == current->pid);
again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
- wait_event(eb->write_lock_wq, eb->blocking_writers == 0);
+ wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
write_lock(&eb->lock);
- if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) {
+ /* Refetch value after lock */
+ if (atomic_read(&eb->blocking_readers) ||
+ READ_ONCE(eb->blocking_writers)) {
write_unlock(&eb->lock);
goto again;
}
@@ -294,11 +441,25 @@
trace_btrfs_tree_lock(eb, start_ns);
}
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
+}
+
/*
- * drop a spinning or a blocking write lock.
+ * Release the write lock, either blocking or spinning (ie. there's no need
+ * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
+ * This also ends the context for nesting, the read lock must have been
+ * released already.
+ *
+ * Tasks blocked and waiting are woken, rwlock is not held upon exit.
*/
void btrfs_tree_unlock(struct extent_buffer *eb)
{
+ /*
+ * This is read both locked and unlocked but always by the same thread
+ * that already owns the lock so we don't need to use READ_ONCE
+ */
int blockers = eb->blocking_writers;
BUG_ON(blockers > 1);
@@ -310,7 +471,8 @@
if (blockers) {
btrfs_assert_no_spinning_writers(eb);
- eb->blocking_writers--;
+ /* Unlocked write */
+ WRITE_ONCE(eb->blocking_writers, 0);
/*
* We need to order modifying blocking_writers above with
* actually waking up the sleepers to ensure they see the
@@ -322,3 +484,191 @@
write_unlock(&eb->lock);
}
}
+
+/*
+ * Set all locked nodes in the path to blocking locks. This should be done
+ * before scheduling
+ */
+void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (!p->nodes[i] || !p->locks[i])
+ continue;
+ /*
+ * If we currently have a spinning reader or writer lock this
+ * will bump the count of blocking holders and drop the
+ * spinlock.
+ */
+ if (p->locks[i] == BTRFS_READ_LOCK) {
+ btrfs_set_lock_blocking_read(p->nodes[i]);
+ p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+ } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+ btrfs_set_lock_blocking_write(p->nodes[i]);
+ p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+ }
+ }
+}
+
+/*
+ * This releases any locks held in the path starting at level and going all the
+ * way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few corner
+ * cases, such as COW of the block at slot zero in the node. This ignores
+ * those rules, and it should only be called when there are no more updates to
+ * be done higher up in the tree.
+ */
+void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+ int i;
+
+ if (path->keep_locks)
+ return;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ continue;
+ if (!path->locks[i])
+ continue;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ path->locks[i] = 0;
+ }
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with write lock held
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ btrfs_tree_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with read lock held
+ */
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * DREW locks
+ * ==========
+ *
+ * DREW stands for double-reader-writer-exclusion lock. It's used in situation
+ * where you want to provide A-B exclusion but not AA or BB.
+ *
+ * Currently implementation gives more priority to reader. If a reader and a
+ * writer both race to acquire their respective sides of the lock the writer
+ * would yield its lock as soon as it detects a concurrent reader. Additionally
+ * if there are pending readers no new writers would be allowed to come in and
+ * acquire the lock.
+ */
+
+int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
+{
+ int ret;
+
+ ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ atomic_set(&lock->readers, 0);
+ init_waitqueue_head(&lock->pending_readers);
+ init_waitqueue_head(&lock->pending_writers);
+
+ return 0;
+}
+
+void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
+{
+ percpu_counter_destroy(&lock->writers);
+}
+
+/* Return true if acquisition is successful, false otherwise */
+bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
+{
+ if (atomic_read(&lock->readers))
+ return false;
+
+ percpu_counter_inc(&lock->writers);
+
+ /* Ensure writers count is updated before we check for pending readers */
+ smp_mb();
+ if (atomic_read(&lock->readers)) {
+ btrfs_drew_write_unlock(lock);
+ return false;
+ }
+
+ return true;
+}
+
+void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
+{
+ while (true) {
+ if (btrfs_drew_try_write_lock(lock))
+ return;
+ wait_event(lock->pending_writers, !atomic_read(&lock->readers));
+ }
+}
+
+void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
+{
+ percpu_counter_dec(&lock->writers);
+ cond_wake_up(&lock->pending_readers);
+}
+
+void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
+{
+ atomic_inc(&lock->readers);
+
+ /*
+ * Ensure the pending reader count is perceieved BEFORE this reader
+ * goes to sleep in case of active writers. This guarantees new writers
+ * won't be allowed and that the current reader will be woken up when
+ * the last active writer finishes its jobs.
+ */
+ smp_mb__after_atomic();
+
+ wait_event(lock->pending_readers,
+ percpu_counter_sum(&lock->writers) == 0);
+}
+
+void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
+{
+ /*
+ * atomic_dec_and_test implies a full barrier, so woken up writers
+ * are guaranteed to see the decrement
+ */
+ if (atomic_dec_and_test(&lock->readers))
+ wake_up(&lock->pending_writers);
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index b775a42..3ea81ed 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -6,24 +6,118 @@
#ifndef BTRFS_LOCKING_H
#define BTRFS_LOCKING_H
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
+#include "extent_io.h"
+
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
+/*
+ * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
+ * the time of this patch is 8, which is how many we use. Keep this in mind if
+ * you decide you want to add another subclass.
+ */
+enum btrfs_lock_nesting {
+ BTRFS_NESTING_NORMAL,
+
+ /*
+ * When we COW a block we are holding the lock on the original block,
+ * and since our lockdep maps are rootid+level, this confuses lockdep
+ * when we lock the newly allocated COW'd block. Handle this by having
+ * a subclass for COW'ed blocks so that lockdep doesn't complain.
+ */
+ BTRFS_NESTING_COW,
+
+ /*
+ * Oftentimes we need to lock adjacent nodes on the same level while
+ * still holding the lock on the original node we searched to, such as
+ * for searching forward or for split/balance.
+ *
+ * Because of this we need to indicate to lockdep that this is
+ * acceptable by having a different subclass for each of these
+ * operations.
+ */
+ BTRFS_NESTING_LEFT,
+ BTRFS_NESTING_RIGHT,
+
+ /*
+ * When splitting we will be holding a lock on the left/right node when
+ * we need to cow that node, thus we need a new set of subclasses for
+ * these two operations.
+ */
+ BTRFS_NESTING_LEFT_COW,
+ BTRFS_NESTING_RIGHT_COW,
+
+ /*
+ * When splitting we may push nodes to the left or right, but still use
+ * the subsequent nodes in our path, keeping our locks on those adjacent
+ * blocks. Thus when we go to allocate a new split block we've already
+ * used up all of our available subclasses, so this subclass exists to
+ * handle this case where we need to allocate a new split block.
+ */
+ BTRFS_NESTING_SPLIT,
+
+ /*
+ * When promoting a new block to a root we need to have a special
+ * subclass so we don't confuse lockdep, as it will appear that we are
+ * locking a higher level node before a lower level one. Copying also
+ * has this problem as it appears we're locking the same block again
+ * when we make a snapshot of an existing root.
+ */
+ BTRFS_NESTING_NEW_ROOT,
+
+ /*
+ * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * add this in here and add a static_assert to keep us from going over
+ * the limit. As of this writing we're limited to 8, and we're
+ * definitely using 8, hence this check to keep us from messing up in
+ * the future.
+ */
+ BTRFS_NESTING_MAX,
+};
+
+static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
+ "too many lock subclasses defined");
+
+struct btrfs_path;
+
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
-void btrfs_assert_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse);
+static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ return __btrfs_read_lock_root_node(root, false);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
+ BUG_ON(!eb->write_locks);
+}
+#else
+static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+#endif
+
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
{
@@ -37,4 +131,19 @@
BUG();
}
+struct btrfs_drew_lock {
+ atomic_t readers;
+ struct percpu_counter writers;
+ wait_queue_head_t pending_writers;
+ wait_queue_head_t pending_readers;
+};
+
+int btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
+void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock);
+void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
+bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
+void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
+void btrfs_drew_read_lock(struct btrfs_drew_lock *lock);
+void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);
+
#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index acad417..aa9cd11 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -63,27 +63,7 @@
static struct workspace_manager wsm;
-static void lzo_init_workspace_manager(void)
-{
- btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
-}
-
-static void lzo_cleanup_workspace_manager(void)
-{
- btrfs_cleanup_workspace_manager(&wsm);
-}
-
-static struct list_head *lzo_get_workspace(unsigned int level)
-{
- return btrfs_get_workspace(&wsm, level);
-}
-
-static void lzo_put_workspace(struct list_head *ws)
-{
- btrfs_put_workspace(&wsm, ws);
-}
-
-static void lzo_free_workspace(struct list_head *ws)
+void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -93,7 +73,7 @@
kfree(workspace);
}
-static struct list_head *lzo_alloc_workspace(unsigned int level)
+struct list_head *lzo_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -131,13 +111,9 @@
return le32_to_cpu(dlen);
}
-static int lzo_compress_pages(struct list_head *ws,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
@@ -303,7 +279,7 @@
return ret;
}
-static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
@@ -444,10 +420,9 @@
return ret;
}
-static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
@@ -508,15 +483,7 @@
}
const struct btrfs_compress_op btrfs_lzo_compress = {
- .init_workspace_manager = lzo_init_workspace_manager,
- .cleanup_workspace_manager = lzo_cleanup_workspace_manager,
- .get_workspace = lzo_get_workspace,
- .put_workspace = lzo_put_workspace,
- .alloc_workspace = lzo_alloc_workspace,
- .free_workspace = lzo_free_workspace,
- .compress_pages = lzo_compress_pages,
- .decompress_bio = lzo_decompress_bio,
- .decompress = lzo_decompress,
+ .workspace_manager = &wsm,
.max_level = 1,
.default_level = 1,
};
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 7d56492..6461ebc 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -6,6 +6,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <asm/div64.h>
+#include <linux/rbtree.h>
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
@@ -47,4 +48,68 @@
return div_u64(num, 100);
}
+/* Copy of is_power_of_two that is 64bit safe */
+static inline bool is_power_of_two_u64(u64 n)
+{
+ return n != 0 && (n & (n - 1)) == 0;
+}
+
+static inline bool has_single_bit_set(u64 n)
+{
+ return is_power_of_two_u64(n);
+}
+
+/*
+ * Simple bytenr based rb_tree relate structures
+ *
+ * Any structure wants to use bytenr as single search index should have their
+ * structure start with these members.
+ */
+struct rb_simple_node {
+ struct rb_node rb_node;
+ u64 bytenr;
+};
+
+static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *node = root->rb_node;
+ struct rb_simple_node *entry;
+
+ while (node) {
+ entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr)
+ node = node->rb_left;
+ else if (bytenr > entry->bytenr)
+ node = node->rb_right;
+ else
+ return node;
+ }
+ return NULL;
+}
+
+static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rb_simple_node *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 00e1ef4..87bac9e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -15,14 +15,15 @@
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
+#include "qgroup.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
- if (entry->file_offset + entry->len < entry->file_offset)
+ if (entry->file_offset + entry->num_bytes < entry->file_offset)
return (u64)-1;
- return entry->file_offset + entry->len;
+ return entry->file_offset + entry->num_bytes;
}
/* returns NULL if the insertion worked, or it returns the node it did find
@@ -52,14 +53,6 @@
return NULL;
}
-static void ordered_data_tree_panic(struct inode *inode, int errno,
- u64 offset)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- btrfs_panic(fs_info, errno,
- "Inconsistency in ordered tree at offset %llu", offset);
-}
-
/*
* look for a given offset in the tree, and if it can't be found return the
* first lesser offset
@@ -120,7 +113,7 @@
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
if (file_offset < entry->file_offset ||
- entry->file_offset + entry->len <= file_offset)
+ entry->file_offset + entry->num_bytes <= file_offset)
return 0;
return 1;
}
@@ -129,7 +122,7 @@
u64 len)
{
if (file_offset + len <= entry->file_offset ||
- entry->file_offset + entry->len <= file_offset)
+ entry->file_offset + entry->num_bytes <= file_offset)
return 0;
return 1;
}
@@ -160,45 +153,57 @@
return ret;
}
-/* allocate and add a new ordered_extent into the per-inode tree.
- * file_offset is the logical offset in the file
- *
- * start is the disk block number of an extent already reserved in the
- * extent allocation tree
- *
- * len is the length of the extent
+/*
+ * Allocate and add a new ordered_extent into the per-inode tree.
*
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
-static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len,
- int type, int dio, int compress_type)
+static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type, int dio,
+ int compress_type)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
+ int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
+ if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
+ /* For nocow write, we can release the qgroup rsv right now */
+ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+ } else {
+ /*
+ * The ordered extent has reserved qgroup space, release now
+ * and pass the reserved number for qgroup_record to free.
+ */
+ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
+ if (ret < 0)
+ return ret;
+ }
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
return -ENOMEM;
entry->file_offset = file_offset;
- entry->start = start;
- entry->len = len;
- entry->disk_len = disk_len;
- entry->bytes_left = len;
- entry->inode = igrab(inode);
+ entry->disk_bytenr = disk_bytenr;
+ entry->num_bytes = num_bytes;
+ entry->disk_num_bytes = disk_num_bytes;
+ entry->bytes_left = num_bytes;
+ entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
+ entry->qgroup_rsv = ret;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
if (dio) {
- percpu_counter_add_batch(&fs_info->dio_bytes, len,
+ percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
fs_info->delalloc_batch);
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
}
@@ -207,11 +212,10 @@
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
- INIT_LIST_HEAD(&entry->log_list);
- INIT_LIST_HEAD(&entry->trans_list);
trace_btrfs_ordered_extent_add(inode, entry);
@@ -219,7 +223,9 @@
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
if (node)
- ordered_data_tree_panic(inode, -EEXIST, file_offset);
+ btrfs_panic(fs_info, -EEXIST,
+ "inconsistency in ordered tree at offset %llu",
+ file_offset);
spin_unlock_irq(&tree->lock);
spin_lock(&root->ordered_extent_lock);
@@ -239,35 +245,38 @@
* that work has been done at higher layers, so this is truly the
* smallest the extent is going to get.
*/
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, 1);
+ spin_unlock(&inode->lock);
return 0;
}
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type)
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
+ int type)
{
- return __btrfs_add_ordered_extent(inode, file_offset, start, len,
- disk_len, type, 0,
+ return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
+ num_bytes, disk_num_bytes, type, 0,
BTRFS_COMPRESS_NONE);
}
-int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type)
+int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type)
{
- return __btrfs_add_ordered_extent(inode, file_offset, start, len,
- disk_len, type, 1,
+ return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
+ num_bytes, disk_num_bytes, type, 1,
BTRFS_COMPRESS_NONE);
}
-int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len,
- int type, int compress_type)
+int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type,
+ int compress_type)
{
- return __btrfs_add_ordered_extent(inode, file_offset, start, len,
- disk_len, type, 0,
+ return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
+ num_bytes, disk_num_bytes, type, 0,
compress_type);
}
@@ -299,12 +308,12 @@
* file_offset is updated to one byte past the range that is recorded as
* complete. This allows you to walk forward in the file.
*/
-int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 *file_offset, u64 io_size, int uptodate)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
int ret;
@@ -313,7 +322,6 @@
u64 dec_start;
u64 to_dec;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
if (!node) {
@@ -328,8 +336,8 @@
}
dec_start = max(*file_offset, entry->file_offset);
- dec_end = min(*file_offset + io_size, entry->file_offset +
- entry->len);
+ dec_end = min(*file_offset + io_size,
+ entry->file_offset + entry->num_bytes);
*file_offset = dec_end;
if (dec_start > dec_end) {
btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
@@ -370,17 +378,16 @@
* test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
* to make sure this function only returns 1 once for a given ordered extent.
*/
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
entry = *cached;
@@ -401,7 +408,7 @@
}
if (io_size > entry->bytes_left) {
- btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
}
@@ -434,12 +441,11 @@
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(entry->inode, entry);
+ trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
if (refcount_dec_and_test(&entry->refs)) {
- ASSERT(list_empty(&entry->log_list));
- ASSERT(list_empty(&entry->trans_list));
ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
@@ -457,24 +463,25 @@
* remove an ordered extent from the tree. No references are dropped
* and waiters are woken up.
*/
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
+ bool pending;
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
if (root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
+ btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
+ false);
if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len,
+ percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
tree = &btrfs_inode->ordered_tree;
@@ -485,13 +492,41 @@
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (pending) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality, it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check if trans is set, but ASSERT() so
+ * if it isn't set a developer will notice.
+ */
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ refcount_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
- trace_btrfs_ordered_extent_remove(inode, entry);
+ trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
if (!root->nr_ordered_extents) {
spin_lock(&fs_info->ordered_root_lock);
@@ -508,7 +543,7 @@
struct btrfs_ordered_extent *ordered;
ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
- btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
complete(&ordered->completion);
}
@@ -534,8 +569,8 @@
ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
root_extent_list);
- if (range_end <= ordered->start ||
- ordered->start + ordered->disk_len <= range_start) {
+ if (range_end <= ordered->disk_bytenr ||
+ ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
list_move_tail(&ordered->root_extent_list, &skipped);
cond_resched_lock(&root->ordered_extent_lock);
continue;
@@ -572,12 +607,11 @@
return count;
}
-u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len)
{
struct btrfs_root *root;
struct list_head splice;
- u64 total_done = 0;
u64 done;
INIT_LIST_HEAD(&splice);
@@ -588,7 +622,7 @@
while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
ordered_root);
- root = btrfs_grab_fs_root(root);
+ root = btrfs_grab_root(root);
BUG_ON(!root);
list_move_tail(&root->ordered_root,
&fs_info->ordered_roots);
@@ -596,8 +630,7 @@
done = btrfs_wait_ordered_extents(root, nr,
range_start, range_len);
- btrfs_put_fs_root(root);
- total_done += done;
+ btrfs_put_root(root);
spin_lock(&fs_info->ordered_root_lock);
if (nr != U64_MAX) {
@@ -607,8 +640,6 @@
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
mutex_unlock(&fs_info->ordered_operations_mutex);
-
- return total_done;
}
/*
@@ -618,12 +649,11 @@
* in the extent, and it waits on the io completion code to insert
* metadata into the btree corresponding to the extent
*/
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- int wait)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
u64 start = entry->file_offset;
- u64 end = start + entry->len - 1;
+ u64 end = start + entry->num_bytes - 1;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
trace_btrfs_ordered_extent_start(inode, entry);
@@ -633,7 +663,7 @@
* for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -677,18 +707,18 @@
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
btrfs_put_ordered_extent(ordered);
break;
}
- if (ordered->file_offset + ordered->len <= start) {
+ if (ordered->file_offset + ordered->num_bytes <= start) {
btrfs_put_ordered_extent(ordered);
break;
}
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
end = ordered->file_offset;
/*
* If the ordered extent had an error save the error but don't
@@ -709,14 +739,14 @@
* find an ordered extent corresponding to file_offset. return NULL if
* nothing is found, otherwise take a reference on the extent and return it
*/
-struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -773,17 +803,45 @@
}
/*
+ * Adds all ordered extents to the given list. The list ends up sorted by the
+ * file_offset of the ordered extents.
+ */
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *n;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+
+ ASSERT(list_empty(&ordered->log_list));
+ list_add_tail(&ordered->log_list, list);
+ refcount_inc(&ordered->refs);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
*/
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -797,148 +855,21 @@
}
/*
- * After an extent is done, call this to conditionally update the on disk
- * i_size. i_size is updated to cover any fully written part of the file.
- */
-int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
- struct btrfs_ordered_extent *ordered)
-{
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- u64 disk_i_size;
- u64 new_i_size;
- u64 i_size = i_size_read(inode);
- struct rb_node *node;
- struct rb_node *prev = NULL;
- struct btrfs_ordered_extent *test;
- int ret = 1;
- u64 orig_offset = offset;
-
- spin_lock_irq(&tree->lock);
- if (ordered) {
- offset = entry_end(ordered);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
- offset = min(offset,
- ordered->file_offset +
- ordered->truncated_len);
- } else {
- offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
- }
- disk_i_size = BTRFS_I(inode)->disk_i_size;
-
- /*
- * truncate file.
- * If ordered is not NULL, then this is called from endio and
- * disk_i_size will be updated by either truncate itself or any
- * in-flight IOs which are inside the disk_i_size.
- *
- * Because btrfs_setsize() may set i_size with disk_i_size if truncate
- * fails somehow, we need to make sure we have a precise disk_i_size by
- * updating it as usual.
- *
- */
- if (!ordered && disk_i_size > i_size) {
- BTRFS_I(inode)->disk_i_size = orig_offset;
- ret = 0;
- goto out;
- }
-
- /*
- * if the disk i_size is already at the inode->i_size, or
- * this ordered extent is inside the disk i_size, we're done
- */
- if (disk_i_size == i_size)
- goto out;
-
- /*
- * We still need to update disk_i_size if outstanding_isize is greater
- * than disk_i_size.
- */
- if (offset <= disk_i_size &&
- (!ordered || ordered->outstanding_isize <= disk_i_size))
- goto out;
-
- /*
- * walk backward from this ordered extent to disk_i_size.
- * if we find an ordered extent then we can't update disk i_size
- * yet
- */
- if (ordered) {
- node = rb_prev(&ordered->rb_node);
- } else {
- prev = tree_search(tree, offset);
- /*
- * we insert file extents without involving ordered struct,
- * so there should be no ordered struct cover this offset
- */
- if (prev) {
- test = rb_entry(prev, struct btrfs_ordered_extent,
- rb_node);
- BUG_ON(offset_in_entry(test, offset));
- }
- node = prev;
- }
- for (; node; node = rb_prev(node)) {
- test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-
- /* We treat this entry as if it doesn't exist */
- if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
- continue;
-
- if (entry_end(test) <= disk_i_size)
- break;
- if (test->file_offset >= i_size)
- break;
-
- /*
- * We don't update disk_i_size now, so record this undealt
- * i_size. Or we will not know the real i_size.
- */
- if (test->outstanding_isize < offset)
- test->outstanding_isize = offset;
- if (ordered &&
- ordered->outstanding_isize > test->outstanding_isize)
- test->outstanding_isize = ordered->outstanding_isize;
- goto out;
- }
- new_i_size = min_t(u64, offset, i_size);
-
- /*
- * Some ordered extents may completed before the current one, and
- * we hold the real i_size in ->outstanding_isize.
- */
- if (ordered && ordered->outstanding_isize > new_i_size)
- new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
- BTRFS_I(inode)->disk_i_size = new_i_size;
- ret = 0;
-out:
- /*
- * We need to do this because we can't remove ordered extents until
- * after the i_disk_size has been updated and then the inode has been
- * updated to reflect the change, so we need to tell anybody who finds
- * this ordered extent that we've already done all the real work, we
- * just haven't completed all the other work.
- */
- if (ordered)
- set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
- spin_unlock_irq(&tree->lock);
- return ret;
-}
-
-/*
* search the ordered extents for one corresponding to 'offset' and
* try to find a checksum. This is used because we allow pages to
* be reclaimed before their checksum is actually put into the btree
*/
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len)
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *ordered_sum;
struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = btrfs_inode_sectorsize(inode);
+ const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits;
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
@@ -950,10 +881,8 @@
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr &&
disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
- i = (disk_bytenr - ordered_sum->bytenr) >>
- inode->i_sb->s_blocksize_bits;
- num_sectors = ordered_sum->len >>
- inode->i_sb->s_blocksize_bits;
+ i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits;
+ num_sectors = ordered_sum->len >> blocksize_bits;
num_sectors = min_t(int, len - index, num_sectors - i);
memcpy(sum + index, ordered_sum->sums + i * csum_size,
num_sectors * csum_size);
@@ -974,7 +903,6 @@
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
*
- * @tree: IO tree used for locking out other users of the range
* @inode: Inode whose ordered tree is to be searched
* @start: Beginning of range to flush
* @end: Last byte of range to lock
@@ -984,8 +912,7 @@
* This function always returns with the given range locked, ensuring after it's
* called no order extent can be pending.
*/
-void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
- struct btrfs_inode *inode, u64 start,
+void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state)
{
@@ -997,7 +924,7 @@
cachedp = cached_state;
while (1) {
- lock_extent_bits(tree, start, end, cachedp);
+ lock_extent_bits(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -1010,8 +937,8 @@
refcount_dec(&cache->refs);
break;
}
- unlock_extent_cached(tree, start, end, cachedp);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ unlock_extent_cached(&inode->io_tree, start, end, cachedp);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 5204171..c3a2325 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -52,29 +52,29 @@
BTRFS_ORDERED_DIRECT,
/* We had an io error when writing this out */
BTRFS_ORDERED_IOERR,
- /*
- * indicates whether this ordered extent has done its due diligence in
- * updating the isize
- */
- BTRFS_ORDERED_UPDATED_ISIZE,
/* Set when we have to truncate an extent */
BTRFS_ORDERED_TRUNCATED,
/* Regular IO for COW */
BTRFS_ORDERED_REGULAR,
+ /* Used during fsync to track already logged extents */
+ BTRFS_ORDERED_LOGGED,
+ /* We have already logged all the csums of the ordered extent */
+ BTRFS_ORDERED_LOGGED_CSUM,
+ /* We wait for this extent to complete in the current transaction */
+ BTRFS_ORDERED_PENDING,
};
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
- /* disk byte number */
- u64 start;
-
- /* ram length of the extent in bytes */
- u64 len;
-
- /* extent length on disk */
- u64 disk_len;
+ /*
+ * These fields directly correspond to the same fields in
+ * btrfs_file_extent_item.
+ */
+ u64 disk_bytenr;
+ u64 num_bytes;
+ u64 disk_num_bytes;
/* number of bytes that still need writing */
u64 bytes_left;
@@ -98,6 +98,9 @@
/* compression algorithm */
int compress_type;
+ /* Qgroup reserved space */
+ int qgroup_rsv;
+
/* reference count */
refcount_t refs;
@@ -107,12 +110,9 @@
/* list of checksums for insertion when the extent io is done */
struct list_head list;
- /* If we need to wait on this to be done */
+ /* used for fast fsyncs */
struct list_head log_list;
- /* If the transaction needs to wait on this ordered extent */
- struct list_head trans_list;
-
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -151,45 +151,46 @@
}
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
-int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 *file_offset, u64 io_size,
int uptodate);
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type);
-int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type);
-int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len,
- int type, int compress_type);
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
+ int type);
+int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type);
+int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type,
+ int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
-struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry, int wait);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
u64 len);
-int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
- struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len);
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list);
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
-u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len);
-void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
- struct btrfs_inode *inode, u64 start,
+void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f4edadf..c62771f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -7,6 +7,44 @@
#include "disk-io.h"
#include "print-tree.h"
+struct root_name_map {
+ u64 id;
+ char name[16];
+};
+
+static const struct root_name_map root_map[] = {
+ { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" },
+ { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" },
+ { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" },
+ { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" },
+ { BTRFS_FS_TREE_OBJECTID, "FS_TREE" },
+ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" },
+ { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" },
+ { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
+ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
+ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+};
+
+const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
+{
+ int i;
+
+ if (key->objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN,
+ "TREE_RELOC offset=%llu", key->offset);
+ return buf;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(root_map); i++) {
+ if (root_map[i].id == key->objectid)
+ return root_map[i].name;
+ }
+
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", key->objectid);
+ return buf;
+}
+
static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
{
int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
@@ -268,9 +306,9 @@
struct btrfs_block_group_item);
pr_info(
"\t\tblock group used %llu chunk_objectid %llu flags %llu\n",
- btrfs_disk_block_group_used(l, bi),
- btrfs_disk_block_group_chunk_objectid(l, bi),
- btrfs_disk_block_group_flags(l, bi));
+ btrfs_block_group_used(l, bi),
+ btrfs_block_group_chunk_objectid(l, bi),
+ btrfs_block_group_flags(l, bi));
break;
case BTRFS_CHUNK_ITEM_KEY:
print_chunk(l, btrfs_item_ptr(l, i,
@@ -319,7 +357,7 @@
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
btrfs_item_size_nr(l, i));
break;
- };
+ }
}
}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index e6bb38f..8c3e931 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,7 +6,11 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+/* Buffer size to contain tree name and possibly additional data (offset) */
+#define BTRFS_ROOT_NAME_BUF_LEN 48
+
void btrfs_print_leaf(struct extent_buffer *l);
void btrfs_print_tree(struct extent_buffer *c, bool follow);
+const char *btrfs_root_name(const struct btrfs_key *key, char *buf);
#endif
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 1e664e0..2dcb1cb 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -383,7 +383,7 @@
if (need_reserve) {
btrfs_block_rsv_release(fs_info, trans->block_rsv,
- num_bytes);
+ num_bytes, NULL);
if (ret)
return ret;
}
@@ -408,19 +408,14 @@
struct btrfs_root *parent_root)
{
struct super_block *sb = root->fs_info->sb;
- struct btrfs_key key;
struct inode *parent_inode, *child_inode;
int ret;
- key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
-
- parent_inode = btrfs_iget(sb, &key, parent_root, NULL);
+ parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root);
if (IS_ERR(parent_inode))
return PTR_ERR(parent_inode);
- child_inode = btrfs_iget(sb, &key, root, NULL);
+ child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root);
if (IS_ERR(child_inode)) {
iput(parent_inode);
return PTR_ERR(child_inode);
@@ -437,8 +432,6 @@
{
int i;
- hash_init(prop_handlers_ht);
-
for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
struct prop_handler *p = &prop_handlers[i];
u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bb034e1..a02e38f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "transaction.h"
@@ -21,6 +22,7 @@
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
+#include "sysfs.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -219,7 +221,8 @@
return qgroup;
}
-static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
+static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
{
struct btrfs_qgroup_list *list;
@@ -239,7 +242,6 @@
list_del(&list->next_member);
kfree(list);
}
- kfree(qgroup);
}
/* must be called with qgroup_lock held */
@@ -251,7 +253,7 @@
return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
- __del_qgroup_rb(qgroup);
+ __del_qgroup_rb(fs_info, qgroup);
return 0;
}
@@ -350,6 +352,9 @@
goto out;
}
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
/* default this to quota off, in case no status key is found */
fs_info->qgroup_flags = 0;
@@ -411,6 +416,10 @@
goto out;
}
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+
switch (found_key.type) {
case BTRFS_QGROUP_INFO_KEY: {
struct btrfs_qgroup_info_item *ptr;
@@ -499,12 +508,51 @@
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ btrfs_sysfs_del_qgroups(fs_info);
}
return ret < 0 ? ret : 0;
}
/*
+ * Called in close_ctree() when quota is still enabled. This verifies we don't
+ * leak some reserved space.
+ *
+ * Return false if no reserved space is left.
+ * Return true if some reserved space is leaked.
+ */
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node;
+ bool ret = false;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return ret;
+ /*
+ * Since we're unmounting, there is no race and no need to grab qgroup
+ * lock. And here we don't go post-order to provide a more user
+ * friendly sorted result.
+ */
+ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
+ struct btrfs_qgroup *qgroup;
+ int i;
+
+ qgroup = rb_entry(node, struct btrfs_qgroup, node);
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
+ if (qgroup->rsv.values[i]) {
+ ret = true;
+ btrfs_warn(fs_info,
+ "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ i, qgroup->rsv.values[i]);
+ }
+ }
+ }
+ return ret;
+}
+
+/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
* first two are in single-threaded paths.And for the third one, we have set
* quota_root to be null with qgroup_lock held before, so it is safe to clean
@@ -518,7 +566,9 @@
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
- __del_qgroup_rb(qgroup);
+ __del_qgroup_rb(fs_info, qgroup);
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
}
/*
* We call btrfs_free_qgroup_config() when unmounting
@@ -527,6 +577,7 @@
*/
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
+ btrfs_sysfs_del_qgroups(fs_info);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
@@ -890,6 +941,14 @@
int ret = 0;
int slot;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to enable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+ * and before setting BTRFS_FS_QUOTA_ENABLED.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
@@ -900,6 +959,10 @@
goto out;
}
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
+
/*
* Unlock qgroup_ioctl_lock before starting the transaction. This is to
* avoid lock acquisition inversion problems (reported by lockdep) between
@@ -997,6 +1060,10 @@
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_ROOT_REF_KEY) {
+
+ /* Release locks on tree_root before we access quota_root */
+ btrfs_release_path(path);
+
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
if (ret) {
@@ -1010,6 +1077,25 @@
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ ret = btrfs_search_slot_for_read(tree_root, &found_key,
+ path, 1, 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ if (ret > 0) {
+ /*
+ * Shouldn't happen, but in case it does we
+ * don't need to do the btrfs_next_item, just
+ * continue.
+ */
+ continue;
+ }
}
ret = btrfs_next_item(tree_root, path);
if (ret < 0) {
@@ -1034,9 +1120,25 @@
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ /*
+ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+ * a deadlock with tasks concurrently doing other qgroup operations, such
+ * adding/removing qgroups or adding/deleting qgroup relations for example,
+ * because all qgroup operations first start or join a transaction and then
+ * lock the qgroup_ioctl_lock mutex.
+ * We are safe from a concurrent task trying to enable quotas, by calling
+ * this function, since we are serialized by fs_info->subvol_sem.
+ */
ret = btrfs_commit_transaction(trans);
trans = NULL;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (ret)
goto out_free_path;
@@ -1061,15 +1163,13 @@
out_free_path:
btrfs_free_path(path);
out_free_root:
- if (ret) {
- free_extent_buffer(quota_root->node);
- free_extent_buffer(quota_root->commit_root);
- kfree(quota_root);
- }
+ if (ret)
+ btrfs_put_root(quota_root);
out:
if (ret) {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
+ btrfs_sysfs_del_qgroups(fs_info);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
@@ -1086,12 +1186,34 @@
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to disable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
+
+ /*
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+ * to lock that mutex while holding a transaction handle and the rescan
+ * worker needs to commit a transaction.
+ */
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
+ * Request qgroup rescan worker to complete and wait for it. This wait
+ * must be done before transaction start for quota disable since it may
+ * deadlock with transaction by the qgroup rescan worker.
+ */
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ /*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
@@ -1106,14 +1228,13 @@
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
goto out;
}
if (!fs_info->quota_root)
goto out;
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- btrfs_qgroup_wait_for_completion(fs_info, false);
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
@@ -1141,9 +1262,7 @@
btrfs_tree_unlock(quota_root->node);
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
- free_extent_buffer(quota_root->node);
- free_extent_buffer(quota_root->commit_root);
- kfree(quota_root);
+ btrfs_put_root(quota_root);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -1283,25 +1402,27 @@
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
struct ulist *tmp;
+ unsigned int nofs_flag;
int ret = 0;
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
member = find_qgroup_rb(fs_info, src);
@@ -1347,22 +1468,24 @@
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
struct ulist *tmp;
bool found = false;
+ unsigned int nofs_flag;
int ret = 0;
int ret2;
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -1427,11 +1550,11 @@
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
+ quota_root = fs_info->quota_root;
qgroup = find_qgroup_rb(fs_info, qgroupid);
if (qgroup) {
ret = -EEXIST;
@@ -1446,8 +1569,11 @@
qgroup = add_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
- if (IS_ERR(qgroup))
+ if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
+ goto out;
+ }
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -1456,15 +1582,13 @@
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup_list *list;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -1496,6 +1620,14 @@
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * Remove the qgroup from sysfs now without holding the qgroup_lock
+ * spinlock, since the sysfs_remove_group() function needs to take
+ * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
+ */
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -1505,7 +1637,6 @@
struct btrfs_qgroup_limit *limit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
/* Sometimes we would want to clear the limit on this qgroup.
@@ -1515,9 +1646,8 @@
const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -1851,7 +1981,7 @@
btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
/* For src_path */
- extent_buffer_get(src_eb);
+ atomic_inc(&src_eb->refs);
src_path->nodes[root_level] = src_eb;
src_path->slots[root_level] = dst_path->slots[root_level];
src_path->locks[root_level] = 0;
@@ -2107,7 +2237,7 @@
goto out;
}
/* For dst_path */
- extent_buffer_get(dst_eb);
+ atomic_inc(&dst_eb->refs);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
@@ -2166,7 +2296,7 @@
* walk back up the tree (adjusting slot pointers as we go)
* and restart the search process.
*/
- extent_buffer_get(root_eb); /* For path */
+ atomic_inc(&root_eb->refs); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
@@ -2622,10 +2752,9 @@
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root = fs_info->quota_root;
int ret = 0;
- if (!quota_root)
+ if (!fs_info->quota_root)
return ret;
spin_lock(&fs_info->qgroup_lock);
@@ -2871,6 +3000,8 @@
unlock:
spin_unlock(&fs_info->qgroup_lock);
+ if (!ret)
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -2895,7 +3026,6 @@
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enum btrfs_qgroup_rsv_type type)
{
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
@@ -2914,8 +3044,7 @@
enforce = false;
spin_lock(&fs_info->qgroup_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root)
+ if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -2982,7 +3111,6 @@
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -3000,8 +3128,7 @@
}
spin_lock(&fs_info->qgroup_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root)
+ if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -3273,7 +3400,6 @@
}
mutex_lock(&fs_info->qgroup_rescan_lock);
- spin_lock(&fs_info->qgroup_lock);
if (init_flags) {
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
@@ -3285,10 +3411,12 @@
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
+ } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* Quota disable is in progress */
+ ret = -EBUSY;
}
if (ret) {
- spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
return ret;
}
@@ -3299,12 +3427,8 @@
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
-
- spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- memset(&fs_info->qgroup_rescan_work, 0,
- sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
btrfs_qgroup_rescan_worker, NULL, NULL);
return 0;
@@ -3379,9 +3503,7 @@
int ret = 0;
mutex_lock(&fs_info->qgroup_rescan_lock);
- spin_lock(&fs_info->qgroup_lock);
running = fs_info->qgroup_rescan_running;
- spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
if (!running)
@@ -3421,24 +3543,20 @@
{
struct rb_node *node;
struct rb_node *next;
- struct ulist_node *entry = NULL;
+ struct ulist_node *entry;
int ret = 0;
node = reserved->range_changed.root.rb_node;
+ if (!node)
+ return 0;
while (node) {
entry = rb_entry(node, struct ulist_node, rb_node);
if (entry->val < start)
node = node->rb_right;
- else if (entry)
- node = node->rb_left;
else
- break;
+ node = node->rb_left;
}
- /* Empty changeset */
- if (!entry)
- return 0;
-
if (entry->val > start && rb_prev(&entry->rb_node))
entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
rb_node);
@@ -3505,16 +3623,6 @@
bool can_commit = true;
/*
- * We don't want to run flush again and again, so if there is a running
- * one, we won't try to start a new flush, but exit directly.
- */
- if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
- wait_event(root->qgroup_flush_wait,
- !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
- return 0;
- }
-
- /*
* If current process holds a transaction, we shouldn't flush, as we
* assume all space reservation happens before a transaction handle is
* held.
@@ -3528,6 +3636,26 @@
current->journal_info != BTRFS_SEND_TRANS_STUB)
can_commit = false;
+ /*
+ * We don't want to run flush again and again, so if there is a running
+ * one, we won't try to start a new flush, but exit directly.
+ */
+ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+ /*
+ * We are already holding a transaction, thus we can block other
+ * threads from flushing. So exit right now. This increases
+ * the chance of EDQUOT for heavy load and near limit cases.
+ * But we can argue that if we're already near limit, EDQUOT is
+ * unavoidable anyway.
+ */
+ if (!can_commit)
+ return 0;
+
+ wait_event(root->qgroup_flush_wait,
+ !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
+ return 0;
+ }
+
ret = btrfs_start_delalloc_snapshot(root);
if (ret < 0)
goto out;
@@ -3685,7 +3813,7 @@
return ret;
}
-static int __btrfs_qgroup_release_data(struct inode *inode,
+static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len,
int free)
{
@@ -3693,28 +3821,26 @@
int trace_op = QGROUP_RELEASE;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
- &BTRFS_I(inode)->root->fs_info->flags))
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
return 0;
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
- return qgroup_free_reserved_data(BTRFS_I(inode), reserved,
- start, len);
+ return qgroup_free_reserved_data(inode, reserved, start, len);
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
+ ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
+ EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
if (free)
trace_op = QGROUP_FREE;
- trace_btrfs_qgroup_release_data(inode, start, len,
+ trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
changeset.bytes_changed, trace_op);
if (free)
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
@@ -3734,7 +3860,7 @@
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_free_data(struct inode *inode,
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
@@ -3755,7 +3881,7 @@
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
@@ -3881,7 +4007,6 @@
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
int num_bytes)
{
- struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -3889,7 +4014,7 @@
if (num_bytes == 0)
return;
- if (!quota_root)
+ if (!fs_info->quota_root)
return;
spin_lock(&fs_info->qgroup_lock);
@@ -4023,7 +4148,7 @@
*/
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
- struct btrfs_block_group_cache *bg,
+ struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot)
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 0a26596..7283e4f 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -8,6 +8,7 @@
#include <linux/spinlock.h>
#include <linux/rbtree.h>
+#include <linux/kobject.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -223,8 +224,18 @@
*/
u64 old_refcnt;
u64 new_refcnt;
+
+ /*
+ * Sysfs kobjectid
+ */
+ struct kobject kobj;
};
+static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
+{
+ return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
+}
+
/*
* For qgroup event trace points only
*/
@@ -346,9 +357,10 @@
/* New io_tree based accurate qgroup reserve API */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
-int btrfs_qgroup_free_data(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len);
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
@@ -409,12 +421,13 @@
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
- struct btrfs_block_group_cache *bg,
+ struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot);
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 7ac679e..e65d0fa 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -206,7 +206,6 @@
struct btrfs_stripe_hash *h;
int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
int i;
- int table_size;
if (info->stripe_hash_table)
return 0;
@@ -218,8 +217,7 @@
* Try harder to allocate and fallback to vmalloc to lower the chance
* of a failing mount.
*/
- table_size = sizeof(*table) + sizeof(*h) * num_entries;
- table = kvzalloc(table_size, GFP_KERNEL);
+ table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -671,8 +669,7 @@
*/
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
- int bucket = rbio_bucket(rbio);
- struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+ struct btrfs_stripe_hash *h;
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *pending;
unsigned long flags;
@@ -680,64 +677,63 @@
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
+ h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
- spin_lock(&cur->bio_list_lock);
+ if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ continue;
- /* can we steal this cached rbio's pages? */
- if (bio_list_empty(&cur->bio_list) &&
- list_empty(&cur->plug_list) &&
- test_bit(RBIO_CACHE_BIT, &cur->flags) &&
- !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
- list_del_init(&cur->hash_list);
- refcount_dec(&cur->refs);
+ spin_lock(&cur->bio_list_lock);
- steal_rbio(cur, rbio);
- cache_drop = cur;
- spin_unlock(&cur->bio_list_lock);
+ /* Can we steal this cached rbio's pages? */
+ if (bio_list_empty(&cur->bio_list) &&
+ list_empty(&cur->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+ !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+ list_del_init(&cur->hash_list);
+ refcount_dec(&cur->refs);
- goto lockit;
- }
+ steal_rbio(cur, rbio);
+ cache_drop = cur;
+ spin_unlock(&cur->bio_list_lock);
- /* can we merge into the lock owner? */
- if (rbio_can_merge(cur, rbio)) {
- merge_rbio(cur, rbio);
+ goto lockit;
+ }
+
+ /* Can we merge into the lock owner? */
+ if (rbio_can_merge(cur, rbio)) {
+ merge_rbio(cur, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+
+
+ /*
+ * We couldn't merge with the running rbio, see if we can merge
+ * with the pending ones. We don't have to check for rmw_locked
+ * because there is no way they are inside finish_rmw right now
+ */
+ list_for_each_entry(pending, &cur->plug_list, plug_list) {
+ if (rbio_can_merge(pending, rbio)) {
+ merge_rbio(pending, rbio);
spin_unlock(&cur->bio_list_lock);
freeit = rbio;
ret = 1;
goto out;
}
-
-
- /*
- * we couldn't merge with the running
- * rbio, see if we can merge with the
- * pending ones. We don't have to
- * check for rmw_locked because there
- * is no way they are inside finish_rmw
- * right now
- */
- list_for_each_entry(pending, &cur->plug_list,
- plug_list) {
- if (rbio_can_merge(pending, rbio)) {
- merge_rbio(pending, rbio);
- spin_unlock(&cur->bio_list_lock);
- freeit = rbio;
- ret = 1;
- goto out;
- }
- }
-
- /* no merging, put us on the tail of the plug list,
- * our rbio will be started with the currently
- * running rbio unlocks
- */
- list_add_tail(&rbio->plug_list, &cur->plug_list);
- spin_unlock(&cur->bio_list_lock);
- ret = 1;
- goto out;
}
+
+ /*
+ * No merging, put us on the tail of the plug list, our rbio
+ * will be started with the currently running rbio unlocks
+ */
+ list_add_tail(&rbio->plug_list, &cur->plug_list);
+ spin_unlock(&cur->bio_list_lock);
+ ret = 1;
+ goto out;
}
lockit:
refcount_inc(&rbio->refs);
@@ -1087,7 +1083,6 @@
unsigned long bio_max_len)
{
struct bio *last = bio_list->tail;
- u64 last_end = 0;
int ret;
struct bio *bio;
struct btrfs_bio_stripe *stripe;
@@ -1102,15 +1097,14 @@
/* see if we can add this page onto our existing bio */
if (last) {
- last_end = (u64)last->bi_iter.bi_sector << 9;
+ u64 last_end = (u64)last->bi_iter.bi_sector << 9;
last_end += last->bi_iter.bi_size;
/*
* we can't merge these if they are from different
* devices or if they are not contiguous
*/
- if (last_end == disk_start && stripe->dev->bdev &&
- !last->bi_status &&
+ if (last_end == disk_start && !last->bi_status &&
last->bi_disk == stripe->dev->bdev->bd_disk &&
last->bi_partno == stripe->dev->bdev->bd_partno) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
@@ -1121,6 +1115,7 @@
/* put a new bio on the list */
bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+ btrfs_io_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1329,11 +1324,7 @@
atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
@@ -1358,7 +1349,6 @@
struct bio *bio)
{
u64 physical = bio->bi_iter.bi_sector;
- u64 stripe_start;
int i;
struct btrfs_bio_stripe *stripe;
@@ -1366,9 +1356,7 @@
for (i = 0; i < rbio->bbio->num_stripes; i++) {
stripe = &rbio->bbio->stripes[i];
- stripe_start = stripe->physical;
- if (physical >= stripe_start &&
- physical < stripe_start + rbio->stripe_len &&
+ if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev &&
bio->bi_disk == stripe->dev->bdev->bd_disk &&
bio->bi_partno == stripe->dev->bdev->bd_partno) {
@@ -1386,18 +1374,14 @@
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
- u64 logical = bio->bi_iter.bi_sector;
- u64 stripe_start;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
int i;
- logical <<= 9;
-
for (i = 0; i < rbio->nr_data; i++) {
- stripe_start = rbio->bbio->raid_map[i];
- if (logical >= stripe_start &&
- logical < stripe_start + rbio->stripe_len) {
+ u64 stripe_start = rbio->bbio->raid_map[i];
+
+ if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
- }
}
return -1;
}
@@ -1571,11 +1555,7 @@
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
@@ -1666,7 +1646,8 @@
/*
* rbios on the plug list are sorted for easier merging.
*/
-static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int plug_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
plug_list);
@@ -1882,11 +1863,8 @@
}
/* make sure our ps and qs are in order */
- if (faila > failb) {
- int tmp = failb;
- failb = faila;
- faila = tmp;
- }
+ if (faila > failb)
+ swap(faila, failb);
/* if the q stripe is failed, do a pstripe reconstruction
* from the xors.
@@ -2106,7 +2084,7 @@
*/
if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
__raid_recover_end_io(rbio);
- goto out;
+ return 0;
} else {
goto cleanup;
}
@@ -2117,11 +2095,7 @@
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
@@ -2130,7 +2104,7 @@
submit_bio(bio);
}
-out:
+
return 0;
cleanup:
@@ -2485,11 +2459,7 @@
atomic_set(&rbio->stripes_pending, nr_data);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
@@ -2667,11 +2637,7 @@
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index a97dc74..5c1a617 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -8,7 +8,7 @@
struct rcu_string {
struct rcu_head rcu;
- char str[0];
+ char str[];
};
static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 2656dc8..d9a166e 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -227,7 +227,7 @@
struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
struct reada_zone *zone;
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
u64 start;
u64 end;
int i;
@@ -248,8 +248,8 @@
if (!cache)
return NULL;
- start = cache->key.objectid;
- end = start + cache->key.offset - 1;
+ start = cache->start;
+ end = start + cache->length - 1;
btrfs_put_block_group(cache);
zone = kzalloc(sizeof(*zone), GFP_KERNEL);
@@ -772,31 +772,39 @@
kfree(rmw);
}
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+/* Try to start up to 10k READA requests for a group of devices */
+static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 enqueued;
u64 total = 0;
- int i;
+ struct btrfs_device *device;
-again:
do {
enqueued = 0;
- mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
enqueued += reada_start_machine_dev(device);
}
- mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+ return total;
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ int i;
+ u64 enqueued = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ enqueued += reada_start_for_fsdevs(fs_devices);
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ enqueued += reada_start_for_fsdevs(seed_devs);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
if (enqueued == 0)
return;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index bbd6353..78693d3 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -511,7 +511,7 @@
switch (key.type) {
case BTRFS_EXTENT_ITEM_KEY:
*num_bytes = key.offset;
- /* fall through */
+ fallthrough;
case BTRFS_METADATA_ITEM_KEY:
*bytenr = key.objectid;
ret = process_extent_item(fs_info, path, &key, i,
@@ -805,6 +805,15 @@
kfree(ref);
kfree(ra);
goto out_unlock;
+ } else if (be->num_refs == 0) {
+ btrfs_err(fs_info,
+ "trying to do action %d for a bytenr that has 0 total references",
+ action);
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ goto out_unlock;
}
if (!parent) {
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
new file mode 100644
index 0000000..3a3102b
--- /dev/null
+++ b/fs/btrfs/reflink.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blkdev.h>
+#include <linux/iversion.h>
+#include "compression.h"
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "reflink.h"
+#include "transaction.h"
+
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
+
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ u64 endoff,
+ const u64 destoff,
+ const u64 olen,
+ int no_time_update)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ inode_inc_iversion(inode);
+ if (!no_time_update)
+ inode->i_mtime = inode->i_ctime = current_time(inode);
+ /*
+ * We round up to the block size at eof when determining which
+ * extents to clone above, but shouldn't round up the file size.
+ */
+ if (endoff > destoff + olen)
+ endoff = destoff + olen;
+ if (endoff > inode->i_size) {
+ i_size_write(inode, endoff);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
+ }
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans);
+out:
+ return ret;
+}
+
+static int copy_inline_to_page(struct btrfs_inode *inode,
+ const u64 file_offset,
+ char *inline_data,
+ const u64 size,
+ const u64 datal,
+ const u8 comp_type)
+{
+ const u64 block_size = btrfs_inode_sectorsize(inode);
+ const u64 range_end = file_offset + block_size - 1;
+ const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
+ char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
+ struct extent_changeset *data_reserved = NULL;
+ struct page *page = NULL;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ int ret;
+
+ ASSERT(IS_ALIGNED(file_offset, block_size));
+
+ /*
+ * We have flushed and locked the ranges of the source and destination
+ * inodes, we also have locked the inodes, so we are safe to do a
+ * reservation here. Also we must not do the reservation while holding
+ * a transaction open, otherwise we would deadlock.
+ */
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+ block_size);
+ if (ret)
+ goto out;
+
+ page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
+ btrfs_alloc_write_mask(mapping));
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ set_page_extent_mapped(page);
+ clear_extent_bit(&inode->io_tree, file_offset, range_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, NULL);
+ ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * After dirtying the page our caller will need to start a transaction,
+ * and if we are low on metadata free space, that can cause flushing of
+ * delalloc for all inodes in order to get metadata space released.
+ * However we are holding the range locked for the whole duration of
+ * the clone/dedupe operation, so we may deadlock if that happens and no
+ * other task releases enough space. So mark this inode as not being
+ * possible to flush to avoid such deadlock. We will clear that flag
+ * when we finish cloning all extents, since a transaction is started
+ * after finding each extent to clone.
+ */
+ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
+
+ if (comp_type == BTRFS_COMPRESS_NONE) {
+ char *map;
+
+ map = kmap(page);
+ memcpy(map, data_start, datal);
+ flush_dcache_page(page);
+ kunmap(page);
+ } else {
+ ret = btrfs_decompress(comp_type, data_start, page, 0,
+ inline_size, datal);
+ if (ret)
+ goto out_unlock;
+ flush_dcache_page(page);
+ }
+
+ /*
+ * If our inline data is smaller then the block/page size, then the
+ * remaining of the block/page is equivalent to zeroes. We had something
+ * like the following done:
+ *
+ * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
+ * $ sync # (or fsync)
+ * $ xfs_io -c "falloc 0 4K" file
+ * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
+ *
+ * So what's in the range [500, 4095] corresponds to zeroes.
+ */
+ if (datal < block_size) {
+ char *map;
+
+ map = kmap(page);
+ memset(map + datal, 0, block_size - datal);
+ flush_dcache_page(page);
+ kunmap(page);
+ }
+
+ SetPageUptodate(page);
+ ClearPageChecked(page);
+ set_page_dirty(page);
+out_unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ if (ret)
+ btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+ block_size, true);
+ btrfs_delalloc_release_extents(inode, block_size);
+out:
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+/*
+ * Deal with cloning of inline extents. We try to copy the inline extent from
+ * the source inode to destination inode when possible. When not possible we
+ * copy the inline extent's data into the respective page of the inode.
+ */
+static int clone_copy_inline_extent(struct inode *dst,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ const u64 drop_start,
+ const u64 datal,
+ const u64 size,
+ const u8 comp_type,
+ char *inline_data,
+ struct btrfs_trans_handle **trans_out)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
+ struct btrfs_root *root = BTRFS_I(dst)->root;
+ const u64 aligned_end = ALIGN(new_key->offset + datal,
+ fs_info->sectorsize);
+ struct btrfs_trans_handle *trans = NULL;
+ int ret;
+ struct btrfs_key key;
+
+ if (new_key->offset > 0) {
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
+ }
+
+ key.objectid = btrfs_ino(BTRFS_I(dst));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ goto copy_inline_extent;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+ key.type == BTRFS_EXTENT_DATA_KEY) {
+ /*
+ * There's an implicit hole at file offset 0, copy the
+ * inline extent's data to the page.
+ */
+ ASSERT(key.offset > 0);
+ goto copy_to_page;
+ }
+ } else if (i_size_read(dst) <= datal) {
+ struct btrfs_file_extent_item *ei;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ /*
+ * If it's an inline extent replace it with the source inline
+ * extent, otherwise copy the source inline extent data into
+ * the respective page at the destination inode.
+ */
+ if (btrfs_file_extent_type(path->nodes[0], ei) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ goto copy_inline_extent;
+
+ goto copy_to_page;
+ }
+
+copy_inline_extent:
+ /*
+ * We have no extent items, or we have an extent at offset 0 which may
+ * or may not be inlined. All these cases are dealt the same way.
+ */
+ if (i_size_read(dst) > datal) {
+ /*
+ * At the destination offset 0 we have either a hole, a regular
+ * extent or an inline extent larger then the one we want to
+ * clone. Deal with all these cases by copying the inline extent
+ * data into the respective page at the destination inode.
+ */
+ goto copy_to_page;
+ }
+
+ /*
+ * Release path before starting a new transaction so we don't hold locks
+ * that would confuse lockdep.
+ */
+ btrfs_release_path(path);
+ /*
+ * If we end up here it means were copy the inline extent into a leaf
+ * of the destination inode. We know we will drop or adjust at most one
+ * extent item in the destination root.
+ *
+ * 1 unit - adjusting old extent (we may have to split it)
+ * 1 unit - add new extent
+ * 1 unit - inode update
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+ ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+ if (ret)
+ goto out;
+ ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+ if (ret)
+ goto out;
+
+ write_extent_buffer(path->nodes[0], inline_data,
+ btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]),
+ size);
+ inode_add_bytes(dst, datal);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
+out:
+ if (!ret && !trans) {
+ /*
+ * No transaction here means we copied the inline extent into a
+ * page of the destination inode.
+ *
+ * 1 unit to update inode item
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ }
+ }
+ if (ret && trans) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ }
+ if (!ret)
+ *trans_out = trans;
+
+ return ret;
+
+copy_to_page:
+ /*
+ * Release our path because we don't need it anymore and also because
+ * copy_inline_to_page() needs to reserve data and metadata, which may
+ * need to flush delalloc when we are low on available space and
+ * therefore cause a deadlock if writeback of an inline extent needs to
+ * write to the same leaf or an ordered extent completion needs to write
+ * to the same leaf.
+ */
+ btrfs_release_path(path);
+
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
+}
+
+/**
+ * btrfs_clone() - clone a range from inode file to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen
+ * @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+ const u64 off, const u64 olen, const u64 olen_aligned,
+ const u64 destoff, int no_time_update)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *leaf;
+ struct btrfs_trans_handle *trans;
+ char *buf = NULL;
+ struct btrfs_key key;
+ u32 nritems;
+ int slot;
+ int ret;
+ const u64 len = olen_aligned;
+ u64 last_dest_end = destoff;
+
+ ret = -ENOMEM;
+ buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+ if (!buf)
+ return ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ kvfree(buf);
+ return ret;
+ }
+
+ path->reada = READA_FORWARD;
+ /* Clone data */
+ key.objectid = btrfs_ino(BTRFS_I(src));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = off;
+
+ while (1) {
+ u64 next_key_min_offset = key.offset + 1;
+ struct btrfs_file_extent_item *extent;
+ u64 extent_gen;
+ int type;
+ u32 size;
+ struct btrfs_key new_key;
+ u64 disko = 0, diskl = 0;
+ u64 datao = 0, datal = 0;
+ u8 comp;
+ u64 drop_start;
+
+ /* Note the key will change type as we walk through the tree */
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ goto out;
+ /*
+ * First search, if no extent item that starts at offset off was
+ * found but the previous item is an extent item, it's possible
+ * it might overlap our target range, therefore process it.
+ */
+ if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0] - 1);
+ if (key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ }
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.type > BTRFS_EXTENT_DATA_KEY ||
+ key.objectid != btrfs_ino(BTRFS_I(src)))
+ break;
+
+ ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ extent_gen = btrfs_file_extent_generation(leaf, extent);
+ comp = btrfs_file_extent_compression(leaf, extent);
+ type = btrfs_file_extent_type(leaf, extent);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+ diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+ datao = btrfs_file_extent_offset(leaf, extent);
+ datal = btrfs_file_extent_num_bytes(leaf, extent);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /* Take upper bound, may be compressed */
+ datal = btrfs_file_extent_ram_bytes(leaf, extent);
+ }
+
+ /*
+ * The first search might have left us at an extent item that
+ * ends before our target range's start, can happen if we have
+ * holes and NO_HOLES feature enabled.
+ */
+ if (key.offset + datal <= off) {
+ path->slots[0]++;
+ goto process_slot;
+ } else if (key.offset >= off + len) {
+ break;
+ }
+ next_key_min_offset = key.offset + datal;
+ size = btrfs_item_size_nr(leaf, slot);
+ read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ btrfs_release_path(path);
+ path->leave_spinning = 0;
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.objectid = btrfs_ino(BTRFS_I(inode));
+ if (off <= key.offset)
+ new_key.offset = key.offset + destoff - off;
+ else
+ new_key.offset = destoff;
+
+ /*
+ * Deal with a hole that doesn't have an extent item that
+ * represents it (NO_HOLES feature enabled).
+ * This hole is either in the middle of the cloning range or at
+ * the beginning (fully overlaps it or partially overlaps it).
+ */
+ if (new_key.offset != last_dest_end)
+ drop_start = last_dest_end;
+ else
+ drop_start = new_key.offset;
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ struct btrfs_replace_extent_info clone_info;
+
+ /*
+ * a | --- range to clone ---| b
+ * | ------------- extent ------------- |
+ */
+
+ /* Subtract range b */
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ /* Subtract range a */
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+
+ clone_info.disk_offset = disko;
+ clone_info.disk_len = diskl;
+ clone_info.data_offset = datao;
+ clone_info.data_len = datal;
+ clone_info.file_offset = new_key.offset;
+ clone_info.extent_buf = buf;
+ clone_info.is_new_extent = false;
+ ret = btrfs_replace_file_extents(inode, path, drop_start,
+ new_key.offset + datal - 1, &clone_info,
+ &trans);
+ if (ret)
+ goto out;
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /*
+ * Inline extents always have to start at file offset 0
+ * and can never be bigger then the sector size. We can
+ * never clone only parts of an inline extent, since all
+ * reflink operations must start at a sector size aligned
+ * offset, and the length must be aligned too or end at
+ * the i_size (which implies the whole inlined data).
+ */
+ ASSERT(key.offset == 0);
+ ASSERT(datal <= fs_info->sectorsize);
+ if (key.offset != 0 || datal > fs_info->sectorsize)
+ return -EUCLEAN;
+
+ ret = clone_copy_inline_extent(inode, path, &new_key,
+ drop_start, datal, size,
+ comp, buf, &trans);
+ if (ret)
+ goto out;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * If this is a new extent update the last_reflink_trans of both
+ * inodes. This is used by fsync to make sure it does not log
+ * multiple checksum items with overlapping ranges. For older
+ * extents we don't need to do it since inode logging skips the
+ * checksums for older extents. Also ignore holes and inline
+ * extents because they don't have checksums in the csum tree.
+ */
+ if (extent_gen == trans->transid && disko > 0) {
+ BTRFS_I(src)->last_reflink_trans = trans->transid;
+ BTRFS_I(inode)->last_reflink_trans = trans->transid;
+ }
+
+ last_dest_end = ALIGN(new_key.offset + datal,
+ fs_info->sectorsize);
+ ret = clone_finish_inode_update(trans, inode, last_dest_end,
+ destoff, olen, no_time_update);
+ if (ret)
+ goto out;
+ if (new_key.offset + datal >= destoff + len)
+ break;
+
+ btrfs_release_path(path);
+ key.offset = next_key_min_offset;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ cond_resched();
+ }
+ ret = 0;
+
+ if (last_dest_end < destoff + len) {
+ /*
+ * We have an implicit hole that fully or partially overlaps our
+ * cloning range at its end. This means that we either have the
+ * NO_HOLES feature enabled or the implicit hole happened due to
+ * mixing buffered and direct IO writes against this file.
+ */
+ btrfs_release_path(path);
+ path->leave_spinning = 0;
+
+ /*
+ * When using NO_HOLES and we are cloning a range that covers
+ * only a hole (no extents) into a range beyond the current
+ * i_size, punching a hole in the target range will not create
+ * an extent map defining a hole, because the range starts at or
+ * beyond current i_size. If the file previously had an i_size
+ * greater than the new i_size set by this clone operation, we
+ * need to make sure the next fsync is a full fsync, so that it
+ * detects and logs a hole covering a range from the current
+ * i_size to the new i_size. If the clone range covers extents,
+ * besides a hole, then we know the full sync flag was already
+ * set by previous calls to btrfs_replace_file_extents() that
+ * replaced file extent items.
+ */
+ if (last_dest_end >= i_size_read(inode))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ ret = btrfs_replace_file_extents(inode, path, last_dest_end,
+ destoff + len - 1, NULL, &trans);
+ if (ret)
+ goto out;
+
+ ret = clone_finish_inode_update(trans, inode, destoff + len,
+ destoff, olen, no_time_update);
+ }
+
+out:
+ btrfs_free_path(path);
+ kvfree(buf);
+ clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
+
+ return ret;
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ if (inode1 < inode2) {
+ swap(inode1, inode2);
+ swap(loff1, loff2);
+ } else if (inode1 == inode2 && loff2 < loff1) {
+ swap(loff1, loff2);
+ }
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
+ struct inode *dst, u64 dst_loff)
+{
+ const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+ int ret;
+
+ /*
+ * Lock destination range to serialize with concurrent readpages() and
+ * source range to serialize with relocation.
+ */
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+
+ return ret;
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
+ struct inode *dst, u64 dst_loff)
+{
+ int ret = 0;
+ u64 i, tail_len, chunk_count;
+ struct btrfs_root *root_dst = BTRFS_I(dst)->root;
+
+ spin_lock(&root_dst->root_item_lock);
+ if (root_dst->send_in_progress) {
+ btrfs_warn_rl(root_dst->fs_info,
+"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
+ root_dst->root_key.objectid,
+ root_dst->send_in_progress);
+ spin_unlock(&root_dst->root_item_lock);
+ return -EAGAIN;
+ }
+ root_dst->dedupe_in_progress++;
+ spin_unlock(&root_dst->root_item_lock);
+
+ tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
+ chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
+
+ for (i = 0; i < chunk_count; i++) {
+ ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
+ dst, dst_loff);
+ if (ret)
+ goto out;
+
+ loff += BTRFS_MAX_DEDUPE_LEN;
+ dst_loff += BTRFS_MAX_DEDUPE_LEN;
+ }
+
+ if (tail_len > 0)
+ ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+out:
+ spin_lock(&root_dst->root_item_lock);
+ root_dst->dedupe_in_progress--;
+ spin_unlock(&root_dst->root_item_lock);
+
+ return ret;
+}
+
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+ u64 off, u64 olen, u64 destoff)
+{
+ struct inode *inode = file_inode(file);
+ struct inode *src = file_inode(file_src);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ int ret;
+ int wb_ret;
+ u64 len = olen;
+ u64 bs = fs_info->sb->s_blocksize;
+
+ /*
+ * VFS's generic_remap_file_range_prep() protects us from cloning the
+ * eof block into the middle of a file, which would result in corruption
+ * if the file size is not blocksize aligned. So we don't need to check
+ * for that case here.
+ */
+ if (off + len == src->i_size)
+ len = ALIGN(src->i_size, bs) - off;
+
+ if (destoff > inode->i_size) {
+ const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
+
+ ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+ if (ret)
+ return ret;
+ /*
+ * We may have truncated the last block if the inode's size is
+ * not sector size aligned, so we need to wait for writeback to
+ * complete before proceeding further, otherwise we can race
+ * with cloning and attempt to increment a reference to an
+ * extent that no longer exists (writeback completed right after
+ * we found the previous extent covering eof and before we
+ * attempted to increment its reference count).
+ */
+ ret = btrfs_wait_ordered_range(inode, wb_start,
+ destoff - wb_start);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Lock destination range to serialize with concurrent readpages() and
+ * source range to serialize with relocation.
+ */
+ btrfs_double_extent_lock(src, off, inode, destoff, len);
+ ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
+ btrfs_double_extent_unlock(src, off, inode, destoff, len);
+
+ /*
+ * We may have copied an inline extent into a page of the destination
+ * range, so wait for writeback to complete before truncating pages
+ * from the page cache. This is a rare case.
+ */
+ wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+ ret = ret ? ret : wb_ret;
+ /*
+ * Truncate page cache pages so that future reads will see the cloned
+ * data immediately and not the previous data.
+ */
+ truncate_inode_pages_range(&inode->i_data,
+ round_down(destoff, PAGE_SIZE),
+ round_up(destoff + len, PAGE_SIZE) - 1);
+
+ return ret;
+}
+
+static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
+ bool same_inode = inode_out == inode_in;
+ u64 wb_len;
+ int ret;
+
+ if (!(remap_flags & REMAP_FILE_DEDUP)) {
+ struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+
+ if (btrfs_root_readonly(root_out))
+ return -EROFS;
+
+ if (file_in->f_path.mnt != file_out->f_path.mnt ||
+ inode_in->i_sb != inode_out->i_sb)
+ return -EXDEV;
+ }
+
+ /* Don't make the dst file partly checksummed */
+ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ return -EINVAL;
+ }
+
+ /*
+ * Now that the inodes are locked, we need to start writeback ourselves
+ * and can not rely on the writeback from the VFS's generic helper
+ * generic_remap_file_range_prep() because:
+ *
+ * 1) For compression we must call filemap_fdatawrite_range() range
+ * twice (btrfs_fdatawrite_range() does it for us), and the generic
+ * helper only calls it once;
+ *
+ * 2) filemap_fdatawrite_range(), called by the generic helper only
+ * waits for the writeback to complete, i.e. for IO to be done, and
+ * not for the ordered extents to complete. We need to wait for them
+ * to complete so that new file extent items are in the fs tree.
+ */
+ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
+ wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ else
+ wb_len = ALIGN(*len, bs);
+
+ /*
+ * Since we don't lock ranges, wait for ongoing lockless dio writes (as
+ * any in progress could create its ordered extents after we wait for
+ * existing ordered extents below).
+ */
+ inode_dio_wait(inode_in);
+ if (!same_inode)
+ inode_dio_wait(inode_out);
+
+ /*
+ * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
+ *
+ * Btrfs' back references do not have a block level granularity, they
+ * work at the whole extent level.
+ * NOCOW buffered write without data space reserved may not be able
+ * to fall back to CoW due to lack of data space, thus could cause
+ * data loss.
+ *
+ * Here we take a shortcut by flushing the whole inode, so that all
+ * nocow write should reach disk as nocow before we increase the
+ * reference of the extent. We could do better by only flushing NOCOW
+ * data, but that needs extra accounting.
+ *
+ * Also we don't need to check ASYNC_EXTENT, as async extent will be
+ * CoWed anyway, not affecting nocow part.
+ */
+ ret = filemap_flush(inode_in->i_mapping);
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+ wb_len);
+ if (ret < 0)
+ return ret;
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+ wb_len);
+ if (ret < 0)
+ return ret;
+
+ return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ len, remap_flags);
+}
+
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, loff_t len,
+ unsigned int remap_flags)
+{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *dst_inode = file_inode(dst_file);
+ bool same_inode = dst_inode == src_inode;
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (same_inode)
+ inode_lock(src_inode);
+ else
+ lock_two_nondirectories(src_inode, dst_inode);
+
+ ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto out_unlock;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ else
+ ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+
+out_unlock:
+ if (same_inode)
+ inode_unlock(src_inode);
+ else
+ unlock_two_nondirectories(src_inode, dst_inode);
+
+ return ret < 0 ? ret : len;
+}
diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h
new file mode 100644
index 0000000..ecb309b
--- /dev/null
+++ b/fs/btrfs/reflink.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_REFLINK_H
+#define BTRFS_REFLINK_H
+
+#include <linux/fs.h>
+
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
+
+#endif /* BTRFS_REFLINK_H */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ba68b0b..c21545c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -9,6 +9,7 @@
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -22,101 +23,65 @@
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "backref.h"
+#include "misc.h"
/*
- * backref_node, mapping_node and tree_block start with this
+ * Relocation overview
+ *
+ * [What does relocation do]
+ *
+ * The objective of relocation is to relocate all extents of the target block
+ * group to other block groups.
+ * This is utilized by resize (shrink only), profile converting, compacting
+ * space, or balance routine to spread chunks over devices.
+ *
+ * Before | After
+ * ------------------------------------------------------------------
+ * BG A: 10 data extents | BG A: deleted
+ * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated)
+ * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated)
+ *
+ * [How does relocation work]
+ *
+ * 1. Mark the target block group read-only
+ * New extents won't be allocated from the target block group.
+ *
+ * 2.1 Record each extent in the target block group
+ * To build a proper map of extents to be relocated.
+ *
+ * 2.2 Build data reloc tree and reloc trees
+ * Data reloc tree will contain an inode, recording all newly relocated
+ * data extents.
+ * There will be only one data reloc tree for one data block group.
+ *
+ * Reloc tree will be a special snapshot of its source tree, containing
+ * relocated tree blocks.
+ * Each tree referring to a tree block in target block group will get its
+ * reloc tree built.
+ *
+ * 2.3 Swap source tree with its corresponding reloc tree
+ * Each involved tree only refers to new extents after swap.
+ *
+ * 3. Cleanup reloc trees and data reloc tree.
+ * As old extents in the target block group are still referenced by reloc
+ * trees, we need to clean them up before really freeing the target block
+ * group.
+ *
+ * The main complexity is in steps 2.2 and 2.3.
+ *
+ * The entry point of relocation is relocate_block_group() function.
*/
-struct tree_entry {
- struct rb_node rb_node;
- u64 bytenr;
-};
-/*
- * present a tree block in the backref cache
- */
-struct backref_node {
- struct rb_node rb_node;
- u64 bytenr;
-
- u64 new_bytenr;
- /* objectid of tree block owner, can be not uptodate */
- u64 owner;
- /* link to pending, changed or detached list */
- struct list_head list;
- /* list of upper level blocks reference this block */
- struct list_head upper;
- /* list of child blocks in the cache */
- struct list_head lower;
- /* NULL if this node is not tree root */
- struct btrfs_root *root;
- /* extent buffer got by COW the block */
- struct extent_buffer *eb;
- /* level of tree block */
- unsigned int level:8;
- /* is the block in non-reference counted tree */
- unsigned int cowonly:1;
- /* 1 if no child node in the cache */
- unsigned int lowest:1;
- /* is the extent buffer locked */
- unsigned int locked:1;
- /* has the block been processed */
- unsigned int processed:1;
- /* have backrefs of this block been checked */
- unsigned int checked:1;
- /*
- * 1 if corresponding block has been cowed but some upper
- * level block pointers may not point to the new location
- */
- unsigned int pending:1;
- /*
- * 1 if the backref node isn't connected to any other
- * backref node.
- */
- unsigned int detached:1;
-};
-
-/*
- * present a block pointer in the backref cache
- */
-struct backref_edge {
- struct list_head list[2];
- struct backref_node *node[2];
-};
-
-#define LOWER 0
-#define UPPER 1
#define RELOCATION_RESERVED_NODES 256
-
-struct backref_cache {
- /* red black tree of all backref nodes in the cache */
- struct rb_root rb_root;
- /* for passing backref nodes to btrfs_reloc_cow_block */
- struct backref_node *path[BTRFS_MAX_LEVEL];
- /*
- * list of blocks that have been cowed but some block
- * pointers in upper level blocks may not reflect the
- * new location
- */
- struct list_head pending[BTRFS_MAX_LEVEL];
- /* list of backref nodes with no child node */
- struct list_head leaves;
- /* list of blocks that have been cowed in current transaction */
- struct list_head changed;
- /* list of detached backref node. */
- struct list_head detached;
-
- u64 last_trans;
-
- int nr_nodes;
- int nr_edges;
-};
-
/*
* map address of tree root to tree
*/
struct mapping_node {
- struct rb_node rb_node;
- u64 bytenr;
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ }; /* Use rb_simle_node for search/insert */
void *data;
};
@@ -129,8 +94,10 @@
* present a tree block to process
*/
struct tree_block {
- struct rb_node rb_node;
- u64 bytenr;
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ }; /* Use rb_simple_node for search/insert */
struct btrfs_key key;
unsigned int level:8;
unsigned int key_ready:1;
@@ -147,7 +114,7 @@
struct reloc_control {
/* block group to relocate */
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
/* extent tree */
struct btrfs_root *extent_root;
/* inode for moving data */
@@ -155,7 +122,7 @@
struct btrfs_block_rsv *block_rsv;
- struct backref_cache backref_cache;
+ struct btrfs_backref_cache backref_cache;
struct file_extent_cluster cluster;
/* tree blocks have been processed */
@@ -186,10 +153,21 @@
#define MOVE_DATA_EXTENTS 0
#define UPDATE_DATA_PTRS 1
-static void remove_backref_node(struct backref_cache *cache,
- struct backref_node *node);
-static void __mark_block_processed(struct reloc_control *rc,
- struct backref_node *node);
+static void mark_block_processed(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
+{
+ u32 blocksize;
+
+ if (node->level == 0 ||
+ in_range(node->bytenr, rc->block_group->start,
+ rc->block_group->length)) {
+ blocksize = rc->extent_root->fs_info->nodesize;
+ set_extent_bits(&rc->processed_blocks, node->bytenr,
+ node->bytenr + blocksize - 1, EXTENT_DIRTY);
+ }
+ node->processed = 1;
+}
+
static void mapping_tree_init(struct mapping_tree *tree)
{
@@ -197,156 +175,19 @@
spin_lock_init(&tree->lock);
}
-static void backref_cache_init(struct backref_cache *cache)
-{
- int i;
- cache->rb_root = RB_ROOT;
- for (i = 0; i < BTRFS_MAX_LEVEL; i++)
- INIT_LIST_HEAD(&cache->pending[i]);
- INIT_LIST_HEAD(&cache->changed);
- INIT_LIST_HEAD(&cache->detached);
- INIT_LIST_HEAD(&cache->leaves);
-}
-
-static void backref_cache_cleanup(struct backref_cache *cache)
-{
- struct backref_node *node;
- int i;
-
- while (!list_empty(&cache->detached)) {
- node = list_entry(cache->detached.next,
- struct backref_node, list);
- remove_backref_node(cache, node);
- }
-
- while (!list_empty(&cache->leaves)) {
- node = list_entry(cache->leaves.next,
- struct backref_node, lower);
- remove_backref_node(cache, node);
- }
-
- cache->last_trans = 0;
-
- for (i = 0; i < BTRFS_MAX_LEVEL; i++)
- ASSERT(list_empty(&cache->pending[i]));
- ASSERT(list_empty(&cache->changed));
- ASSERT(list_empty(&cache->detached));
- ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
- ASSERT(!cache->nr_nodes);
- ASSERT(!cache->nr_edges);
-}
-
-static struct backref_node *alloc_backref_node(struct backref_cache *cache)
-{
- struct backref_node *node;
-
- node = kzalloc(sizeof(*node), GFP_NOFS);
- if (node) {
- INIT_LIST_HEAD(&node->list);
- INIT_LIST_HEAD(&node->upper);
- INIT_LIST_HEAD(&node->lower);
- RB_CLEAR_NODE(&node->rb_node);
- cache->nr_nodes++;
- }
- return node;
-}
-
-static void free_backref_node(struct backref_cache *cache,
- struct backref_node *node)
-{
- if (node) {
- cache->nr_nodes--;
- kfree(node);
- }
-}
-
-static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
-{
- struct backref_edge *edge;
-
- edge = kzalloc(sizeof(*edge), GFP_NOFS);
- if (edge)
- cache->nr_edges++;
- return edge;
-}
-
-static void free_backref_edge(struct backref_cache *cache,
- struct backref_edge *edge)
-{
- if (edge) {
- cache->nr_edges--;
- kfree(edge);
- }
-}
-
-static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct tree_entry *entry;
-
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (bytenr < entry->bytenr)
- p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
- p = &(*p)->rb_right;
- else
- return parent;
- }
-
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
-static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
-{
- struct rb_node *n = root->rb_node;
- struct tree_entry *entry;
-
- while (n) {
- entry = rb_entry(n, struct tree_entry, rb_node);
-
- if (bytenr < entry->bytenr)
- n = n->rb_left;
- else if (bytenr > entry->bytenr)
- n = n->rb_right;
- else
- return n;
- }
- return NULL;
-}
-
-static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
-{
-
- struct btrfs_fs_info *fs_info = NULL;
- struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
- rb_node);
- if (bnode->root)
- fs_info = bnode->root->fs_info;
- btrfs_panic(fs_info, errno,
- "Inconsistency in backref cache found at offset %llu",
- bytenr);
-}
-
/*
* walk up backref nodes until reach node presents tree root
*/
-static struct backref_node *walk_up_backref(struct backref_node *node,
- struct backref_edge *edges[],
- int *index)
+static struct btrfs_backref_node *walk_up_backref(
+ struct btrfs_backref_node *node,
+ struct btrfs_backref_edge *edges[], int *index)
{
- struct backref_edge *edge;
+ struct btrfs_backref_edge *edge;
int idx = *index;
while (!list_empty(&node->upper)) {
edge = list_entry(node->upper.next,
- struct backref_edge, list[LOWER]);
+ struct btrfs_backref_edge, list[LOWER]);
edges[idx++] = edge;
node = edge->node[UPPER];
}
@@ -358,11 +199,11 @@
/*
* walk down backref nodes to find start of next reference path
*/
-static struct backref_node *walk_down_backref(struct backref_edge *edges[],
- int *index)
+static struct btrfs_backref_node *walk_down_backref(
+ struct btrfs_backref_edge *edges[], int *index)
{
- struct backref_edge *edge;
- struct backref_node *lower;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *lower;
int idx = *index;
while (idx > 0) {
@@ -373,7 +214,7 @@
continue;
}
edge = list_entry(edge->list[LOWER].next,
- struct backref_edge, list[LOWER]);
+ struct btrfs_backref_edge, list[LOWER]);
edges[idx - 1] = edge;
*index = idx;
return edge->node[UPPER];
@@ -382,95 +223,24 @@
return NULL;
}
-static void unlock_node_buffer(struct backref_node *node)
-{
- if (node->locked) {
- btrfs_tree_unlock(node->eb);
- node->locked = 0;
- }
-}
-
-static void drop_node_buffer(struct backref_node *node)
-{
- if (node->eb) {
- unlock_node_buffer(node);
- free_extent_buffer(node->eb);
- node->eb = NULL;
- }
-}
-
-static void drop_backref_node(struct backref_cache *tree,
- struct backref_node *node)
-{
- BUG_ON(!list_empty(&node->upper));
-
- drop_node_buffer(node);
- list_del(&node->list);
- list_del(&node->lower);
- if (!RB_EMPTY_NODE(&node->rb_node))
- rb_erase(&node->rb_node, &tree->rb_root);
- free_backref_node(tree, node);
-}
-
-/*
- * remove a backref node from the backref cache
- */
-static void remove_backref_node(struct backref_cache *cache,
- struct backref_node *node)
-{
- struct backref_node *upper;
- struct backref_edge *edge;
-
- if (!node)
- return;
-
- BUG_ON(!node->lowest && !node->detached);
- while (!list_empty(&node->upper)) {
- edge = list_entry(node->upper.next, struct backref_edge,
- list[LOWER]);
- upper = edge->node[UPPER];
- list_del(&edge->list[LOWER]);
- list_del(&edge->list[UPPER]);
- free_backref_edge(cache, edge);
-
- if (RB_EMPTY_NODE(&upper->rb_node)) {
- BUG_ON(!list_empty(&node->upper));
- drop_backref_node(cache, node);
- node = upper;
- node->lowest = 1;
- continue;
- }
- /*
- * add the node to leaf node list if no other
- * child block cached.
- */
- if (list_empty(&upper->lower)) {
- list_add_tail(&upper->lower, &cache->leaves);
- upper->lowest = 1;
- }
- }
-
- drop_backref_node(cache, node);
-}
-
-static void update_backref_node(struct backref_cache *cache,
- struct backref_node *node, u64 bytenr)
+static void update_backref_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node, u64 bytenr)
{
struct rb_node *rb_node;
rb_erase(&node->rb_node, &cache->rb_root);
node->bytenr = bytenr;
- rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, bytenr);
+ btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}
/*
* update backref cache after a transaction commit
*/
static int update_backref_cache(struct btrfs_trans_handle *trans,
- struct backref_cache *cache)
+ struct btrfs_backref_cache *cache)
{
- struct backref_node *node;
+ struct btrfs_backref_node *node;
int level = 0;
if (cache->last_trans == 0) {
@@ -488,13 +258,13 @@
*/
while (!list_empty(&cache->detached)) {
node = list_entry(cache->detached.next,
- struct backref_node, list);
- remove_backref_node(cache, node);
+ struct btrfs_backref_node, list);
+ btrfs_backref_cleanup_node(cache, node);
}
while (!list_empty(&cache->changed)) {
node = list_entry(cache->changed.next,
- struct backref_node, list);
+ struct btrfs_backref_node, list);
list_del_init(&node->list);
BUG_ON(node->pending);
update_backref_node(cache, node, node->new_bytenr);
@@ -535,7 +305,8 @@
*
* Reloc tree after swap is considered dead, thus not considered as valid.
* This is enough for most callers, as they don't distinguish dead reloc root
- * from no reloc root. But should_ignore_root() below is a special case.
+ * from no reloc root. But btrfs_should_ignore_reloc_root() below is a
+ * special case.
*/
static bool have_reloc_root(struct btrfs_root *root)
{
@@ -546,11 +317,11 @@
return true;
}
-static int should_ignore_root(struct btrfs_root *root)
+int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
/* This root has been merged with its reloc tree, we can ignore it */
@@ -572,624 +343,187 @@
*/
return 1;
}
+
/*
* find reloc tree by address of tree root
*/
-static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
- u64 bytenr)
+struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
+ struct reloc_control *rc = fs_info->reloc_ctl;
struct rb_node *rb_node;
struct mapping_node *node;
struct btrfs_root *root = NULL;
+ ASSERT(rc);
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
+ rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
root = (struct btrfs_root *)node->data;
}
spin_unlock(&rc->reloc_root_tree.lock);
- return root;
-}
-
-static int is_cowonly_root(u64 root_objectid)
-{
- if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
- root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
- root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
- root_objectid == BTRFS_DEV_TREE_OBJECTID ||
- root_objectid == BTRFS_TREE_LOG_OBJECTID ||
- root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
- root_objectid == BTRFS_UUID_TREE_OBJECTID ||
- root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
- root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return 1;
- return 0;
-}
-
-static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_objectid)
-{
- struct btrfs_key key;
-
- key.objectid = root_objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- if (is_cowonly_root(root_objectid))
- key.offset = 0;
- else
- key.offset = (u64)-1;
-
- return btrfs_get_fs_root(fs_info, &key, false);
-}
-
-static noinline_for_stack
-int find_inline_backref(struct extent_buffer *leaf, int slot,
- unsigned long *ptr, unsigned long *end)
-{
- struct btrfs_key key;
- struct btrfs_extent_item *ei;
- struct btrfs_tree_block_info *bi;
- u32 item_size;
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
-
- item_size = btrfs_item_size_nr(leaf, slot);
- if (item_size < sizeof(*ei)) {
- btrfs_print_v0_err(leaf->fs_info);
- btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL);
- return 1;
- }
- ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
- WARN_ON(!(btrfs_extent_flags(leaf, ei) &
- BTRFS_EXTENT_FLAG_TREE_BLOCK));
-
- if (key.type == BTRFS_EXTENT_ITEM_KEY &&
- item_size <= sizeof(*ei) + sizeof(*bi)) {
- WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
- return 1;
- }
- if (key.type == BTRFS_METADATA_ITEM_KEY &&
- item_size <= sizeof(*ei)) {
- WARN_ON(item_size < sizeof(*ei));
- return 1;
- }
-
- if (key.type == BTRFS_EXTENT_ITEM_KEY) {
- bi = (struct btrfs_tree_block_info *)(ei + 1);
- *ptr = (unsigned long)(bi + 1);
- } else {
- *ptr = (unsigned long)(ei + 1);
- }
- *end = (unsigned long)ei + item_size;
- return 0;
+ return btrfs_grab_root(root);
}
/*
- * build backref tree for a given tree block. root of the backref tree
- * corresponds the tree block, leaves of the backref tree correspond
- * roots of b-trees that reference the tree block.
+ * For useless nodes, do two major clean ups:
*
- * the basic idea of this function is check backrefs of a given block
- * to find upper level blocks that reference the block, and then check
- * backrefs of these upper level blocks recursively. the recursion stop
- * when tree root is reached or backrefs for the block is cached.
+ * - Cleanup the children edges and nodes
+ * If child node is also orphan (no parent) during cleanup, then the child
+ * node will also be cleaned up.
*
- * NOTE: if we find backrefs for a block are cached, we know backrefs
- * for all upper level blocks that directly/indirectly reference the
- * block are also cached.
+ * - Freeing up leaves (level 0), keeps nodes detached
+ * For nodes, the node is still cached as "detached"
+ *
+ * Return false if @node is not in the @useless_nodes list.
+ * Return true if @node is in the @useless_nodes list.
*/
-static noinline_for_stack
-struct backref_node *build_backref_tree(struct reloc_control *rc,
- struct btrfs_key *node_key,
- int level, u64 bytenr)
+static bool handle_useless_nodes(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
{
- struct backref_cache *cache = &rc->backref_cache;
- struct btrfs_path *path1; /* For searching extent root */
- struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
- struct extent_buffer *eb;
- struct btrfs_root *root;
- struct backref_node *cur;
- struct backref_node *upper;
- struct backref_node *lower;
- struct backref_node *node = NULL;
- struct backref_node *exist = NULL;
- struct backref_edge *edge;
- struct rb_node *rb_node;
- struct btrfs_key key;
- unsigned long end;
- unsigned long ptr;
- LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
- LIST_HEAD(useless);
- int cowonly;
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ struct list_head *useless_node = &cache->useless_node;
+ bool ret = false;
+
+ while (!list_empty(useless_node)) {
+ struct btrfs_backref_node *cur;
+
+ cur = list_first_entry(useless_node, struct btrfs_backref_node,
+ list);
+ list_del_init(&cur->list);
+
+ /* Only tree root nodes can be added to @useless_nodes */
+ ASSERT(list_empty(&cur->upper));
+
+ if (cur == node)
+ ret = true;
+
+ /* The node is the lowest node */
+ if (cur->lowest) {
+ list_del_init(&cur->lower);
+ cur->lowest = 0;
+ }
+
+ /* Cleanup the lower edges */
+ while (!list_empty(&cur->lower)) {
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *lower;
+
+ edge = list_entry(cur->lower.next,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ btrfs_backref_free_edge(cache, edge);
+
+ /* Child node is also orphan, queue for cleanup */
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, useless_node);
+ }
+ /* Mark this block processed for relocation */
+ mark_block_processed(rc, cur);
+
+ /*
+ * Backref nodes for tree leaves are deleted from the cache.
+ * Backref nodes for upper level tree blocks are left in the
+ * cache to avoid unnecessary backref lookup.
+ */
+ if (cur->level > 0) {
+ list_add(&cur->list, &cache->detached);
+ cur->detached = 1;
+ } else {
+ rb_erase(&cur->rb_node, &cache->rb_root);
+ btrfs_backref_free_node(cache, cur);
+ }
+ }
+ return ret;
+}
+
+/*
+ * Build backref tree for a given tree block. Root of the backref tree
+ * corresponds the tree block, leaves of the backref tree correspond roots of
+ * b-trees that reference the tree block.
+ *
+ * The basic idea of this function is check backrefs of a given block to find
+ * upper level blocks that reference the block, and then check backrefs of
+ * these upper level blocks recursively. The recursion stops when tree root is
+ * reached or backrefs for the block is cached.
+ *
+ * NOTE: if we find that backrefs for a block are cached, we know backrefs for
+ * all upper level blocks that directly/indirectly reference the block are also
+ * cached.
+ */
+static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
+ struct reloc_control *rc, struct btrfs_key *node_key,
+ int level, u64 bytenr)
+{
+ struct btrfs_backref_iter *iter;
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ /* For searching parent of TREE_BLOCK_REF */
+ struct btrfs_path *path;
+ struct btrfs_backref_node *cur;
+ struct btrfs_backref_node *node = NULL;
+ struct btrfs_backref_edge *edge;
int ret;
int err = 0;
- bool need_check = true;
- path1 = btrfs_alloc_path();
- path2 = btrfs_alloc_path();
- if (!path1 || !path2) {
+ iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
+ if (!iter)
+ return ERR_PTR(-ENOMEM);
+ path = btrfs_alloc_path();
+ if (!path) {
err = -ENOMEM;
goto out;
}
- path1->reada = READA_FORWARD;
- path2->reada = READA_FORWARD;
- node = alloc_backref_node(cache);
+ node = btrfs_backref_alloc_node(cache, bytenr, level);
if (!node) {
err = -ENOMEM;
goto out;
}
- node->bytenr = bytenr;
- node->level = level;
node->lowest = 1;
cur = node;
-again:
- end = 0;
- ptr = 0;
- key.objectid = cur->bytenr;
- key.type = BTRFS_METADATA_ITEM_KEY;
- key.offset = (u64)-1;
- path1->search_commit_root = 1;
- path1->skip_locking = 1;
- ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
- 0, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- ASSERT(ret);
- ASSERT(path1->slots[0]);
-
- path1->slots[0]--;
-
- WARN_ON(cur->checked);
- if (!list_empty(&cur->upper)) {
- /*
- * the backref was added previously when processing
- * backref of type BTRFS_TREE_BLOCK_REF_KEY
- */
- ASSERT(list_is_singular(&cur->upper));
- edge = list_entry(cur->upper.next, struct backref_edge,
- list[LOWER]);
- ASSERT(list_empty(&edge->list[UPPER]));
- exist = edge->node[UPPER];
- /*
- * add the upper level block to pending list if we need
- * check its backrefs
- */
- if (!exist->checked)
- list_add_tail(&edge->list[UPPER], &list);
- } else {
- exist = NULL;
- }
-
- while (1) {
- cond_resched();
- eb = path1->nodes[0];
-
- if (ptr >= end) {
- if (path1->slots[0] >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(rc->extent_root, path1);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret > 0)
- break;
- eb = path1->nodes[0];
- }
-
- btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
- if (key.objectid != cur->bytenr) {
- WARN_ON(exist);
- break;
- }
-
- if (key.type == BTRFS_EXTENT_ITEM_KEY ||
- key.type == BTRFS_METADATA_ITEM_KEY) {
- ret = find_inline_backref(eb, path1->slots[0],
- &ptr, &end);
- if (ret)
- goto next;
- }
- }
-
- if (ptr < end) {
- /* update key for inline back ref */
- struct btrfs_extent_inline_ref *iref;
- int type;
- iref = (struct btrfs_extent_inline_ref *)ptr;
- type = btrfs_get_extent_inline_ref_type(eb, iref,
- BTRFS_REF_TYPE_BLOCK);
- if (type == BTRFS_REF_TYPE_INVALID) {
- err = -EUCLEAN;
- goto out;
- }
- key.type = type;
- key.offset = btrfs_extent_inline_ref_offset(eb, iref);
-
- WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
- key.type != BTRFS_SHARED_BLOCK_REF_KEY);
- }
-
- /*
- * Parent node found and matches current inline ref, no need to
- * rebuild this node for this inline ref.
- */
- if (exist &&
- ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
- exist->owner == key.offset) ||
- (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
- exist->bytenr == key.offset))) {
- exist = NULL;
- goto next;
- }
-
- /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
- if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
- if (key.objectid == key.offset) {
- /*
- * Only root blocks of reloc trees use backref
- * pointing to itself.
- */
- root = find_reloc_root(rc, cur->bytenr);
- ASSERT(root);
- cur->root = root;
- break;
- }
-
- edge = alloc_backref_edge(cache);
- if (!edge) {
- err = -ENOMEM;
- goto out;
- }
- rb_node = tree_search(&cache->rb_root, key.offset);
- if (!rb_node) {
- upper = alloc_backref_node(cache);
- if (!upper) {
- free_backref_edge(cache, edge);
- err = -ENOMEM;
- goto out;
- }
- upper->bytenr = key.offset;
- upper->level = cur->level + 1;
- /*
- * backrefs for the upper level block isn't
- * cached, add the block to pending list
- */
- list_add_tail(&edge->list[UPPER], &list);
- } else {
- upper = rb_entry(rb_node, struct backref_node,
- rb_node);
- ASSERT(upper->checked);
- INIT_LIST_HEAD(&edge->list[UPPER]);
- }
- list_add_tail(&edge->list[LOWER], &cur->upper);
- edge->node[LOWER] = cur;
- edge->node[UPPER] = upper;
-
- goto next;
- } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
- err = -EINVAL;
- btrfs_print_v0_err(rc->extent_root->fs_info);
- btrfs_handle_fs_error(rc->extent_root->fs_info, err,
- NULL);
- goto out;
- } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
- goto next;
- }
-
- /*
- * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
- * means the root objectid. We need to search the tree to get
- * its parent bytenr.
- */
- root = read_fs_root(rc->extent_root->fs_info, key.offset);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
-
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- cur->cowonly = 1;
-
- if (btrfs_root_level(&root->root_item) == cur->level) {
- /* tree root */
- ASSERT(btrfs_root_bytenr(&root->root_item) ==
- cur->bytenr);
- if (should_ignore_root(root))
- list_add(&cur->list, &useless);
- else
- cur->root = root;
- break;
- }
-
- level = cur->level + 1;
-
- /* Search the tree to find parent blocks referring the block. */
- path2->search_commit_root = 1;
- path2->skip_locking = 1;
- path2->lowest_level = level;
- ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
- path2->lowest_level = 0;
+ /* Breadth-first search to build backref cache */
+ do {
+ ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
+ cur);
if (ret < 0) {
err = ret;
goto out;
}
- if (ret > 0 && path2->slots[level] > 0)
- path2->slots[level]--;
-
- eb = path2->nodes[level];
- if (btrfs_node_blockptr(eb, path2->slots[level]) !=
- cur->bytenr) {
- btrfs_err(root->fs_info,
- "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
- cur->bytenr, level - 1,
- root->root_key.objectid,
- node_key->objectid, node_key->type,
- node_key->offset);
- err = -ENOENT;
- goto out;
+ edge = list_first_entry_or_null(&cache->pending_edge,
+ struct btrfs_backref_edge, list[UPPER]);
+ /*
+ * The pending list isn't empty, take the first block to
+ * process
+ */
+ if (edge) {
+ list_del_init(&edge->list[UPPER]);
+ cur = edge->node[UPPER];
}
- lower = cur;
- need_check = true;
+ } while (edge);
- /* Add all nodes and edges in the path */
- for (; level < BTRFS_MAX_LEVEL; level++) {
- if (!path2->nodes[level]) {
- ASSERT(btrfs_root_bytenr(&root->root_item) ==
- lower->bytenr);
- if (should_ignore_root(root))
- list_add(&lower->list, &useless);
- else
- lower->root = root;
- break;
- }
-
- edge = alloc_backref_edge(cache);
- if (!edge) {
- err = -ENOMEM;
- goto out;
- }
-
- eb = path2->nodes[level];
- rb_node = tree_search(&cache->rb_root, eb->start);
- if (!rb_node) {
- upper = alloc_backref_node(cache);
- if (!upper) {
- free_backref_edge(cache, edge);
- err = -ENOMEM;
- goto out;
- }
- upper->bytenr = eb->start;
- upper->owner = btrfs_header_owner(eb);
- upper->level = lower->level + 1;
- if (!test_bit(BTRFS_ROOT_REF_COWS,
- &root->state))
- upper->cowonly = 1;
-
- /*
- * if we know the block isn't shared
- * we can void checking its backrefs.
- */
- if (btrfs_block_can_be_shared(root, eb))
- upper->checked = 0;
- else
- upper->checked = 1;
-
- /*
- * add the block to pending list if we
- * need check its backrefs, we only do this once
- * while walking up a tree as we will catch
- * anything else later on.
- */
- if (!upper->checked && need_check) {
- need_check = false;
- list_add_tail(&edge->list[UPPER],
- &list);
- } else {
- if (upper->checked)
- need_check = true;
- INIT_LIST_HEAD(&edge->list[UPPER]);
- }
- } else {
- upper = rb_entry(rb_node, struct backref_node,
- rb_node);
- ASSERT(upper->checked);
- INIT_LIST_HEAD(&edge->list[UPPER]);
- if (!upper->owner)
- upper->owner = btrfs_header_owner(eb);
- }
- list_add_tail(&edge->list[LOWER], &lower->upper);
- edge->node[LOWER] = lower;
- edge->node[UPPER] = upper;
-
- if (rb_node)
- break;
- lower = upper;
- upper = NULL;
- }
- btrfs_release_path(path2);
-next:
- if (ptr < end) {
- ptr += btrfs_extent_inline_ref_size(key.type);
- if (ptr >= end) {
- WARN_ON(ptr > end);
- ptr = 0;
- end = 0;
- }
- }
- if (ptr >= end)
- path1->slots[0]++;
- }
- btrfs_release_path(path1);
-
- cur->checked = 1;
- WARN_ON(exist);
-
- /* the pending list isn't empty, take the first block to process */
- if (!list_empty(&list)) {
- edge = list_entry(list.next, struct backref_edge, list[UPPER]);
- list_del_init(&edge->list[UPPER]);
- cur = edge->node[UPPER];
- goto again;
+ /* Finish the upper linkage of newly added edges/nodes */
+ ret = btrfs_backref_finish_upper_links(cache, node);
+ if (ret < 0) {
+ err = ret;
+ goto out;
}
- /*
- * everything goes well, connect backref nodes and insert backref nodes
- * into the cache.
- */
- ASSERT(node->checked);
- cowonly = node->cowonly;
- if (!cowonly) {
- rb_node = tree_insert(&cache->rb_root, node->bytenr,
- &node->rb_node);
- if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, node->bytenr);
- list_add_tail(&node->lower, &cache->leaves);
- }
-
- list_for_each_entry(edge, &node->upper, list[LOWER])
- list_add_tail(&edge->list[UPPER], &list);
-
- while (!list_empty(&list)) {
- edge = list_entry(list.next, struct backref_edge, list[UPPER]);
- list_del_init(&edge->list[UPPER]);
- upper = edge->node[UPPER];
- if (upper->detached) {
- list_del(&edge->list[LOWER]);
- lower = edge->node[LOWER];
- free_backref_edge(cache, edge);
- if (list_empty(&lower->upper))
- list_add(&lower->list, &useless);
- continue;
- }
-
- if (!RB_EMPTY_NODE(&upper->rb_node)) {
- if (upper->lowest) {
- list_del_init(&upper->lower);
- upper->lowest = 0;
- }
-
- list_add_tail(&edge->list[UPPER], &upper->lower);
- continue;
- }
-
- if (!upper->checked) {
- /*
- * Still want to blow up for developers since this is a
- * logic bug.
- */
- ASSERT(0);
- err = -EINVAL;
- goto out;
- }
- if (cowonly != upper->cowonly) {
- ASSERT(0);
- err = -EINVAL;
- goto out;
- }
-
- if (!cowonly) {
- rb_node = tree_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
- if (rb_node)
- backref_tree_panic(rb_node, -EEXIST,
- upper->bytenr);
- }
-
- list_add_tail(&edge->list[UPPER], &upper->lower);
-
- list_for_each_entry(edge, &upper->upper, list[LOWER])
- list_add_tail(&edge->list[UPPER], &list);
- }
- /*
- * process useless backref nodes. backref nodes for tree leaves
- * are deleted from the cache. backref nodes for upper level
- * tree blocks are left in the cache to avoid unnecessary backref
- * lookup.
- */
- while (!list_empty(&useless)) {
- upper = list_entry(useless.next, struct backref_node, list);
- list_del_init(&upper->list);
- ASSERT(list_empty(&upper->upper));
- if (upper == node)
- node = NULL;
- if (upper->lowest) {
- list_del_init(&upper->lower);
- upper->lowest = 0;
- }
- while (!list_empty(&upper->lower)) {
- edge = list_entry(upper->lower.next,
- struct backref_edge, list[UPPER]);
- list_del(&edge->list[UPPER]);
- list_del(&edge->list[LOWER]);
- lower = edge->node[LOWER];
- free_backref_edge(cache, edge);
-
- if (list_empty(&lower->upper))
- list_add(&lower->list, &useless);
- }
- __mark_block_processed(rc, upper);
- if (upper->level > 0) {
- list_add(&upper->list, &cache->detached);
- upper->detached = 1;
- } else {
- rb_erase(&upper->rb_node, &cache->rb_root);
- free_backref_node(cache, upper);
- }
- }
+ if (handle_useless_nodes(rc, node))
+ node = NULL;
out:
- btrfs_free_path(path1);
- btrfs_free_path(path2);
+ btrfs_backref_iter_free(iter);
+ btrfs_free_path(path);
if (err) {
- while (!list_empty(&useless)) {
- lower = list_entry(useless.next,
- struct backref_node, list);
- list_del_init(&lower->list);
- }
- while (!list_empty(&list)) {
- edge = list_first_entry(&list, struct backref_edge,
- list[UPPER]);
- list_del(&edge->list[UPPER]);
- list_del(&edge->list[LOWER]);
- lower = edge->node[LOWER];
- upper = edge->node[UPPER];
- free_backref_edge(cache, edge);
-
- /*
- * Lower is no longer linked to any upper backref nodes
- * and isn't in the cache, we can free it ourselves.
- */
- if (list_empty(&lower->upper) &&
- RB_EMPTY_NODE(&lower->rb_node))
- list_add(&lower->list, &useless);
-
- if (!RB_EMPTY_NODE(&upper->rb_node))
- continue;
-
- /* Add this guy's upper edges to the list to process */
- list_for_each_entry(edge, &upper->upper, list[LOWER])
- list_add_tail(&edge->list[UPPER], &list);
- if (list_empty(&upper->upper))
- list_add(&upper->list, &useless);
- }
-
- while (!list_empty(&useless)) {
- lower = list_entry(useless.next,
- struct backref_node, list);
- list_del_init(&lower->list);
- if (lower == node)
- node = NULL;
- free_backref_node(cache, lower);
- }
-
- remove_backref_node(cache, node);
+ btrfs_backref_error_cleanup(cache, node);
return ERR_PTR(err);
}
ASSERT(!node || !node->detached);
+ ASSERT(list_empty(&cache->useless_node) &&
+ list_empty(&cache->pending_edge));
return node;
}
@@ -1204,19 +538,19 @@
struct btrfs_root *dest)
{
struct btrfs_root *reloc_root = src->reloc_root;
- struct backref_cache *cache = &rc->backref_cache;
- struct backref_node *node = NULL;
- struct backref_node *new_node;
- struct backref_edge *edge;
- struct backref_edge *new_edge;
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ struct btrfs_backref_node *node = NULL;
+ struct btrfs_backref_node *new_node;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *new_edge;
struct rb_node *rb_node;
if (cache->last_trans > 0)
update_backref_cache(trans, cache);
- rb_node = tree_search(&cache->rb_root, src->commit_root->start);
+ rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
if (rb_node) {
- node = rb_entry(rb_node, struct backref_node, rb_node);
+ node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
if (node->detached)
node = NULL;
else
@@ -1224,10 +558,10 @@
}
if (!node) {
- rb_node = tree_search(&cache->rb_root,
- reloc_root->commit_root->start);
+ rb_node = rb_simple_search(&cache->rb_root,
+ reloc_root->commit_root->start);
if (rb_node) {
- node = rb_entry(rb_node, struct backref_node,
+ node = rb_entry(rb_node, struct btrfs_backref_node,
rb_node);
BUG_ON(node->detached);
}
@@ -1236,35 +570,33 @@
if (!node)
return 0;
- new_node = alloc_backref_node(cache);
+ new_node = btrfs_backref_alloc_node(cache, dest->node->start,
+ node->level);
if (!new_node)
return -ENOMEM;
- new_node->bytenr = dest->node->start;
- new_node->level = node->level;
new_node->lowest = node->lowest;
new_node->checked = 1;
- new_node->root = dest;
+ new_node->root = btrfs_grab_root(dest);
+ ASSERT(new_node->root);
if (!node->lowest) {
list_for_each_entry(edge, &node->lower, list[UPPER]) {
- new_edge = alloc_backref_edge(cache);
+ new_edge = btrfs_backref_alloc_edge(cache);
if (!new_edge)
goto fail;
- new_edge->node[UPPER] = new_node;
- new_edge->node[LOWER] = edge->node[LOWER];
- list_add_tail(&new_edge->list[UPPER],
- &new_node->lower);
+ btrfs_backref_link_edge(new_edge, edge->node[LOWER],
+ new_node, LINK_UPPER);
}
} else {
list_add_tail(&new_node->lower, &cache->leaves);
}
- rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
- &new_node->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
+ &new_node->rb_node);
if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
+ btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
if (!new_node->lowest) {
list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
@@ -1276,11 +608,11 @@
fail:
while (!list_empty(&new_node->lower)) {
new_edge = list_entry(new_node->lower.next,
- struct backref_edge, list[UPPER]);
+ struct btrfs_backref_edge, list[UPPER]);
list_del(&new_edge->list[UPPER]);
- free_backref_edge(cache, new_edge);
+ btrfs_backref_free_edge(cache, new_edge);
}
- free_backref_node(cache, new_node);
+ btrfs_backref_free_node(cache, new_node);
return -ENOMEM;
}
@@ -1302,8 +634,8 @@
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
btrfs_panic(fs_info, -EEXIST,
@@ -1325,11 +657,12 @@
struct rb_node *rb_node;
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
+ bool put_ref = false;
if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->commit_root->start);
+ rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
@@ -1339,9 +672,22 @@
ASSERT(!node || (struct btrfs_root *)node->data == root);
}
+ /*
+ * We only put the reloc root here if it's on the list. There's a lot
+ * of places where the pattern is to splice the rc->reloc_roots, process
+ * the reloc roots, and then add the reloc root back onto
+ * rc->reloc_roots. If we call __del_reloc_root while it's off of the
+ * list we don't want the reference being dropped, because the guy
+ * messing with the list is in charge of the reference.
+ */
spin_lock(&fs_info->trans_lock);
- list_del_init(&root->root_list);
+ if (!list_empty(&root->root_list)) {
+ put_ref = true;
+ list_del_init(&root->root_list);
+ }
spin_unlock(&fs_info->trans_lock);
+ if (put_ref)
+ btrfs_put_root(root);
kfree(node);
}
@@ -1357,8 +703,8 @@
struct reloc_control *rc = fs_info->reloc_ctl;
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->commit_root->start);
+ rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
@@ -1371,11 +717,11 @@
spin_lock(&rc->reloc_root_tree.lock);
node->bytenr = root->node->start;
- rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, node->bytenr);
+ btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
return 0;
}
@@ -1387,10 +733,12 @@
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
- int ret;
+ int ret = 0;
+ bool must_abort = false;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
- BUG_ON(!root_item);
+ if (!root_item)
+ return ERR_PTR(-ENOMEM);
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1402,7 +750,9 @@
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
+
/*
* Set the last_snapshot field to the generation of the commit
* root - like this ctree.c:btrfs_block_can_be_shared() behaves
@@ -1423,9 +773,16 @@
*/
ret = btrfs_copy_root(trans, root, root->node, &eb,
BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
}
+ /*
+ * We have changed references at this point, we must abort the
+ * transaction if anything fails.
+ */
+ must_abort = true;
+
memcpy(root_item, &root->root_item, sizeof(*root_item));
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
@@ -1443,18 +800,33 @@
ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
+
kfree(root_item);
- reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
- BUG_ON(IS_ERR(reloc_root));
+ reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
+ if (IS_ERR(reloc_root)) {
+ ret = PTR_ERR(reloc_root);
+ goto abort;
+ }
+ set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
reloc_root->last_trans = trans->transid;
return reloc_root;
+fail:
+ kfree(root_item);
+abort:
+ if (must_abort)
+ btrfs_abort_transaction(trans, ret);
+ return ERR_PTR(ret);
}
/*
* create reloc tree for a given fs tree. reloc tree is just a
* snapshot of the fs tree with special root objectid.
+ *
+ * The reloc_root comes out of here with two references, one for
+ * root->reloc_root, and another for being on the rc->reloc_roots list.
*/
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -1509,7 +881,7 @@
ret = __add_reloc_root(reloc_root);
BUG_ON(ret < 0);
- root->reloc_root = reloc_root;
+ root->reloc_root = btrfs_grab_root(reloc_root);
return 0;
}
@@ -1525,11 +897,18 @@
int ret;
if (!have_reloc_root(root))
- goto out;
+ return 0;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
+ /*
+ * We are probably ok here, but __del_reloc_root() will drop its ref of
+ * the root. We have the ref for root->reloc_root, but just in case
+ * hold it while we update the reloc root.
+ */
+ btrfs_grab_root(reloc_root);
+
/* root->reloc_root will stay until current relocation finished */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
@@ -1551,10 +930,8 @@
ret = btrfs_update_root(trans, fs_info->tree_root,
&reloc_root->root_key, root_item);
- BUG_ON(ret);
-
-out:
- return 0;
+ btrfs_put_root(reloc_root);
+ return ret;
}
/*
@@ -1611,15 +988,6 @@
return NULL;
}
-static int in_block_group(u64 bytenr,
- struct btrfs_block_group_cache *block_group)
-{
- if (bytenr >= block_group->key.objectid &&
- bytenr < block_group->key.objectid + block_group->key.offset)
- return 1;
- return 0;
-}
-
/*
* get new location of data
*/
@@ -1717,7 +1085,8 @@
num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
if (bytenr == 0)
continue;
- if (!in_block_group(bytenr, rc->block_group))
+ if (!in_range(bytenr, rc->block_group->start,
+ rc->block_group->length))
continue;
/*
@@ -1855,7 +1224,8 @@
}
if (cow) {
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1873,7 +1243,7 @@
level = btrfs_header_level(parent);
ASSERT(level >= lowest_level);
- ret = btrfs_bin_search(parent, &key, level, &slot);
+ ret = btrfs_bin_search(parent, &key, &slot);
if (ret < 0)
break;
if (ret && slot > 0)
@@ -1923,7 +1293,8 @@
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
+ slot, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -2227,7 +1598,7 @@
btrfs_update_reloc_root(trans, root);
if (list_empty(&root->reloc_dirty_list)) {
- btrfs_grab_fs_root(root);
+ btrfs_grab_root(root);
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
}
}
@@ -2247,24 +1618,34 @@
list_del_init(&root->reloc_dirty_list);
root->reloc_root = NULL;
- if (reloc_root) {
-
- ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
- if (ret2 < 0 && !ret)
- ret = ret2;
- }
/*
* Need barrier to ensure clear_bit() only happens after
* root->reloc_root = NULL. Pairs with have_reloc_root.
*/
smp_wmb();
clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
- btrfs_put_fs_root(root);
+ if (reloc_root) {
+ /*
+ * btrfs_drop_snapshot drops our ref we hold for
+ * ->reloc_root. If it fails however we must
+ * drop the ref ourselves.
+ */
+ ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
+ if (ret2 < 0) {
+ btrfs_put_root(reloc_root);
+ if (!ret)
+ ret = ret2;
+ }
+ }
+ btrfs_put_root(root);
} else {
/* Orphan reloc tree, just clean it up */
- ret2 = btrfs_drop_snapshot(root, NULL, 0, 1);
- if (ret2 < 0 && !ret)
- ret = ret2;
+ ret2 = btrfs_drop_snapshot(root, 0, 1);
+ if (ret2 < 0) {
+ btrfs_put_root(root);
+ if (!ret)
+ ret = ret2;
+ }
}
}
return ret;
@@ -2303,7 +1684,7 @@
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_root_level(root_item);
- extent_buffer_get(reloc_root->node);
+ atomic_inc(&reloc_root->node->refs);
path->nodes[level] = reloc_root->node;
path->slots[level] = 0;
} else {
@@ -2422,7 +1803,8 @@
* relocated and the block is tree root.
*/
leaf = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
+ BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
if (ret < 0)
@@ -2473,7 +1855,7 @@
if (IS_ERR(trans)) {
if (!err)
btrfs_block_rsv_release(fs_info, rc->block_rsv,
- num_bytes);
+ num_bytes, NULL);
return PTR_ERR(trans);
}
@@ -2481,7 +1863,7 @@
if (num_bytes != rc->merging_rsv_size) {
btrfs_end_transaction(trans);
btrfs_block_rsv_release(fs_info, rc->block_rsv,
- num_bytes);
+ num_bytes, NULL);
goto again;
}
}
@@ -2493,7 +1875,8 @@
struct btrfs_root, root_list);
list_del_init(&reloc_root->root_list);
- root = read_fs_root(fs_info, reloc_root->root_key.offset);
+ root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
+ false);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
@@ -2506,6 +1889,7 @@
btrfs_update_reloc_root(trans, root);
list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(root);
}
list_splice(&reloc_roots, &rc->reloc_roots);
@@ -2520,17 +1904,10 @@
static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
- struct btrfs_root *reloc_root;
+ struct btrfs_root *reloc_root, *tmp;
- while (!list_empty(list)) {
- reloc_root = list_entry(list->next, struct btrfs_root,
- root_list);
+ list_for_each_entry_safe(reloc_root, tmp, list, root_list)
__del_reloc_root(reloc_root);
- free_extent_buffer(reloc_root->node);
- free_extent_buffer(reloc_root->commit_root);
- reloc_root->node = NULL;
- reloc_root->commit_root = NULL;
- }
}
static noinline_for_stack
@@ -2560,11 +1937,13 @@
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
- root = read_fs_root(fs_info, reloc_root->root_key.offset);
+ root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
+ false);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
ret = merge_reloc_root(rc, root);
+ btrfs_put_root(root);
if (ret) {
if (list_empty(&reloc_root->root_list))
list_add_tail(&reloc_root->root_list,
@@ -2573,10 +1952,13 @@
}
} else {
if (!IS_ERR(root)) {
- if (root->reloc_root == reloc_root)
+ if (root->reloc_root == reloc_root) {
root->reloc_root = NULL;
+ btrfs_put_root(reloc_root);
+ }
clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
&root->state);
+ btrfs_put_root(root);
}
list_del_init(&reloc_root->root_list);
@@ -2593,15 +1975,13 @@
out:
if (ret) {
btrfs_handle_fs_error(fs_info, ret, NULL);
- if (!list_empty(&reloc_roots))
- free_reloc_roots(&reloc_roots);
+ free_reloc_roots(&reloc_roots);
/* new reloc root may be added */
mutex_lock(&fs_info->reloc_mutex);
list_splice_init(&rc->reloc_roots, &reloc_roots);
mutex_unlock(&fs_info->reloc_mutex);
- if (!list_empty(&reloc_roots))
- free_reloc_roots(&reloc_roots);
+ free_reloc_roots(&reloc_roots);
}
/*
@@ -2637,24 +2017,27 @@
{
struct btrfs_fs_info *fs_info = reloc_root->fs_info;
struct btrfs_root *root;
+ int ret;
if (reloc_root->last_trans == trans->transid)
return 0;
- root = read_fs_root(fs_info, reloc_root->root_key.offset);
+ root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
+ ret = btrfs_record_root_in_trans(trans, root);
+ btrfs_put_root(root);
- return btrfs_record_root_in_trans(trans, root);
+ return ret;
}
static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node,
- struct backref_edge *edges[])
+ struct btrfs_backref_node *node,
+ struct btrfs_backref_edge *edges[])
{
- struct backref_node *next;
+ struct btrfs_backref_node *next;
struct btrfs_root *root;
int index = 0;
@@ -2664,7 +2047,7 @@
next = walk_up_backref(next, edges, &index);
root = next->root;
BUG_ON(!root);
- BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
+ BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
@@ -2678,10 +2061,12 @@
BUG_ON(next->new_bytenr);
BUG_ON(!list_empty(&next->list));
next->new_bytenr = root->node->start;
- next->root = root;
+ btrfs_put_root(next->root);
+ next->root = btrfs_grab_root(root);
+ ASSERT(next->root);
list_add_tail(&next->list,
&rc->backref_cache.changed);
- __mark_block_processed(rc, next);
+ mark_block_processed(rc, next);
break;
}
@@ -2706,18 +2091,21 @@
}
/*
- * select a tree root for relocation. return NULL if the block
- * is reference counted. we should use do_relocation() in this
- * case. return a tree root pointer if the block isn't reference
- * counted. return -ENOENT if the block is root of reloc tree.
+ * Select a tree root for relocation.
+ *
+ * Return NULL if the block is not shareable. We should use do_relocation() in
+ * this case.
+ *
+ * Return a tree root pointer if the block is shareable.
+ * Return -ENOENT if the block is root of reloc tree.
*/
static noinline_for_stack
-struct btrfs_root *select_one_root(struct backref_node *node)
+struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
{
- struct backref_node *next;
+ struct btrfs_backref_node *next;
struct btrfs_root *root;
struct btrfs_root *fs_root = NULL;
- struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
int index = 0;
next = node;
@@ -2727,8 +2115,8 @@
root = next->root;
BUG_ON(!root);
- /* no other choice for non-references counted tree */
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ /* No other choice for non-shareable tree */
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return root;
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
@@ -2749,12 +2137,12 @@
static noinline_for_stack
u64 calcu_metadata_size(struct reloc_control *rc,
- struct backref_node *node, int reserve)
+ struct btrfs_backref_node *node, int reserve)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct backref_node *next = node;
- struct backref_edge *edge;
- struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_backref_node *next = node;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
u64 num_bytes = 0;
int index = 0;
@@ -2772,7 +2160,7 @@
break;
edge = list_entry(next->upper.next,
- struct backref_edge, list[LOWER]);
+ struct btrfs_backref_edge, list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -2783,7 +2171,7 @@
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node)
+ struct btrfs_backref_node *node)
{
struct btrfs_root *root = rc->extent_root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -2831,14 +2219,14 @@
*/
static int do_relocation(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node,
+ struct btrfs_backref_node *node,
struct btrfs_key *key,
struct btrfs_path *path, int lowest)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct backref_node *upper;
- struct backref_edge *edge;
- struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
struct btrfs_root *root;
struct extent_buffer *eb;
u32 blocksize;
@@ -2864,8 +2252,7 @@
if (upper->eb && !upper->locked) {
if (!lowest) {
- ret = btrfs_bin_search(upper->eb, key,
- upper->level, &slot);
+ ret = btrfs_bin_search(upper->eb, key, &slot);
if (ret < 0) {
err = ret;
goto next;
@@ -2875,7 +2262,7 @@
if (node->eb->start == bytenr)
goto next;
}
- drop_node_buffer(upper);
+ btrfs_backref_drop_node_buffer(upper);
}
if (!upper->eb) {
@@ -2903,8 +2290,7 @@
slot = path->slots[upper->level];
btrfs_release_path(path);
} else {
- ret = btrfs_bin_search(upper->eb, key, upper->level,
- &slot);
+ ret = btrfs_bin_search(upper->eb, key, &slot);
if (ret < 0) {
err = ret;
goto next;
@@ -2945,7 +2331,7 @@
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
- slot, &eb);
+ slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
if (ret < 0) {
@@ -2974,15 +2360,15 @@
}
next:
if (!upper->pending)
- drop_node_buffer(upper);
+ btrfs_backref_drop_node_buffer(upper);
else
- unlock_node_buffer(upper);
+ btrfs_backref_unlock_node_buffer(upper);
if (err)
break;
}
if (!err && node->pending) {
- drop_node_buffer(node);
+ btrfs_backref_drop_node_buffer(node);
list_move_tail(&node->list, &rc->backref_cache.changed);
node->pending = 0;
}
@@ -2994,7 +2380,7 @@
static int link_to_upper(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node,
+ struct btrfs_backref_node *node,
struct btrfs_path *path)
{
struct btrfs_key key;
@@ -3008,15 +2394,15 @@
struct btrfs_path *path, int err)
{
LIST_HEAD(list);
- struct backref_cache *cache = &rc->backref_cache;
- struct backref_node *node;
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ struct btrfs_backref_node *node;
int level;
int ret;
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
while (!list_empty(&cache->pending[level])) {
node = list_entry(cache->pending[level].next,
- struct backref_node, list);
+ struct btrfs_backref_node, list);
list_move_tail(&node->list, &list);
BUG_ON(!node->pending);
@@ -3031,35 +2417,16 @@
return err;
}
-static void mark_block_processed(struct reloc_control *rc,
- u64 bytenr, u32 blocksize)
-{
- set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
- EXTENT_DIRTY);
-}
-
-static void __mark_block_processed(struct reloc_control *rc,
- struct backref_node *node)
-{
- u32 blocksize;
- if (node->level == 0 ||
- in_block_group(node->bytenr, rc->block_group)) {
- blocksize = rc->extent_root->fs_info->nodesize;
- mark_block_processed(rc, node->bytenr, blocksize);
- }
- node->processed = 1;
-}
-
/*
* mark a block and all blocks directly/indirectly reference the block
* as processed.
*/
static void update_processed_blocks(struct reloc_control *rc,
- struct backref_node *node)
+ struct btrfs_backref_node *node)
{
- struct backref_node *next = node;
- struct backref_edge *edge;
- struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_backref_node *next = node;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
int index = 0;
while (next) {
@@ -3068,13 +2435,13 @@
if (next->processed)
break;
- __mark_block_processed(rc, next);
+ mark_block_processed(rc, next);
if (list_empty(&next->upper))
break;
edge = list_entry(next->upper.next,
- struct backref_edge, list[LOWER]);
+ struct btrfs_backref_edge, list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -3097,7 +2464,6 @@
{
struct extent_buffer *eb;
- BUG_ON(block->key_ready);
eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
block->level, NULL);
if (IS_ERR(eb)) {
@@ -3120,7 +2486,7 @@
*/
static int relocate_tree_block(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node,
+ struct btrfs_backref_node *node,
struct btrfs_key *key,
struct btrfs_path *path)
{
@@ -3130,6 +2496,14 @@
if (!node)
return 0;
+ /*
+ * If we fail here we want to drop our backref_node because we are going
+ * to start over and regenerate the tree for it.
+ */
+ ret = reserve_metadata_space(trans, rc, node);
+ if (ret)
+ goto out;
+
BUG_ON(node->processed);
root = select_one_root(node);
if (root == ERR_PTR(-ENOENT)) {
@@ -3137,20 +2511,16 @@
goto out;
}
- if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
- ret = reserve_metadata_space(trans, rc, node);
- if (ret)
- goto out;
- }
-
if (root) {
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
BUG_ON(node->new_bytenr);
BUG_ON(!list_empty(&node->list));
btrfs_record_root_in_trans(trans, root);
root = root->reloc_root;
node->new_bytenr = root->node->start;
- node->root = root;
+ btrfs_put_root(node->root);
+ node->root = btrfs_grab_root(root);
+ ASSERT(node->root);
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
@@ -3166,7 +2536,7 @@
}
out:
if (ret || node->level == 0 || node->cowonly)
- remove_backref_node(&rc->backref_cache, node);
+ btrfs_backref_cleanup_node(&rc->backref_cache, node);
return ret;
}
@@ -3178,7 +2548,7 @@
struct reloc_control *rc, struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct backref_node *node;
+ struct btrfs_backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
struct tree_block *next;
@@ -3232,58 +2602,50 @@
return err;
}
-static noinline_for_stack
-int prealloc_file_extent_cluster(struct inode *inode,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int prealloc_file_extent_cluster(
+ struct btrfs_inode *inode,
+ struct file_extent_cluster *cluster)
{
u64 alloc_hint = 0;
u64 start;
u64 end;
- u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 offset = inode->index_cnt;
u64 num_bytes;
- int nr = 0;
+ int nr;
int ret = 0;
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
- u64 cur_offset;
- struct extent_changeset *data_reserved = NULL;
+ u64 cur_offset = prealloc_start;
BUG_ON(cluster->start != cluster->boundary[0]);
- inode_lock(inode);
-
- ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
- prealloc_end + 1 - prealloc_start);
+ ret = btrfs_alloc_data_chunk_ondemand(inode,
+ prealloc_end + 1 - prealloc_start);
if (ret)
- goto out;
+ return ret;
- cur_offset = prealloc_start;
- while (nr < cluster->nr) {
+ inode_lock(&inode->vfs_inode);
+ for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
end = cluster->boundary[nr + 1] - 1 - offset;
else
end = cluster->end - offset;
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ lock_extent(&inode->io_tree, start, end);
num_bytes = end + 1 - start;
- if (cur_offset < start)
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, start - cur_offset);
- ret = btrfs_prealloc_file_range(inode, 0, start,
+ ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end);
if (ret)
break;
- nr++;
}
+ inode_unlock(&inode->vfs_inode);
+
if (cur_offset < prealloc_end)
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, prealloc_end + 1 - cur_offset);
-out:
- inode_unlock(inode);
- extent_changeset_free(data_reserved);
+ btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
+ prealloc_end + 1 - cur_offset);
return ret;
}
@@ -3291,7 +2653,6 @@
int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
u64 block_start)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret = 0;
@@ -3304,7 +2665,6 @@
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
- em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
@@ -3322,6 +2682,16 @@
return ret;
}
+/*
+ * Allow error injection to test balance cancellation
+ */
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ fatal_signal_pending(current);
+}
+ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
+
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
@@ -3344,7 +2714,7 @@
if (!ra)
return -ENOMEM;
- ret = prealloc_file_extent_cluster(inode, cluster);
+ ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
if (ret)
goto out;
@@ -3416,8 +2786,8 @@
nr++;
}
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
- NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
+ page_end, 0, NULL);
if (ret) {
unlock_page(page);
put_page(page);
@@ -3443,6 +2813,10 @@
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
+ if (btrfs_should_cancel_balance(fs_info)) {
+ ret = -ECANCELED;
+ goto out;
+ }
}
WARN_ON(nr != cluster->nr);
out:
@@ -3534,9 +2908,10 @@
block->level = level;
block->key_ready = 0;
- rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+ rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, block->bytenr);
+ btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
+ -EEXIST);
return 0;
}
@@ -3557,7 +2932,7 @@
if (tree_block_processed(bytenr, rc))
return 0;
- if (tree_search(blocks, bytenr))
+ if (rb_simple_search(blocks, bytenr))
return 0;
path = btrfs_alloc_path();
@@ -3614,37 +2989,11 @@
return ret;
}
-/*
- * helper to check if the block use full backrefs for pointers in it
- */
-static int block_use_full_backref(struct reloc_control *rc,
- struct extent_buffer *eb)
-{
- u64 flags;
- int ret;
-
- if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
- btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
- return 1;
-
- ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
- eb->start, btrfs_header_level(eb), 1,
- NULL, &flags);
- BUG_ON(ret);
-
- if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = 1;
- else
- ret = 0;
- return ret;
-}
-
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct inode *inode,
u64 ino)
{
- struct btrfs_key key;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int ret = 0;
@@ -3652,11 +3001,7 @@
if (inode)
goto truncate;
- key.objectid = ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
-
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, ino, root);
if (IS_ERR(inode))
return -ENOENT;
@@ -3682,172 +3027,45 @@
}
/*
- * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
- * this function scans fs tree to find blocks reference the data extent
+ * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
+ * cache inode, to avoid free space cache data extent blocking data relocation.
*/
-static int find_data_references(struct reloc_control *rc,
- struct btrfs_key *extent_key,
- struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref,
- struct rb_root *blocks)
+static int delete_v1_space_cache(struct extent_buffer *leaf,
+ struct btrfs_block_group *block_group,
+ u64 data_bytenr)
{
- struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct btrfs_path *path;
- struct tree_block *block;
- struct btrfs_root *root;
- struct btrfs_file_extent_item *fi;
- struct rb_node *rb_node;
+ u64 space_cache_ino;
+ struct btrfs_file_extent_item *ei;
struct btrfs_key key;
- u64 ref_root;
- u64 ref_objectid;
- u64 ref_offset;
- u32 ref_count;
- u32 nritems;
- int err = 0;
- int added = 0;
- int counted;
+ bool found = false;
+ int i;
int ret;
- ref_root = btrfs_extent_data_ref_root(leaf, ref);
- ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
- ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
- ref_count = btrfs_extent_data_ref_count(leaf, ref);
+ if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
+ return 0;
- /*
- * This is an extent belonging to the free space cache, lets just delete
- * it and redo the search.
- */
- if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
- ret = delete_block_group_cache(fs_info, rc->block_group,
- NULL, ref_objectid);
- if (ret != -ENOENT)
- return ret;
- ret = 0;
- }
+ for (i = 0; i < btrfs_header_nritems(leaf); i++) {
+ u8 type;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = READA_FORWARD;
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, ei);
- root = read_fs_root(fs_info, ref_root);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
-
- key.objectid = ref_objectid;
- key.type = BTRFS_EXTENT_DATA_KEY;
- if (ref_offset > ((u64)-1 << 32))
- key.offset = 0;
- else
- key.offset = ref_offset;
-
- path->search_commit_root = 1;
- path->skip_locking = 1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- /*
- * the references in tree blocks that use full backrefs
- * are not counted in
- */
- if (block_use_full_backref(rc, leaf))
- counted = 0;
- else
- counted = 1;
- rb_node = tree_search(blocks, leaf->start);
- if (rb_node) {
- if (counted)
- added = 1;
- else
- path->slots[0] = nritems;
- }
-
- while (ref_count > 0) {
- while (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (WARN_ON(ret > 0))
- goto out;
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- added = 0;
-
- if (block_use_full_backref(rc, leaf))
- counted = 0;
- else
- counted = 1;
- rb_node = tree_search(blocks, leaf->start);
- if (rb_node) {
- if (counted)
- added = 1;
- else
- path->slots[0] = nritems;
- }
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (WARN_ON(key.objectid != ref_objectid ||
- key.type != BTRFS_EXTENT_DATA_KEY))
+ if ((type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) &&
+ btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
+ found = true;
+ space_cache_ino = key.objectid;
break;
-
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- goto next;
-
- if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
- extent_key->objectid)
- goto next;
-
- key.offset -= btrfs_file_extent_offset(leaf, fi);
- if (key.offset != ref_offset)
- goto next;
-
- if (counted)
- ref_count--;
- if (added)
- goto next;
-
- if (!tree_block_processed(leaf->start, rc)) {
- block = kmalloc(sizeof(*block), GFP_NOFS);
- if (!block) {
- err = -ENOMEM;
- break;
- }
- block->bytenr = leaf->start;
- btrfs_item_key_to_cpu(leaf, &block->key, 0);
- block->level = 0;
- block->key_ready = 1;
- rb_node = tree_insert(blocks, block->bytenr,
- &block->rb_node);
- if (rb_node)
- backref_tree_panic(rb_node, -EEXIST,
- block->bytenr);
}
- if (counted)
- added = 1;
- else
- path->slots[0] = nritems;
-next:
- path->slots[0]++;
-
}
-out:
- btrfs_free_path(path);
- return err;
+ if (!found)
+ return -ENOENT;
+ ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
+ space_cache_ino);
+ return ret;
}
/*
@@ -3859,91 +3077,41 @@
struct btrfs_path *path,
struct rb_root *blocks)
{
- struct btrfs_key key;
- struct extent_buffer *eb;
- struct btrfs_extent_data_ref *dref;
- struct btrfs_extent_inline_ref *iref;
- unsigned long ptr;
- unsigned long end;
- u32 blocksize = rc->extent_root->fs_info->nodesize;
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct ulist *leaves = NULL;
+ struct ulist_iterator leaf_uiter;
+ struct ulist_node *ref_node = NULL;
+ const u32 blocksize = fs_info->nodesize;
int ret = 0;
- int err = 0;
- eb = path->nodes[0];
- ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
- ptr += sizeof(struct btrfs_extent_item);
-
- while (ptr < end) {
- iref = (struct btrfs_extent_inline_ref *)ptr;
- key.type = btrfs_get_extent_inline_ref_type(eb, iref,
- BTRFS_REF_TYPE_DATA);
- if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
- key.offset = btrfs_extent_inline_ref_offset(eb, iref);
- ret = __add_tree_block(rc, key.offset, blocksize,
- blocks);
- } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = (struct btrfs_extent_data_ref *)(&iref->offset);
- ret = find_data_references(rc, extent_key,
- eb, dref, blocks);
- } else {
- ret = -EUCLEAN;
- btrfs_err(rc->extent_root->fs_info,
- "extent %llu slot %d has an invalid inline ref type",
- eb->start, path->slots[0]);
- }
- if (ret) {
- err = ret;
- goto out;
- }
- ptr += btrfs_extent_inline_ref_size(key.type);
- }
- WARN_ON(ptr > end);
-
- while (1) {
- cond_resched();
- eb = path->nodes[0];
- if (path->slots[0] >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(rc->extent_root, path);
- if (ret < 0) {
- err = ret;
- break;
- }
- if (ret > 0)
- break;
- eb = path->nodes[0];
- }
-
- btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
- if (key.objectid != extent_key->objectid)
- break;
-
- if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
- ret = __add_tree_block(rc, key.offset, blocksize,
- blocks);
- } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = btrfs_item_ptr(eb, path->slots[0],
- struct btrfs_extent_data_ref);
- ret = find_data_references(rc, extent_key,
- eb, dref, blocks);
- } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
- btrfs_print_v0_err(eb->fs_info);
- btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
- ret = -EINVAL;
- } else {
- ret = 0;
- }
- if (ret) {
- err = ret;
- break;
- }
- path->slots[0]++;
- }
-out:
btrfs_release_path(path);
- if (err)
+ ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
+ 0, &leaves, NULL, true);
+ if (ret < 0)
+ return ret;
+
+ ULIST_ITER_INIT(&leaf_uiter);
+ while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
+ struct extent_buffer *eb;
+
+ eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ break;
+ }
+ ret = delete_v1_space_cache(eb, rc->block_group,
+ extent_key->objectid);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ break;
+ ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
+ if (ret < 0)
+ break;
+ }
+ if (ret < 0)
free_block_list(blocks);
- return err;
+ ulist_free(leaves);
+ return ret;
}
/*
@@ -3959,7 +3127,7 @@
u64 start, end, last;
int ret;
- last = rc->block_group->key.objectid + rc->block_group->key.offset;
+ last = rc->block_group->start + rc->block_group->length;
while (1) {
cond_resched();
if (rc->search_start >= last) {
@@ -4076,7 +3244,7 @@
return -ENOMEM;
memset(&rc->cluster, 0, sizeof(rc->cluster));
- rc->search_start = rc->block_group->key.objectid;
+ rc->search_start = rc->block_group->start;
rc->extents_found = 0;
rc->nodes_relocated = 0;
rc->merging_rsv_size = 0;
@@ -4218,6 +3386,10 @@
break;
}
}
+ if (btrfs_should_cancel_balance(fs_info)) {
+ err = -ECANCELED;
+ break;
+ }
}
if (trans && progress && err == -ENOSPC) {
ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
@@ -4246,16 +3418,24 @@
rc->create_reloc_tree = 0;
set_reloc_control(rc);
- backref_cache_cleanup(&rc->backref_cache);
- btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
+ btrfs_backref_release_cache(&rc->backref_cache);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
+ /*
+ * Even in the case when the relocation is cancelled, we should all go
+ * through prepare_to_merge() and merge_reloc_roots().
+ *
+ * For error (including cancelled balance), prepare_to_merge() will
+ * mark all reloc trees orphan, then queue them for cleanup in
+ * merge_reloc_roots()
+ */
err = prepare_to_merge(rc, err);
merge_reloc_roots(rc);
rc->merge_reloc_tree = 0;
unset_reloc_control(rc);
- btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
/* get rid of pinned extents */
trans = btrfs_join_transaction(rc->extent_root);
@@ -4309,22 +3489,20 @@
*/
static noinline_for_stack
struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *group)
+ struct btrfs_block_group *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
- struct btrfs_key key;
u64 objectid;
int err = 0;
- root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
- if (IS_ERR(root))
- return ERR_CAST(root);
-
+ root = btrfs_grab_root(fs_info->data_reloc_root);
trans = btrfs_start_transaction(root, 6);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ btrfs_put_root(root);
return ERR_CAST(trans);
+ }
err = btrfs_find_free_objectid(root, &objectid);
if (err)
@@ -4333,15 +3511,13 @@
err = __insert_orphan_inode(trans, root, objectid);
BUG_ON(err);
- key.objectid = objectid;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, objectid, root);
BUG_ON(IS_ERR(inode));
- BTRFS_I(inode)->index_cnt = group->key.objectid;
+ BTRFS_I(inode)->index_cnt = group->start;
err = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
+ btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (err) {
@@ -4362,7 +3538,7 @@
INIT_LIST_HEAD(&rc->reloc_roots);
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
- backref_cache_init(&rc->backref_cache);
+ btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(fs_info, &rc->processed_blocks,
IO_TREE_RELOC_BLOCKS, NULL);
@@ -4385,7 +3561,7 @@
* Print the block group being relocated
*/
static void describe_relocation(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
char buf[128] = {'\0'};
@@ -4393,7 +3569,16 @@
btrfs_info(fs_info,
"relocating block group %llu flags %s",
- block_group->key.objectid, buf);
+ block_group->start, buf);
+}
+
+static const char *stage_to_string(int stage)
+{
+ if (stage == MOVE_DATA_EXTENTS)
+ return "move data extents";
+ if (stage == UPDATE_DATA_PTRS)
+ return "update data pointers";
+ return "unknown";
}
/*
@@ -4401,7 +3586,7 @@
*/
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
struct btrfs_root *extent_root = fs_info->extent_root;
struct reloc_control *rc;
struct inode *inode;
@@ -4466,16 +3651,19 @@
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
btrfs_wait_ordered_roots(fs_info, U64_MAX,
- rc->block_group->key.objectid,
- rc->block_group->key.offset);
+ rc->block_group->start,
+ rc->block_group->length);
while (1) {
+ int finishes_stage;
+
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0)
err = ret;
+ finishes_stage = rc->stage;
/*
* We may have gotten ENOSPC after we already dirtied some
* extents. If writeout happens while we're relocating a
@@ -4501,13 +3689,13 @@
if (rc->extents_found == 0)
break;
- btrfs_info(fs_info, "found %llu extents", rc->extents_found);
-
+ btrfs_info(fs_info, "found %llu extents, stage: %s",
+ rc->extents_found, stage_to_string(finishes_stage));
}
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
- WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
+ WARN_ON(rc->block_group->used > 0);
out:
if (err && rw)
btrfs_dec_block_group_ro(rc->block_group);
@@ -4589,17 +3777,18 @@
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_fs_root(root, &key);
+ reloc_root = btrfs_read_tree_root(root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
}
+ set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
list_add(&reloc_root->root_list, &reloc_roots);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- fs_root = read_fs_root(fs_info,
- reloc_root->root_key.offset);
+ fs_root = btrfs_get_fs_root(fs_info,
+ reloc_root->root_key.offset, false);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
if (ret != -ENOENT) {
@@ -4611,6 +3800,8 @@
err = ret;
goto out;
}
+ } else {
+ btrfs_put_root(fs_root);
}
}
@@ -4653,7 +3844,8 @@
continue;
}
- fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
+ fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
+ false);
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
list_add_tail(&reloc_root->root_list, &reloc_roots);
@@ -4663,7 +3855,8 @@
err = __add_reloc_root(reloc_root);
BUG_ON(err < 0); /* -ENOMEM or logic error */
- fs_root->reloc_root = reloc_root;
+ fs_root->reloc_root = btrfs_grab_root(reloc_root);
+ btrfs_put_root(fs_root);
}
err = btrfs_commit_transaction(trans);
@@ -4688,18 +3881,16 @@
unset_reloc_control(rc);
free_reloc_control(rc);
out:
- if (!list_empty(&reloc_roots))
- free_reloc_roots(&reloc_roots);
+ free_reloc_roots(&reloc_roots);
btrfs_free_path(path);
if (err == 0) {
/* cleanup orphan inode in data relocation tree */
- fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
- if (IS_ERR(fs_root))
- err = PTR_ERR(fs_root);
- else
- err = btrfs_orphan_cleanup(fs_root);
+ fs_root = btrfs_grab_root(fs_info->data_reloc_root);
+ ASSERT(fs_root);
+ err = btrfs_orphan_cleanup(fs_root);
+ btrfs_put_root(fs_root);
}
return err;
}
@@ -4710,9 +3901,9 @@
* cloning checksum properly handles the nodatasum extents.
* it also saves CPU time to re-calculate the checksum.
*/
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered;
int ret;
@@ -4721,9 +3912,9 @@
LIST_HEAD(list);
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
- BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+ BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+ disk_bytenr = file_pos + inode->index_cnt;
ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
if (ret)
@@ -4745,7 +3936,7 @@
* disk_len vs real len like with real inodes since it's all
* disk length.
*/
- new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
+ new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
sums->bytenr = new_bytenr;
btrfs_add_ordered_sum(ordered, sums);
@@ -4761,7 +3952,7 @@
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct reloc_control *rc;
- struct backref_node *node;
+ struct btrfs_backref_node *node;
int first_cow = 0;
int level;
int ret = 0;
@@ -4786,8 +3977,8 @@
BUG_ON(node->bytenr != buf->start &&
node->new_bytenr != buf->start);
- drop_node_buffer(node);
- extent_buffer_get(cow);
+ btrfs_backref_drop_node_buffer(node);
+ atomic_inc(&cow->refs);
node->eb = cow;
node->new_bytenr = cow->start;
@@ -4798,7 +3989,7 @@
}
if (first_cow)
- __mark_block_processed(rc, node);
+ mark_block_processed(rc, node);
if (first_cow && level > 0)
rc->nodes_relocated += buf->len;
@@ -4843,6 +4034,10 @@
/*
* called after snapshot is created. migrate block reservation
* and create reloc root for the newly created snapshot
+ *
+ * This is similar to btrfs_init_reloc_root(), we come out of here with two
+ * references held on the reloc_root, one for root->reloc_root and one for
+ * rc->reloc_roots.
*/
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending)
@@ -4875,7 +4070,7 @@
ret = __add_reloc_root(reloc_root);
BUG_ON(ret < 0);
- new_root->reloc_root = reloc_root;
+ new_root->reloc_root = btrfs_grab_root(reloc_root);
if (rc->create_reloc_tree)
ret = clone_backref_node(trans, rc, root, reloc_root);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 612411c..db37a37 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,7 +22,6 @@
static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
- uuid_le uuid;
u32 len;
int need_reset = 0;
@@ -44,8 +43,7 @@
sizeof(*item) - offsetof(struct btrfs_root_item,
generation_v2));
- uuid_le_gen(&uuid);
- memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(item->uuid);
}
}
@@ -212,7 +210,6 @@
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_key root_key;
struct btrfs_root *root;
int err = 0;
int ret;
@@ -225,10 +222,9 @@
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = 0;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
-
while (1) {
+ u64 root_objectid;
+
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
if (ret < 0) {
err = ret;
@@ -252,28 +248,10 @@
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- root_key.objectid = key.offset;
+ root_objectid = key.offset;
key.offset++;
- /*
- * The root might have been inserted already, as before we look
- * for orphan roots, log replay might have happened, which
- * triggers a transaction commit and qgroup accounting, which
- * in turn reads and inserts fs roots while doing backref
- * walking.
- */
- root = btrfs_lookup_fs_root(fs_info, root_key.objectid);
- if (root) {
- WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
- &root->state));
- if (btrfs_root_refs(&root->root_item) == 0) {
- set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
- btrfs_add_dead_root(root);
- }
- continue;
- }
-
- root = btrfs_read_fs_root(tree_root, &root_key);
+ root = btrfs_get_fs_root(fs_info, root_objectid, false);
err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
@@ -290,7 +268,7 @@
break;
}
err = btrfs_del_orphan_item(trans, tree_root,
- root_key.objectid);
+ root_objectid);
btrfs_end_transaction(trans);
if (err) {
btrfs_handle_fs_error(fs_info, err,
@@ -300,25 +278,12 @@
continue;
}
- err = btrfs_init_fs_root(root);
- if (err) {
- btrfs_free_fs_root(root);
- break;
- }
-
- set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
-
- err = btrfs_insert_fs_root(fs_info, root);
- if (err) {
- BUG_ON(err == -EEXIST);
- btrfs_free_fs_root(root);
- break;
- }
-
+ WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
+ btrfs_put_root(root);
}
btrfs_free_path(path);
@@ -371,7 +336,8 @@
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto out;
if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -547,11 +513,20 @@
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ if (!ret) {
+ spin_lock(&rsv->lock);
+ rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+ spin_unlock(&rsv->lock);
+ }
return ret;
}
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 qgroup_to_release;
+
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e5db948..0392c55 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -8,6 +8,7 @@
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
+#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
@@ -148,7 +149,7 @@
*/
unsigned long *ebitmap;
- unsigned long bitmap[0];
+ unsigned long bitmap[];
};
struct scrub_ctx {
@@ -389,8 +390,7 @@
*
* Caller must ensure @cache is a RAID56 block group.
*/
-static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
- u64 bytenr)
+static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
{
u64 ret;
@@ -404,8 +404,8 @@
* round_down() can only handle power of 2, while RAID56 full
* stripe length can be 64KiB * n, so we need to manually round down.
*/
- ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
- cache->full_stripe_len + cache->key.objectid;
+ ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
+ cache->full_stripe_len + cache->start;
return ret;
}
@@ -423,7 +423,7 @@
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool *locked_ret)
{
- struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *existing;
u64 fstripe_start;
@@ -470,7 +470,7 @@
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool locked)
{
- struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *fstripe_lock;
u64 fstripe_start;
@@ -647,13 +647,9 @@
struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
- struct btrfs_key root_key;
struct btrfs_key key;
- root_key.objectid = root;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
- local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ local_root = btrfs_get_fs_root(fs_info, root, true);
if (IS_ERR(local_root)) {
ret = PTR_ERR(local_root);
goto err;
@@ -668,6 +664,7 @@
ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
if (ret) {
+ btrfs_put_root(local_root);
btrfs_release_path(swarn->path);
goto err;
}
@@ -688,6 +685,7 @@
ipath = init_ipath(4096, local_root, swarn->path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ipath)) {
+ btrfs_put_root(local_root);
ret = PTR_ERR(ipath);
ipath = NULL;
goto err;
@@ -711,6 +709,7 @@
min(isize - offset, (u64)PAGE_SIZE), nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
+ btrfs_put_root(local_root);
free_ipath(ipath);
return 0;
@@ -836,7 +835,7 @@
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
@@ -970,14 +969,14 @@
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -985,7 +984,7 @@
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
@@ -1617,13 +1616,9 @@
struct scrub_page *spage = sblock->pagev[page_num];
BUG_ON(spage->page == NULL);
- if (spage->io_error) {
- void *mapped_buffer = kmap_atomic(spage->page);
+ if (spage->io_error)
+ clear_page(page_address(spage->page));
- clear_page(mapped_buffer);
- flush_dcache_page(spage->page);
- kunmap_atomic(mapped_buffer);
- }
return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}
@@ -1791,42 +1786,21 @@
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
- u8 *on_disk_csum;
- struct page *page;
- void *buffer;
- u64 len;
- int index;
+ struct scrub_page *spage;
+ char *kaddr;
BUG_ON(sblock->page_count < 1);
- if (!sblock->pagev[0]->have_csum)
+ spage = sblock->pagev[0];
+ if (!spage->have_csum)
return 0;
+ kaddr = page_address(spage->page);
+
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
+ crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
- on_disk_csum = sblock->pagev[0]->csum;
- page = sblock->pagev[0]->page;
- buffer = kmap_atomic(page);
-
- len = sctx->fs_info->sectorsize;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, PAGE_SIZE);
-
- crypto_shash_update(shash, buffer, l);
- kunmap_atomic(buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- buffer = kmap_atomic(page);
- }
-
- crypto_shash_final(shash, csum);
- if (memcmp(csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(csum, spage->csum, sctx->csum_size))
sblock->checksum_error = 1;
return sblock->checksum_error;
@@ -1840,20 +1814,15 @@
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
- struct page *page;
- void *mapped_buffer;
- u64 mapped_size;
- void *p;
- u64 len;
- int index;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
+ const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
+ int i;
+ struct scrub_page *spage;
+ char *kaddr;
BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0]->page;
- mapped_buffer = kmap_atomic(page);
- h = (struct btrfs_header *)mapped_buffer;
+ spage = sblock->pagev[0];
+ kaddr = page_address(spage->page);
+ h = (struct btrfs_header *)kaddr;
memcpy(on_disk_csum, h->csum, sctx->csum_size);
/*
@@ -1861,40 +1830,29 @@
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
+ if (spage->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+ if (spage->generation != btrfs_stack_header_generation(h)) {
sblock->header_error = 1;
sblock->generation_error = 1;
}
- if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
+ if (!scrub_check_fsid(h->fsid, spage))
sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
sblock->header_error = 1;
- len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, mapped_size);
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+ PAGE_SIZE - BTRFS_CSUM_SIZE);
- crypto_shash_update(shash, p, l);
- kunmap_atomic(mapped_buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- mapped_buffer = kmap_atomic(page);
- mapped_size = PAGE_SIZE;
- p = mapped_buffer;
+ for (i = 1; i < num_pages; i++) {
+ kaddr = page_address(sblock->pagev[i]->page);
+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
crypto_shash_final(shash, calculated_csum);
@@ -1911,57 +1869,31 @@
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
- u8 on_disk_csum[BTRFS_CSUM_SIZE];
- struct page *page;
- void *mapped_buffer;
- u64 mapped_size;
- void *p;
+ struct scrub_page *spage;
+ char *kaddr;
int fail_gen = 0;
int fail_cor = 0;
- u64 len;
- int index;
+
+ BUG_ON(sblock->page_count < 1);
+ spage = sblock->pagev[0];
+ kaddr = page_address(spage->page);
+ s = (struct btrfs_super_block *)kaddr;
+
+ if (spage->logical != btrfs_super_bytenr(s))
+ ++fail_cor;
+
+ if (spage->generation != btrfs_super_generation(s))
+ ++fail_gen;
+
+ if (!scrub_check_fsid(s->fsid, spage))
+ ++fail_cor;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
+ crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
- BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0]->page;
- mapped_buffer = kmap_atomic(page);
- s = (struct btrfs_super_block *)mapped_buffer;
- memcpy(on_disk_csum, s->csum, sctx->csum_size);
-
- if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
- ++fail_cor;
-
- if (sblock->pagev[0]->generation != btrfs_super_generation(s))
- ++fail_gen;
-
- if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
- ++fail_cor;
-
- len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, mapped_size);
-
- crypto_shash_update(shash, p, l);
- kunmap_atomic(mapped_buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- mapped_buffer = kmap_atomic(page);
- mapped_size = PAGE_SIZE;
- p = mapped_buffer;
- }
-
- crypto_shash_final(shash, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(calculated_csum, s->csum, sctx->csum_size))
++fail_cor;
if (fail_cor + fail_gen) {
@@ -1974,10 +1906,10 @@
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
if (fail_cor)
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ btrfs_dev_stat_inc_and_print(spage->dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
else
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ btrfs_dev_stat_inc_and_print(spage->dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
}
@@ -3043,7 +2975,8 @@
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
- int num, u64 base, u64 length)
+ int num, u64 base, u64 length,
+ struct btrfs_block_group *cache)
{
struct btrfs_path *path, *ppath;
struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3281,6 +3214,20 @@
break;
}
+ /*
+ * If our block group was removed in the meanwhile, just
+ * stop scrubbing since there is no point in continuing.
+ * Continuing would prevent reusing its device extents
+ * for new block groups for a long time.
+ */
+ spin_lock(&cache->lock);
+ if (cache->removed) {
+ spin_unlock(&cache->lock);
+ ret = 0;
+ goto out;
+ }
+ spin_unlock(&cache->lock);
+
extent = btrfs_item_ptr(l, slot,
struct btrfs_extent_item);
flags = btrfs_extent_flags(l, extent);
@@ -3325,13 +3272,14 @@
&extent_dev,
&extent_mirror_num);
- ret = btrfs_lookup_csums_range(csum_root,
- extent_logical,
- extent_logical +
- extent_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ ret = btrfs_lookup_csums_range(csum_root,
+ extent_logical,
+ extent_logical + extent_len - 1,
+ &sctx->csum_list, 1);
+ if (ret)
+ goto out;
+ }
ret = scrub_extent(sctx, map, extent_logical, extent_len,
extent_physical, extent_dev, flags,
@@ -3417,7 +3365,7 @@
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group_cache *cache)
+ struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
@@ -3454,7 +3402,7 @@
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
ret = scrub_stripe(sctx, map, scrub_dev, i,
- chunk_offset, length);
+ chunk_offset, length, cache);
if (ret)
goto out;
}
@@ -3481,7 +3429,7 @@
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
path = btrfs_alloc_path();
@@ -3552,6 +3500,23 @@
goto skip;
/*
+ * Make sure that while we are scrubbing the corresponding block
+ * group doesn't get its logical address and its device extents
+ * reused for another block group, which can possibly be of a
+ * different type and different profile. We do this to prevent
+ * false error detections and crashes due to bogus attempts to
+ * repair extents.
+ */
+ spin_lock(&cache->lock);
+ if (cache->removed) {
+ spin_unlock(&cache->lock);
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+ btrfs_freeze_block_group(cache);
+ spin_unlock(&cache->lock);
+
+ /*
* we need call btrfs_inc_block_group_ro() with scrubs_paused,
* to avoid deadlock caused by:
* btrfs_inc_block_group_ro()
@@ -3578,68 +3543,60 @@
* This can easily boost the amount of SYSTEM chunks if cleaner
* thread can't be triggered fast enough, and use up all space
* of btrfs_super_block::sys_chunk_array
+ *
+ * While for dev replace, we need to try our best to mark block
+ * group RO, to prevent race between:
+ * - Write duplication
+ * Contains latest data
+ * - Scrub copy
+ * Contains data from commit tree
+ *
+ * If target block group is not marked RO, nocow writes can
+ * be overwritten by scrub copy, causing data corruption.
+ * So for dev-replace, it's not allowed to continue if a block
+ * group is not RO.
*/
- ret = btrfs_inc_block_group_ro(cache, false);
- if (!ret && sctx->is_dev_replace) {
- /*
- * If we are doing a device replace wait for any tasks
- * that started delalloc right before we set the block
- * group to RO mode, as they might have just allocated
- * an extent from it or decided they could do a nocow
- * write. And if any such tasks did that, wait for their
- * ordered extents to complete and then commit the
- * current transaction, so that we can later see the new
- * extent items in the extent tree - the ordered extents
- * create delayed data references (for cow writes) when
- * they complete, which will be run and insert the
- * corresponding extent items into the extent tree when
- * we commit the transaction they used when running
- * inode.c:btrfs_finish_ordered_io(). We later use
- * the commit root of the extent tree to find extents
- * to copy from the srcdev into the tgtdev, and we don't
- * want to miss any new extents.
- */
- btrfs_wait_block_group_reservations(cache);
- btrfs_wait_nocow_writers(cache);
- ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
- cache->key.objectid,
- cache->key.offset);
- if (ret > 0) {
- struct btrfs_trans_handle *trans;
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- ret = PTR_ERR(trans);
- else
- ret = btrfs_commit_transaction(trans);
- if (ret) {
- scrub_pause_off(fs_info);
- btrfs_put_block_group(cache);
- break;
- }
- }
- }
- scrub_pause_off(fs_info);
-
+ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
if (ret == 0) {
ro_set = 1;
- } else if (ret == -ENOSPC) {
+ } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
/*
* btrfs_inc_block_group_ro return -ENOSPC when it
* failed in creating new chunk for metadata.
- * It is not a problem for scrub/replace, because
+ * It is not a problem for scrub, because
* metadata are always cowed, and our scrub paused
* commit_transactions.
*/
ro_set = 0;
+ } else if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "skipping scrub of block group %llu due to active swapfile",
+ cache->start);
+ scrub_pause_off(fs_info);
+ ret = 0;
+ goto skip_unfreeze;
} else {
btrfs_warn(fs_info,
"failed setting block group ro: %d", ret);
+ btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
+ scrub_pause_off(fs_info);
break;
}
- down_write(&fs_info->dev_replace.rwsem);
+ /*
+ * Now the target block is marked RO, wait for nocow writes to
+ * finish before dev-replace.
+ * COW is fine, as COW never overwrites extents in commit tree.
+ */
+ if (sctx->is_dev_replace) {
+ btrfs_wait_nocow_writers(cache);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
+ cache->length);
+ }
+
+ scrub_pause_off(fs_info);
+ down_write(&dev_replace->rwsem);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
@@ -3680,10 +3637,10 @@
scrub_pause_off(fs_info);
- down_write(&fs_info->dev_replace.rwsem);
+ down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
- up_write(&fs_info->dev_replace.rwsem);
+ up_write(&dev_replace->rwsem);
if (ro_set)
btrfs_dec_block_group_ro(cache);
@@ -3697,13 +3654,18 @@
*/
spin_lock(&cache->lock);
if (!cache->removed && !cache->ro && cache->reserved == 0 &&
- btrfs_block_group_used(&cache->item) == 0) {
+ cache->used == 0) {
spin_unlock(&cache->lock);
- btrfs_mark_bg_unused(cache);
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ cache);
+ else
+ btrfs_mark_bg_unused(cache);
} else {
spin_unlock(&cache->lock);
}
-
+skip_unfreeze:
+ btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
if (ret)
break;
@@ -3911,8 +3873,9 @@
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
- rcu_str_deref(dev->name));
+ btrfs_err_in_rcu(fs_info,
+ "scrub on devid %llu: filesystem on %s is not writable",
+ devid, rcu_str_deref(dev->name));
ret = -EROFS;
goto out;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fc688af..6b80dee 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -122,8 +122,6 @@
struct file_ra_state ra;
- char *read_buf;
-
/*
* We process inodes by their increasing order, so if before an
* incremental send we reverse the parent/child relationship of
@@ -279,11 +277,6 @@
BTRFS_COMPARE_TREE_CHANGED,
BTRFS_COMPARE_TREE_SAME,
};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
- struct btrfs_path *right_path,
- struct btrfs_key *key,
- enum btrfs_compare_tree_result result,
- void *ctx);
__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
@@ -585,8 +578,8 @@
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
- hdr->tlv_type = cpu_to_le16(attr);
- hdr->tlv_len = cpu_to_le16(len);
+ put_unaligned_le16(attr, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
memcpy(hdr + 1, data, len);
sctx->send_size += total_len;
@@ -696,7 +689,7 @@
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->cmd = cpu_to_le16(cmd);
+ put_unaligned_le16(cmd, &hdr->cmd);
return 0;
}
@@ -708,17 +701,17 @@
u32 crc;
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
- hdr->crc = 0;
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
+ put_unaligned_le32(0, &hdr->crc);
crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
- hdr->crc = cpu_to_le32(crc);
+ put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+ sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
return ret;
@@ -4942,36 +4935,44 @@
return ret;
}
-static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
+{
+ return sctx->send_max_size - SZ_16K;
+}
+
+static int put_data_header(struct send_ctx *sctx, u32 len)
+{
+ struct btrfs_tlv_header *hdr;
+
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ return 0;
+}
+
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
struct page *page;
char *addr;
- struct btrfs_key key;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
- ssize_t ret = 0;
+ int ret;
- key.objectid = sctx->cur_ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
+ ret = put_data_header(sctx, len);
+ if (ret)
+ return ret;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (offset + len > i_size_read(inode)) {
- if (offset > i_size_read(inode))
- len = 0;
- else
- len = offset - i_size_read(inode);
- }
- if (len == 0)
- goto out;
-
last_index = (offset + len - 1) >> PAGE_SHIFT;
/* initial readahead */
@@ -5005,6 +5006,10 @@
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
+ btrfs_err(fs_info,
+ "send: IO error at offset %llu for inode %llu root %llu",
+ page_offset(page), sctx->cur_ino,
+ sctx->send_root->root_key.objectid);
put_page(page);
ret = -EIO;
break;
@@ -5012,16 +5017,16 @@
}
addr = kmap(page);
- memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset,
+ cur_len);
kunmap(page);
unlock_page(page);
put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
- ret += cur_len;
+ sctx->send_size += cur_len;
}
-out:
iput(inode);
return ret;
}
@@ -5035,7 +5040,6 @@
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- ssize_t num_read = 0;
p = fs_path_alloc();
if (!p)
@@ -5043,13 +5047,6 @@
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
- num_read = fill_read_buf(sctx, offset, len);
- if (num_read <= 0) {
- if (num_read < 0)
- ret = num_read;
- goto out;
- }
-
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
goto out;
@@ -5060,16 +5057,16 @@
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+ ret = put_file_data(sctx, offset, len);
+ if (ret < 0)
+ goto out;
ret = send_cmd(sctx);
tlv_put_failure:
out:
fs_path_free(p);
- if (ret < 0)
- return ret;
- return num_read;
+ return ret;
}
/*
@@ -5181,8 +5178,8 @@
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
+ u64 read_size = max_send_read_size(sctx);
u64 offset = sctx->cur_inode_last_extent;
- u64 len;
int ret = 0;
/*
@@ -5209,16 +5206,19 @@
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
if (ret < 0)
goto tlv_put_failure;
- memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
- len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+ u64 len = min(end - offset, read_size);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = put_data_header(sctx, len);
+ if (ret < 0)
+ break;
+ memset(sctx->send_buf + sctx->send_size, 0, len);
+ sctx->send_size += len;
ret = send_cmd(sctx);
if (ret < 0)
break;
@@ -5234,23 +5234,20 @@
const u64 offset,
const u64 len)
{
+ u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
while (sent < len) {
- u64 size = len - sent;
+ u64 size = min(len - sent, read_size);
int ret;
- if (size > BTRFS_SEND_READ_SIZE)
- size = BTRFS_SEND_READ_SIZE;
ret = send_write(sctx, offset + sent, size);
if (ret < 0)
return ret;
- if (!ret)
- break;
- sent += ret;
+ sent += size;
}
return 0;
}
@@ -5565,51 +5562,29 @@
struct clone_root *clone_root)
{
int ret = 0;
- struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 len;
- u8 type;
+ u64 end;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], ei);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- /*
- * it is possible the inline item won't cover the whole page,
- * but there may be items after this page. Make
- * sure to send the whole thing
- */
- len = PAGE_ALIGN(len);
- } else {
- len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- }
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
+ if (offset >= end)
+ return 0;
- if (offset >= sctx->cur_inode_size) {
- ret = 0;
- goto out;
- }
- if (offset + len > sctx->cur_inode_size)
- len = sctx->cur_inode_size - offset;
- if (len == 0) {
- ret = 0;
- goto out;
- }
-
- if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ if (clone_root && IS_ALIGNED(end, bs)) {
+ struct btrfs_file_extent_item *ei;
u64 disk_byte;
u64 data_offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, len);
+ offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, offset, end - offset);
}
- sctx->cur_inode_next_write_offset = offset + len;
-out:
+ sctx->cur_inode_next_write_offset = end;
return ret;
}
@@ -5807,10 +5782,7 @@
{
struct btrfs_path *path;
struct btrfs_root *root = sctx->send_root;
- struct btrfs_file_extent_item *fi;
struct btrfs_key key;
- u64 extent_end;
- u8 type;
int ret;
path = alloc_path_for_send();
@@ -5830,18 +5802,7 @@
if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
goto out;
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], fi);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
- extent_end = ALIGN(key.offset + size,
- sctx->send_root->fs_info->sectorsize);
- } else {
- extent_end = key.offset +
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
- }
- sctx->cur_inode_last_extent = extent_end;
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
out:
btrfs_free_path(path);
return ret;
@@ -5895,16 +5856,7 @@
break;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(leaf, fi);
-
- extent_end = ALIGN(key.offset + size,
- root->fs_info->sectorsize);
- } else {
- extent_end = key.offset +
- btrfs_file_extent_num_bytes(leaf, fi);
- }
+ extent_end = btrfs_file_extent_end(path);
if (extent_end <= start)
goto next;
if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
@@ -5925,9 +5877,6 @@
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
struct btrfs_key *key)
{
- struct btrfs_file_extent_item *fi;
- u64 extent_end;
- u8 type;
int ret = 0;
if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
@@ -5939,18 +5888,6 @@
return ret;
}
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], fi);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
- extent_end = ALIGN(key->offset + size,
- sctx->send_root->fs_info->sectorsize);
- } else {
- extent_end = key->offset +
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
- }
-
if (path->slots[0] == 0 &&
sctx->cur_inode_last_extent < key->offset) {
/*
@@ -5976,7 +5913,7 @@
else
ret = 0;
}
- sctx->cur_inode_last_extent = extent_end;
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
return ret;
}
@@ -6893,8 +6830,7 @@
* If it detects a change, it aborts immediately.
*/
static int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- btrfs_changed_cb_t changed_cb, void *ctx)
+ struct btrfs_root *right_root, void *ctx)
{
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
@@ -7161,8 +7097,7 @@
goto out;
if (sctx->parent_root) {
- ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
- changed_cb, sctx);
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
if (ret < 0)
goto out;
ret = finish_inode_if_needed(sctx, 1);
@@ -7284,14 +7219,12 @@
struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
- struct btrfs_key key;
struct send_ctx *sctx = NULL;
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
- unsigned alloc_size;
+ size_t alloc_size;
int sort_clone_roots = 0;
- int index;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -7329,13 +7262,6 @@
goto out;
}
- if (!access_ok(arg->clone_sources,
- sizeof(*arg->clone_sources) *
- arg->clone_sources_count)) {
- ret = -EFAULT;
- goto out;
- }
-
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
ret = -EINVAL;
goto out;
@@ -7379,25 +7305,20 @@
goto out;
}
- sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
- alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
-
- sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL);
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
+ arg->clone_sources_count + 1,
+ GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
goto out;
}
- alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+ alloc_size = array_size(sizeof(*arg->clone_sources),
+ arg->clone_sources_count);
if (arg->clone_sources_count) {
clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
@@ -7414,15 +7335,9 @@
}
for (i = 0; i < arg->clone_sources_count; i++) {
- key.objectid = clone_sources_tmp[i];
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ clone_root = btrfs_get_fs_root(fs_info,
+ clone_sources_tmp[i], true);
if (IS_ERR(clone_root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(clone_root);
goto out;
}
@@ -7430,20 +7345,19 @@
if (!btrfs_root_readonly(clone_root) ||
btrfs_root_dead(clone_root)) {
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ btrfs_put_root(clone_root);
ret = -EPERM;
goto out;
}
if (clone_root->dedupe_in_progress) {
dedupe_in_progress_warn(clone_root);
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ btrfs_put_root(clone_root);
ret = -EAGAIN;
goto out;
}
clone_root->send_in_progress++;
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
sctx->clone_roots[i].root = clone_root;
clone_sources_to_rollback = i + 1;
@@ -7453,15 +7367,9 @@
}
if (arg->parent_root) {
- key.objectid = arg->parent_root;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
+ true);
if (IS_ERR(sctx->parent_root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(sctx->parent_root);
goto out;
}
@@ -7471,20 +7379,16 @@
if (!btrfs_root_readonly(sctx->parent_root) ||
btrfs_root_dead(sctx->parent_root)) {
spin_unlock(&sctx->parent_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EPERM;
goto out;
}
if (sctx->parent_root->dedupe_in_progress) {
dedupe_in_progress_warn(sctx->parent_root);
spin_unlock(&sctx->parent_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EAGAIN;
goto out;
}
spin_unlock(&sctx->parent_root->root_item_lock);
-
- srcu_read_unlock(&fs_info->subvol_srcu, index);
}
/*
@@ -7492,7 +7396,8 @@
* is behind the current send position. This is checked while searching
* for possible clone sources.
*/
- sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
+ sctx->clone_roots[sctx->clone_roots_cnt++].root =
+ btrfs_grab_root(sctx->send_root);
/* We do a bsearch later */
sort(sctx->clone_roots, sctx->clone_roots_cnt,
@@ -7577,18 +7482,24 @@
}
if (sort_clone_roots) {
- for (i = 0; i < sctx->clone_roots_cnt; i++)
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
} else {
- for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+ for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
btrfs_root_dec_send_in_progress(send_root);
}
- if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+ if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
btrfs_root_dec_send_in_progress(sctx->parent_root);
+ btrfs_put_root(sctx->parent_root);
+ }
kvfree(clone_sources_tmp);
@@ -7598,7 +7509,6 @@
kvfree(sctx->clone_roots);
kvfree(sctx->send_buf);
- kvfree(sctx->read_buf);
name_cache_free(sctx);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index ead397f..de91488 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -13,7 +13,6 @@
#define BTRFS_SEND_STREAM_VERSION 1
#define BTRFS_SEND_BUF_SIZE SZ_64K
-#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 90500b6..69ab10c 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -10,7 +10,154 @@
#include "transaction.h"
#include "block-group.h"
-u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
+/*
+ * HOW DOES SPACE RESERVATION WORK
+ *
+ * If you want to know about delalloc specifically, there is a separate comment
+ * for that with the delalloc code. This comment is about how the whole system
+ * works generally.
+ *
+ * BASIC CONCEPTS
+ *
+ * 1) space_info. This is the ultimate arbiter of how much space we can use.
+ * There's a description of the bytes_ fields with the struct declaration,
+ * refer to that for specifics on each field. Suffice it to say that for
+ * reservations we care about total_bytes - SUM(space_info->bytes_) when
+ * determining if there is space to make an allocation. There is a space_info
+ * for METADATA, SYSTEM, and DATA areas.
+ *
+ * 2) block_rsv's. These are basically buckets for every different type of
+ * metadata reservation we have. You can see the comment in the block_rsv
+ * code on the rules for each type, but generally block_rsv->reserved is how
+ * much space is accounted for in space_info->bytes_may_use.
+ *
+ * 3) btrfs_calc*_size. These are the worst case calculations we used based
+ * on the number of items we will want to modify. We have one for changing
+ * items, and one for inserting new items. Generally we use these helpers to
+ * determine the size of the block reserves, and then use the actual bytes
+ * values to adjust the space_info counters.
+ *
+ * MAKING RESERVATIONS, THE NORMAL CASE
+ *
+ * We call into either btrfs_reserve_data_bytes() or
+ * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
+ * num_bytes we want to reserve.
+ *
+ * ->reserve
+ * space_info->bytes_may_reserve += num_bytes
+ *
+ * ->extent allocation
+ * Call btrfs_add_reserved_bytes() which does
+ * space_info->bytes_may_reserve -= num_bytes
+ * space_info->bytes_reserved += extent_bytes
+ *
+ * ->insert reference
+ * Call btrfs_update_block_group() which does
+ * space_info->bytes_reserved -= extent_bytes
+ * space_info->bytes_used += extent_bytes
+ *
+ * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
+ *
+ * Assume we are unable to simply make the reservation because we do not have
+ * enough space
+ *
+ * -> __reserve_bytes
+ * create a reserve_ticket with ->bytes set to our reservation, add it to
+ * the tail of space_info->tickets, kick async flush thread
+ *
+ * ->handle_reserve_ticket
+ * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
+ * on the ticket.
+ *
+ * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
+ * Flushes various things attempting to free up space.
+ *
+ * -> btrfs_try_granting_tickets()
+ * This is called by anything that either subtracts space from
+ * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
+ * space_info->total_bytes. This loops through the ->priority_tickets and
+ * then the ->tickets list checking to see if the reservation can be
+ * completed. If it can the space is added to space_info->bytes_may_use and
+ * the ticket is woken up.
+ *
+ * -> ticket wakeup
+ * Check if ->bytes == 0, if it does we got our reservation and we can carry
+ * on, if not return the appropriate error (ENOSPC, but can be EINTR if we
+ * were interrupted.)
+ *
+ * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
+ *
+ * Same as the above, except we add ourselves to the
+ * space_info->priority_tickets, and we do not use ticket->wait, we simply
+ * call flush_space() ourselves for the states that are safe for us to call
+ * without deadlocking and hope for the best.
+ *
+ * THE FLUSHING STATES
+ *
+ * Generally speaking we will have two cases for each state, a "nice" state
+ * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
+ * reduce the locking over head on the various trees, and even to keep from
+ * doing any work at all in the case of delayed refs. Each of these delayed
+ * things however hold reservations, and so letting them run allows us to
+ * reclaim space so we can make new reservations.
+ *
+ * FLUSH_DELAYED_ITEMS
+ * Every inode has a delayed item to update the inode. Take a simple write
+ * for example, we would update the inode item at write time to update the
+ * mtime, and then again at finish_ordered_io() time in order to update the
+ * isize or bytes. We keep these delayed items to coalesce these operations
+ * into a single operation done on demand. These are an easy way to reclaim
+ * metadata space.
+ *
+ * FLUSH_DELALLOC
+ * Look at the delalloc comment to get an idea of how much space is reserved
+ * for delayed allocation. We can reclaim some of this space simply by
+ * running delalloc, but usually we need to wait for ordered extents to
+ * reclaim the bulk of this space.
+ *
+ * FLUSH_DELAYED_REFS
+ * We have a block reserve for the outstanding delayed refs space, and every
+ * delayed ref operation holds a reservation. Running these is a quick way
+ * to reclaim space, but we want to hold this until the end because COW can
+ * churn a lot and we can avoid making some extent tree modifications if we
+ * are able to delay for as long as possible.
+ *
+ * ALLOC_CHUNK
+ * We will skip this the first time through space reservation, because of
+ * overcommit and we don't want to have a lot of useless metadata space when
+ * our worst case reservations will likely never come true.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we're freeing inodes we're likely freeing checksums, file extent
+ * items, and extent tree items. Loads of space could be freed up by these
+ * operations, however they won't be usable until the transaction commits.
+ *
+ * COMMIT_TRANS
+ * may_commit_transaction() is the ultimate arbiter on whether we commit the
+ * transaction or not. In order to avoid constantly churning we do all the
+ * above flushing first and then commit the transaction as the last resort.
+ * However we need to take into account things like pinned space that would
+ * be freed, plus any delayed work we may not have gotten rid of in the case
+ * of metadata.
+ *
+ * OVERCOMMIT
+ *
+ * Because we hold so many reservations for metadata we will allow you to
+ * reserve more space than is currently free in the currently allocate
+ * metadata space. This only happens with metadata, data does not allow
+ * overcommitting.
+ *
+ * You can see the current logic for when we allow overcommit in
+ * btrfs_can_overcommit(), but it only applies to unallocated space. If there
+ * is no unallocated space to be had, all reservations are kept within the
+ * free space in the allocated metadata chunks.
+ *
+ * Because of overcommitting, you generally want to use the
+ * btrfs_can_overcommit() logic for metadata allocations, as it does the right
+ * thing with or without extra unallocated space.
+ */
+
+u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included)
{
ASSERT(s_info);
@@ -28,10 +175,8 @@
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list)
+ list_for_each_entry(found, head, list)
found->full = 0;
- rcu_read_unlock();
}
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
@@ -58,7 +203,6 @@
spin_lock_init(&space_info->lock);
space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
- init_waitqueue_head(&space_info->wait);
INIT_LIST_HEAD(&space_info->ro_bgs);
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
@@ -67,7 +211,7 @@
if (ret)
return ret;
- list_add_rcu(&space_info->list, &info->space_info);
+ list_add(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
@@ -144,41 +288,26 @@
flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & flags) {
- rcu_read_unlock();
+ list_for_each_entry(found, head, list) {
+ if (found->flags & flags)
return found;
- }
}
- rcu_read_unlock();
return NULL;
}
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
-{
- return (global->size << 1);
-}
-
-int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush)
+static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
- u64 used;
int factor;
- /* Don't overcommit when in mixed mode. */
- if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
- return 0;
-
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
profile = btrfs_system_alloc_profile(fs_info);
else
profile = btrfs_metadata_alloc_profile(fs_info);
- used = btrfs_space_info_used(space_info, true);
avail = atomic64_read(&fs_info->free_chunk_space);
/*
@@ -199,12 +328,38 @@
avail >>= 3;
else
avail >>= 1;
+ return avail;
+}
+
+int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 avail;
+ u64 used;
+
+ /* Don't overcommit when in mixed mode */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return 0;
+
+ used = btrfs_space_info_used(space_info, true);
+ avail = calc_available_free_space(fs_info, space_info, flush);
if (used + bytes < space_info->total_bytes + avail)
return 1;
return 0;
}
+static void remove_ticket(struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ if (!list_empty(&ticket->list)) {
+ list_del_init(&ticket->list);
+ ASSERT(space_info->reclaim_size >= ticket->bytes);
+ space_info->reclaim_size -= ticket->bytes;
+ }
+}
+
/*
* This is for space we already have accounted in space_info->bytes_may_use, so
* basically when we're returning space from block_rsv's.
@@ -232,7 +387,7 @@
btrfs_space_info_update_bytes_may_use(fs_info,
space_info,
ticket->bytes);
- list_del_init(&ticket->list);
+ remove_ticket(space_info, ticket);
ticket->bytes = 0;
space_info->tickets_id++;
wake_up(&ticket->wait);
@@ -262,9 +417,10 @@
{
lockdep_assert_held(&info->lock);
- btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
+ /* The free space could be negative in case of overcommit */
+ btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
info->flags,
- info->total_bytes - btrfs_space_info_used(info, true),
+ (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
@@ -284,7 +440,7 @@
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
int index = 0;
spin_lock(&info->lock);
@@ -300,8 +456,7 @@
spin_lock(&cache->lock);
btrfs_info(fs_info,
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
- cache->key.objectid, cache->key.offset,
- btrfs_block_group_used(&cache->item), cache->pinned,
+ cache->start, cache->length, cache->used, cache->pinned,
cache->reserved, cache->ro ? "[readonly]" : "");
spin_unlock(&cache->lock);
btrfs_dump_free_space(cache, bytes);
@@ -311,28 +466,6 @@
up_read(&info->groups_sem);
}
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
- unsigned long nr_pages, int nr_items)
-{
- struct super_block *sb = fs_info->sb;
-
- if (down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- } else {
- /*
- * We needn't worry the filesystem going from r/w to r/o though
- * we don't acquire ->s_umount mutex, because the filesystem
- * should guarantee the delalloc inodes list be empty after
- * the filesystem is readonly(all dirty pages are written to
- * the disk).
- */
- btrfs_start_delalloc_roots(fs_info, nr_items);
- if (!current->journal_info)
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
- }
-}
-
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
@@ -351,25 +484,33 @@
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
- u64 orig, bool wait_ordered)
+static void shrink_delalloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 to_reclaim, bool wait_ordered)
{
- struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 dio_bytes;
- u64 async_pages;
u64 items;
long time_left;
- unsigned long nr_pages;
int loops;
/* Calc the number of the pages we need flush for space reservation */
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ if (to_reclaim == U64_MAX) {
+ items = U64_MAX;
+ } else {
+ /*
+ * to_reclaim is set to however much metadata we need to
+ * reclaim, but reclaiming that much data doesn't really track
+ * exactly, so increase the amount to reclaim by 2x in order to
+ * make sure we're flushing enough delalloc to hopefully reclaim
+ * some metadata reservations.
+ */
+ items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ }
trans = (struct btrfs_trans_handle *)current->journal_info;
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
@@ -392,44 +533,9 @@
loops = 0;
while ((delalloc_bytes || dio_bytes) && loops < 3) {
- nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+ u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
- /*
- * Triggers inode writeback for up to nr_pages. This will invoke
- * ->writepages callback and trigger delalloc filling
- * (btrfs_run_delalloc_range()).
- */
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
-
- /*
- * We need to wait for the compressed pages to start before
- * we continue.
- */
- async_pages = atomic_read(&fs_info->async_delalloc_pages);
- if (!async_pages)
- goto skip_async;
-
- /*
- * Calculate how many compressed pages we want to be written
- * before we continue. I.e if there are more async pages than we
- * require wait_event will wait until nr_pages are written.
- */
- if (async_pages <= nr_pages)
- async_pages = 0;
- else
- async_pages -= nr_pages;
-
- wait_event(fs_info->async_submit_wait,
- atomic_read(&fs_info->async_delalloc_pages) <=
- (int)async_pages);
-skip_async:
- spin_lock(&space_info->lock);
- if (list_empty(&space_info->tickets) &&
- list_empty(&space_info->priority_tickets)) {
- spin_unlock(&space_info->lock);
- break;
- }
- spin_unlock(&space_info->lock);
+ btrfs_start_delalloc_roots(fs_info, nr_pages, true);
loops++;
if (wait_ordered && !trans) {
@@ -439,6 +545,15 @@
if (time_left)
break;
}
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets) &&
+ list_empty(&space_info->priority_tickets)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
+ spin_unlock(&space_info->lock);
+
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
@@ -463,8 +578,8 @@
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *trans;
- u64 bytes_needed;
u64 reclaim_bytes = 0;
+ u64 bytes_needed = 0;
u64 cur_free_bytes = 0;
trans = (struct btrfs_trans_handle *)current->journal_info;
@@ -484,7 +599,8 @@
else if (!list_empty(&space_info->tickets))
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- bytes_needed = (ticket) ? ticket->bytes : 0;
+ if (ticket)
+ bytes_needed = ticket->bytes;
if (bytes_needed > cur_free_bytes)
bytes_needed -= cur_free_bytes;
@@ -511,8 +627,10 @@
goto commit;
/*
- * See if there is some space in the delayed insertion reservation for
- * this reservation.
+ * See if there is some space in the delayed insertion reserve for this
+ * reservation. If the space_info's don't match (like for DATA or
+ * SYSTEM) then just go enospc, reclaiming this space won't recover any
+ * space to satisfy those reservations.
*/
if (space_info != delayed_rsv->space_info)
goto enospc;
@@ -577,7 +695,7 @@
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
+ shrink_delalloc(fs_info, space_info, num_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case FLUSH_DELAYED_REFS_NR:
@@ -602,7 +720,7 @@
break;
}
ret = btrfs_chunk_alloc(trans,
- btrfs_metadata_alloc_profile(fs_info),
+ btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
@@ -635,15 +753,26 @@
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
- struct reserve_ticket *ticket;
u64 used;
+ u64 avail;
u64 expected;
- u64 to_reclaim = 0;
+ u64 to_reclaim = space_info->reclaim_size;
- list_for_each_entry(ticket, &space_info->tickets, list)
- to_reclaim += ticket->bytes;
- list_for_each_entry(ticket, &space_info->priority_tickets, list)
- to_reclaim += ticket->bytes;
+ lockdep_assert_held(&space_info->lock);
+
+ avail = calc_available_free_space(fs_info, space_info,
+ BTRFS_RESERVE_FLUSH_ALL);
+ used = btrfs_space_info_used(space_info, true);
+
+ /*
+ * We may be flushing because suddenly we have less space than we had
+ * before, and now we're well over-committed based on our current free
+ * space. If that's the case add in our overage so we make sure to put
+ * appropriate pressure on the flushing state machine.
+ */
+ if (space_info->total_bytes + avail < used)
+ to_reclaim += used - (space_info->total_bytes + avail);
+
if (to_reclaim)
return to_reclaim;
@@ -697,14 +826,14 @@
return false;
spin_lock(&global_rsv->lock);
- min_bytes = div_factor(global_rsv->size, 5);
+ min_bytes = div_factor(global_rsv->size, 1);
if (global_rsv->reserved < min_bytes + ticket->bytes) {
spin_unlock(&global_rsv->lock);
return false;
}
global_rsv->reserved -= ticket->bytes;
+ remove_ticket(space_info, ticket);
ticket->bytes = 0;
- list_del_init(&ticket->list);
wake_up(&ticket->wait);
space_info->tickets_id++;
if (global_rsv->reserved < global_rsv->size)
@@ -769,7 +898,7 @@
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
- list_del_init(&ticket->list);
+ remove_ticket(space_info, ticket);
ticket->error = -ENOSPC;
wake_up(&ticket->wait);
@@ -861,9 +990,132 @@
} while (flush_state <= COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+/*
+ * FLUSH_DELALLOC_WAIT:
+ * Space is freed from flushing delalloc in one of two ways.
+ *
+ * 1) compression is on and we allocate less space than we reserved
+ * 2) we are overwriting existing space
+ *
+ * For #1 that extra space is reclaimed as soon as the delalloc pages are
+ * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
+ * length to ->bytes_reserved, and subtracts the reserved space from
+ * ->bytes_may_use.
+ *
+ * For #2 this is trickier. Once the ordered extent runs we will drop the
+ * extent in the range we are overwriting, which creates a delayed ref for
+ * that freed extent. This however is not reclaimed until the transaction
+ * commits, thus the next stages.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we are freeing inodes, we want to make sure all delayed iputs have
+ * completed, because they could have been on an inode with i_nlink == 0, and
+ * thus have been truncated and freed up space. But again this space is not
+ * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * run and then the transaction must be committed.
+ *
+ * FLUSH_DELAYED_REFS
+ * The above two cases generate delayed refs that will affect
+ * ->total_bytes_pinned. However this counter can be inconsistent with
+ * reality if there are outstanding delayed refs. This is because we adjust
+ * the counter based solely on the current set of delayed refs and disregard
+ * any on-disk state which might include more refs. So for example, if we
+ * have an extent with 2 references, but we only drop 1, we'll see that there
+ * is a negative delayed ref count for the extent and assume that the space
+ * will be freed, and thus increase ->total_bytes_pinned.
+ *
+ * Running the delayed refs gives us the actual real view of what will be
+ * freed at the transaction commit time. This stage will not actually free
+ * space for us, it just makes sure that may_commit_transaction() has all of
+ * the information it needs to make the right decision.
+ *
+ * COMMIT_TRANS
+ * This is where we reclaim all of the pinned space generated by the previous
+ * two stages. We will not commit the transaction if we don't think we're
+ * likely to satisfy our request, which means if our current free space +
+ * total_bytes_pinned < reservation we will not commit. This is why the
+ * previous states are actually important, to make sure we know for sure
+ * whether committing the transaction will allow us to make progress.
+ *
+ * ALLOC_CHUNK_FORCE
+ * For data we start with alloc chunk force, however we could have been full
+ * before, and then the transaction commit could have freed new block groups,
+ * so if we now have space to allocate do the force chunk allocation.
+ */
+static const enum btrfs_flush_state data_flush_states[] = {
+ FLUSH_DELALLOC_WAIT,
+ RUN_DELAYED_IPUTS,
+ FLUSH_DELAYED_REFS,
+ COMMIT_TRANS,
+ ALLOC_CHUNK_FORCE,
+};
+
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 last_tickets_id;
+ int flush_state = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+ }
+
+ while (flush_state < ARRAY_SIZE(data_flush_states)) {
+ flush_space(fs_info, space_info, U64_MAX,
+ data_flush_states[flush_state]);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = 0;
+ }
+
+ if (flush_state >= ARRAY_SIZE(data_flush_states)) {
+ if (space_info->full) {
+ if (maybe_fail_all_tickets(fs_info, space_info))
+ flush_state = 0;
+ else
+ space_info->flush = 0;
+ } else {
+ flush_state = 0;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
+{
+ INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
@@ -913,6 +1165,21 @@
} while (flush_state < states_nr);
}
+static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
@@ -933,7 +1200,7 @@
* despite getting an error, resulting in a space leak
* (bytes_may_use counter of our space_info).
*/
- list_del_init(&ticket->list);
+ remove_ticket(space_info, ticket);
ticket->error = -EINTR;
break;
}
@@ -965,6 +1232,7 @@
int ret;
switch (flush) {
+ case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket);
@@ -979,6 +1247,9 @@
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
+ case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+ priority_reclaim_data_space(fs_info, space_info, ticket);
+ break;
default:
ASSERT(0);
break;
@@ -988,11 +1259,17 @@
ret = ticket->error;
if (ticket->bytes || ticket->error) {
/*
- * Need to delete here for priority tickets. For regular tickets
- * either the async reclaim job deletes the ticket from the list
- * or we delete it ourselves at wait_reserve_ticket().
+ * We were a priority ticket, so we need to delete ourselves
+ * from the list. Because we could have other priority tickets
+ * behind us that require less space, run
+ * btrfs_try_granting_tickets() to see if their reservations can
+ * now be made.
*/
- list_del_init(&ticket->list);
+ if (!list_empty(&ticket->list)) {
+ remove_ticket(space_info, ticket);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ }
+
if (!ret)
ret = -ENOSPC;
}
@@ -1008,6 +1285,16 @@
return ret;
}
+/*
+ * This returns true if this flush state will go through the ordinary flushing
+ * code.
+ */
+static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush == BTRFS_RESERVE_FLUSH_ALL) ||
+ (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+}
+
/**
* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
* @root - the root we're allocating for
@@ -1022,11 +1309,11 @@
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct work_struct *async_work;
struct reserve_ticket ticket;
u64 used;
int ret = 0;
@@ -1035,11 +1322,25 @@
ASSERT(orig_bytes);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ if (flush == BTRFS_RESERVE_FLUSH_DATA)
+ async_work = &fs_info->async_data_reclaim_work;
+ else
+ async_work = &fs_info->async_reclaim_work;
+
spin_lock(&space_info->lock);
ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);
- pending_tickets = !list_empty(&space_info->tickets) ||
- !list_empty(&space_info->priority_tickets);
+
+ /*
+ * We don't want NO_FLUSH allocations to jump everybody, they can
+ * generally handle ENOSPC in a different way, so treat them the same as
+ * normal flushers when it comes to skipping pending tickets.
+ */
+ if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
+ pending_tickets = !list_empty(&space_info->tickets) ||
+ !list_empty(&space_info->priority_tickets);
+ else
+ pending_tickets = !list_empty(&space_info->priority_tickets);
/*
* Carry on if we have enough space (short-circuit) OR call
@@ -1063,10 +1364,12 @@
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
ticket.bytes = orig_bytes;
ticket.error = 0;
+ space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
if (flush == BTRFS_RESERVE_FLUSH_ALL ||
- flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
space_info->flush = 1;
@@ -1074,8 +1377,7 @@
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ queue_work(system_unbound_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1127,8 +1429,7 @@
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
@@ -1146,3 +1447,32 @@
}
return ret;
}
+
+/**
+ * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
+ * @fs_info - the filesystem
+ * @bytes - the number of bytes we need
+ * @flush - how we are allowed to flush
+ *
+ * This will reserve bytes from the data space info. If there is not enough
+ * space then we will attempt to flush space as specified by flush.
+ */
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ int ret;
+
+ ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+
+ ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ }
+ return ret;
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index b9cffc6..74706f6 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -54,6 +54,13 @@
struct list_head ro_bgs;
struct list_head priority_tickets;
struct list_head tickets;
+
+ /*
+ * Size of space that needs to be reclaimed in order to satisfy pending
+ * tickets
+ */
+ u64 reclaim_size;
+
/*
* tickets_id just indicates the next ticket will be handled, so note
* it's not stored per ticket.
@@ -63,7 +70,6 @@
struct rw_semaphore groups_sem;
/* for block groups in our same type */
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
- wait_queue_head_t wait;
struct kobject kobj;
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
@@ -117,7 +123,7 @@
struct btrfs_space_info **space_info);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
-u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
+u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
@@ -143,5 +149,24 @@
btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
+
+static inline void __btrfs_mod_total_bytes_pinned(
+ struct btrfs_space_info *space_info,
+ s64 mod)
+{
+ percpu_counter_add_batch(&space_info->total_bytes_pinned, mod,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
+}
+
+static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info,
+ u64 flags, s64 mod)
+{
+ struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags);
+
+ ASSERT(space_info);
+ __btrfs_mod_total_bytes_pinned(space_info, mod);
+}
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 73f7987..c46be27 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -7,161 +7,152 @@
#include "ctree.h"
-static inline u8 get_unaligned_le8(const void *p)
+static bool check_setget_bounds(const struct extent_buffer *eb,
+ const void *ptr, unsigned off, int size)
{
- return *(u8 *)p;
-}
+ const unsigned long member_offset = (unsigned long)ptr + off;
-static inline void put_unaligned_le8(u8 val, void *p)
-{
- *(u8 *)p = val;
+ if (member_offset > eb->len) {
+ btrfs_warn(eb->fs_info,
+ "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d",
+ (unsigned long)ptr, eb->start, member_offset, size);
+ return false;
+ }
+ if (member_offset + size > eb->len) {
+ btrfs_warn(eb->fs_info,
+ "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d",
+ (unsigned long)ptr, eb->start, member_offset, size);
+ return false;
+ }
+
+ return true;
}
/*
- * this is some deeply nasty code.
+ * Macro templates that define helpers to read/write extent buffer data of a
+ * given size, that are also used via ctree.h for access to item members by
+ * specialized helpers.
*
- * The end result is that anyone who #includes ctree.h gets a
- * declaration for the btrfs_set_foo functions and btrfs_foo functions,
- * which are wrappers of btrfs_set_token_#bits functions and
- * btrfs_get_token_#bits functions, which are defined in this file.
+ * Generic helpers:
+ * - btrfs_set_8 (for 8/16/32/64)
+ * - btrfs_get_8 (for 8/16/32/64)
*
- * These setget functions do all the extent_buffer related mapping
- * required to efficiently read and write specific fields in the extent
- * buffers. Every pointer to metadata items in btrfs is really just
- * an unsigned long offset into the extent buffer which has been
- * cast to a specific type. This gives us all the gcc type checking.
+ * Generic helpers with a token (cached address of the most recently accessed
+ * page):
+ * - btrfs_set_token_8 (for 8/16/32/64)
+ * - btrfs_get_token_8 (for 8/16/32/64)
*
- * The extent buffer api is used to do the page spanning work required to
- * have a metadata blocksize different from the page size.
+ * The set/get functions handle data spanning two pages transparently, in case
+ * metadata block size is larger than page. Every pointer to metadata items is
+ * an offset into the extent buffer page array, cast to a specific type. This
+ * gives us all the type checking.
*
- * There are 2 variants defined, one with a token pointer and one without.
+ * The extent buffer pages stored in the array pages do not form a contiguous
+ * phyusical range, but the API functions assume the linear offset to the range
+ * from 0 to metadata node size.
*/
#define DEFINE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
- const void *ptr, unsigned long off, \
- struct btrfs_map_token *token) \
+u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off) \
{ \
- unsigned long part_offset = (unsigned long)ptr; \
- unsigned long offset = part_offset + off; \
- void *p; \
- int err; \
- char *kaddr; \
- unsigned long map_start; \
- unsigned long map_len; \
- int size = sizeof(u##bits); \
- u##bits res; \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long idx = member_offset >> PAGE_SHIFT; \
+ const unsigned long oip = offset_in_page(member_offset); \
+ const int size = sizeof(u##bits); \
+ u8 lebytes[sizeof(u##bits)]; \
+ const int part = PAGE_SIZE - oip; \
\
ASSERT(token); \
- ASSERT(token->eb == eb); \
- \
- if (token->kaddr && token->offset <= offset && \
- (token->offset + PAGE_SIZE >= offset + size)) { \
- kaddr = token->kaddr; \
- p = kaddr + part_offset - token->offset; \
- res = get_unaligned_le##bits(p + off); \
- return res; \
+ ASSERT(token->kaddr); \
+ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
+ if (token->offset <= member_offset && \
+ member_offset + size <= token->offset + PAGE_SIZE) { \
+ return get_unaligned_le##bits(token->kaddr + oip); \
} \
- err = map_private_extent_buffer(eb, offset, size, \
- &kaddr, &map_start, &map_len); \
- if (err) { \
- __le##bits leres; \
+ token->kaddr = page_address(token->eb->pages[idx]); \
+ token->offset = idx << PAGE_SHIFT; \
+ if (oip + size <= PAGE_SIZE) \
+ return get_unaligned_le##bits(token->kaddr + oip); \
\
- read_extent_buffer(eb, &leres, offset, size); \
- return le##bits##_to_cpu(leres); \
- } \
- p = kaddr + part_offset - map_start; \
- res = get_unaligned_le##bits(p + off); \
- token->kaddr = kaddr; \
- token->offset = map_start; \
- return res; \
+ memcpy(lebytes, token->kaddr + oip, part); \
+ token->kaddr = page_address(token->eb->pages[idx + 1]); \
+ token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(lebytes + part, token->kaddr, size - part); \
+ return get_unaligned_le##bits(lebytes); \
} \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
- unsigned long part_offset = (unsigned long)ptr; \
- unsigned long offset = part_offset + off; \
- void *p; \
- int err; \
- char *kaddr; \
- unsigned long map_start; \
- unsigned long map_len; \
- int size = sizeof(u##bits); \
- u##bits res; \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long oip = offset_in_page(member_offset); \
+ const unsigned long idx = member_offset >> PAGE_SHIFT; \
+ char *kaddr = page_address(eb->pages[idx]); \
+ const int size = sizeof(u##bits); \
+ const int part = PAGE_SIZE - oip; \
+ u8 lebytes[sizeof(u##bits)]; \
\
- err = map_private_extent_buffer(eb, offset, size, \
- &kaddr, &map_start, &map_len); \
- if (err) { \
- __le##bits leres; \
+ ASSERT(check_setget_bounds(eb, ptr, off, size)); \
+ if (oip + size <= PAGE_SIZE) \
+ return get_unaligned_le##bits(kaddr + oip); \
\
- read_extent_buffer(eb, &leres, offset, size); \
- return le##bits##_to_cpu(leres); \
- } \
- p = kaddr + part_offset - map_start; \
- res = get_unaligned_le##bits(p + off); \
- return res; \
+ memcpy(lebytes, kaddr + oip, part); \
+ kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(lebytes + part, kaddr, size - part); \
+ return get_unaligned_le##bits(lebytes); \
} \
-void btrfs_set_token_##bits(struct extent_buffer *eb, \
+void btrfs_set_token_##bits(struct btrfs_map_token *token, \
const void *ptr, unsigned long off, \
- u##bits val, \
- struct btrfs_map_token *token) \
+ u##bits val) \
{ \
- unsigned long part_offset = (unsigned long)ptr; \
- unsigned long offset = part_offset + off; \
- void *p; \
- int err; \
- char *kaddr; \
- unsigned long map_start; \
- unsigned long map_len; \
- int size = sizeof(u##bits); \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long idx = member_offset >> PAGE_SHIFT; \
+ const unsigned long oip = offset_in_page(member_offset); \
+ const int size = sizeof(u##bits); \
+ u8 lebytes[sizeof(u##bits)]; \
+ const int part = PAGE_SIZE - oip; \
\
ASSERT(token); \
- ASSERT(token->eb == eb); \
- \
- if (token->kaddr && token->offset <= offset && \
- (token->offset + PAGE_SIZE >= offset + size)) { \
- kaddr = token->kaddr; \
- p = kaddr + part_offset - token->offset; \
- put_unaligned_le##bits(val, p + off); \
+ ASSERT(token->kaddr); \
+ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
+ if (token->offset <= member_offset && \
+ member_offset + size <= token->offset + PAGE_SIZE) { \
+ put_unaligned_le##bits(val, token->kaddr + oip); \
return; \
} \
- err = map_private_extent_buffer(eb, offset, size, \
- &kaddr, &map_start, &map_len); \
- if (err) { \
- __le##bits val2; \
- \
- val2 = cpu_to_le##bits(val); \
- write_extent_buffer(eb, &val2, offset, size); \
+ token->kaddr = page_address(token->eb->pages[idx]); \
+ token->offset = idx << PAGE_SHIFT; \
+ if (oip + size <= PAGE_SIZE) { \
+ put_unaligned_le##bits(val, token->kaddr + oip); \
return; \
} \
- p = kaddr + part_offset - map_start; \
- put_unaligned_le##bits(val, p + off); \
- token->kaddr = kaddr; \
- token->offset = map_start; \
+ put_unaligned_le##bits(val, lebytes); \
+ memcpy(token->kaddr + oip, lebytes, part); \
+ token->kaddr = page_address(token->eb->pages[idx + 1]); \
+ token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(token->kaddr, lebytes + part, size - part); \
} \
-void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
- unsigned long part_offset = (unsigned long)ptr; \
- unsigned long offset = part_offset + off; \
- void *p; \
- int err; \
- char *kaddr; \
- unsigned long map_start; \
- unsigned long map_len; \
- int size = sizeof(u##bits); \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long oip = offset_in_page(member_offset); \
+ const unsigned long idx = member_offset >> PAGE_SHIFT; \
+ char *kaddr = page_address(eb->pages[idx]); \
+ const int size = sizeof(u##bits); \
+ const int part = PAGE_SIZE - oip; \
+ u8 lebytes[sizeof(u##bits)]; \
\
- err = map_private_extent_buffer(eb, offset, size, \
- &kaddr, &map_start, &map_len); \
- if (err) { \
- __le##bits val2; \
- \
- val2 = cpu_to_le##bits(val); \
- write_extent_buffer(eb, &val2, offset, size); \
+ ASSERT(check_setget_bounds(eb, ptr, off, size)); \
+ if (oip + size <= PAGE_SIZE) { \
+ put_unaligned_le##bits(val, kaddr + oip); \
return; \
} \
- p = kaddr + part_offset - map_start; \
- put_unaligned_le##bits(val, p + off); \
+ \
+ put_unaligned_le##bits(val, lebytes); \
+ memcpy(kaddr + oip, lebytes, part); \
+ kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(kaddr, lebytes + part, size - part); \
}
DEFINE_BTRFS_SETGET_BITS(8)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a69bdb..2663485 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -46,6 +46,7 @@
#include "sysfs.h"
#include "tests/btrfs-tests.h"
#include "block-group.h"
+#include "discard.h"
#include "qgroup.h"
#define CREATE_TRACE_POINTS
@@ -66,28 +67,52 @@
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
-const char *btrfs_decode_error(int errno)
+/*
+ * Generally the error codes correspond to their respective errors, but there
+ * are a few special cases.
+ *
+ * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
+ * instance will return EUCLEAN if any of the blocks are corrupted in
+ * a way that is problematic. We want to reserve EUCLEAN for these
+ * sort of corruptions.
+ *
+ * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
+ * need to use EROFS for this case. We will have no idea of the
+ * original failure, that will have been reported at the time we tripped
+ * over the error. Each subsequent error that doesn't have any context
+ * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
+ */
+const char * __attribute_const__ btrfs_decode_error(int errno)
{
char *errstr = "unknown";
switch (errno) {
- case -EIO:
+ case -ENOENT: /* -2 */
+ errstr = "No such entry";
+ break;
+ case -EIO: /* -5 */
errstr = "IO failure";
break;
- case -ENOMEM:
+ case -ENOMEM: /* -12*/
errstr = "Out of memory";
break;
- case -EROFS:
- errstr = "Readonly filesystem";
- break;
- case -EEXIST:
+ case -EEXIST: /* -17 */
errstr = "Object already exists";
break;
- case -ENOSPC:
+ case -ENOSPC: /* -28 */
errstr = "No space left";
break;
- case -ENOENT:
- errstr = "No such entry";
+ case -EROFS: /* -30 */
+ errstr = "Readonly filesystem";
+ break;
+ case -EOPNOTSUPP: /* -95 */
+ errstr = "Operation not supported";
+ break;
+ case -EUCLEAN: /* -117 */
+ errstr = "Filesystem corrupted";
+ break;
+ case -EDQUOT: /* -122 */
+ errstr = "Quota exceeded";
break;
}
@@ -146,6 +171,8 @@
if (sb_rdonly(sb))
return;
+ btrfs_discard_stop(fs_info);
+
/* btrfs handle error by forcing the filesystem readonly */
sb->s_flags |= SB_RDONLY;
btrfs_info(fs_info, "forced readonly");
@@ -187,7 +214,7 @@
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
-void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
@@ -313,7 +340,7 @@
Opt_datasum, Opt_nodatasum,
Opt_defrag, Opt_nodefrag,
Opt_discard, Opt_nodiscard,
- Opt_nologreplay,
+ Opt_discard_mode,
Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
@@ -327,13 +354,15 @@
Opt_subvolid,
Opt_thread_pool,
Opt_treelog, Opt_notreelog,
- Opt_usebackuproot,
Opt_user_subvol_rm_allowed,
+ /* Rescue options */
+ Opt_rescue,
+ Opt_usebackuproot,
+ Opt_nologreplay,
+
/* Deprecated options */
- Opt_alloc_start,
Opt_recovery,
- Opt_subvolrootid,
/* Debugging options */
Opt_check_integrity,
@@ -375,8 +404,8 @@
{Opt_defrag, "autodefrag"},
{Opt_nodefrag, "noautodefrag"},
{Opt_discard, "discard"},
+ {Opt_discard_mode, "discard=%s"},
{Opt_nodiscard, "nodiscard"},
- {Opt_nologreplay, "nologreplay"},
{Opt_norecovery, "norecovery"},
{Opt_ratio, "metadata_ratio=%u"},
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
@@ -394,13 +423,17 @@
{Opt_thread_pool, "thread_pool=%u"},
{Opt_treelog, "treelog"},
{Opt_notreelog, "notreelog"},
- {Opt_usebackuproot, "usebackuproot"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+ /* Rescue options */
+ {Opt_rescue, "rescue=%s"},
+ /* Deprecated, with alias rescue=nologreplay */
+ {Opt_nologreplay, "nologreplay"},
+ /* Deprecated, with alias rescue=usebackuproot */
+ {Opt_usebackuproot, "usebackuproot"},
+
/* Deprecated options */
- {Opt_alloc_start, "alloc_start=%s"},
{Opt_recovery, "recovery"},
- {Opt_subvolrootid, "subvolrootid=%d"},
/* Debugging options */
{Opt_check_integrity, "check_int"},
@@ -419,6 +452,55 @@
{Opt_err, NULL},
};
+static const match_table_t rescue_tokens = {
+ {Opt_usebackuproot, "usebackuproot"},
+ {Opt_nologreplay, "nologreplay"},
+ {Opt_err, NULL},
+};
+
+static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+{
+ char *opts;
+ char *orig;
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int ret = 0;
+
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ":")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+ token = match_token(p, rescue_tokens, args);
+ switch (token){
+ case Opt_usebackuproot:
+ btrfs_info(info,
+ "trying to use backup root at mount time");
+ btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ break;
+ case Opt_nologreplay:
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
+ case Opt_err:
+ btrfs_info(info, "unrecognized rescue option '%s'", p);
+ ret = -EINVAL;
+ goto out;
+ default:
+ break;
+ }
+
+ }
+out:
+ kfree(orig);
+ return ret;
+}
+
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
@@ -466,7 +548,6 @@
case Opt_subvol:
case Opt_subvol_empty:
case Opt_subvolid:
- case Opt_subvolrootid:
case Opt_device:
/*
* These are parsed by btrfs_parse_subvol_options or
@@ -510,7 +591,7 @@
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
- /* Fallthrough */
+ fallthrough;
case Opt_compress:
case Opt_compress_type:
saved_compress_type = btrfs_test_opt(info,
@@ -613,7 +694,7 @@
btrfs_set_opt(info->mount_opt, NOSSD);
btrfs_clear_and_info(info, SSD,
"not using ssd optimizations");
- /* Fallthrough */
+ fallthrough;
case Opt_nossd_spread:
btrfs_clear_and_info(info, SSD_SPREAD,
"not using spread ssd allocation scheme");
@@ -654,10 +735,6 @@
goto out;
}
break;
- case Opt_alloc_start:
- btrfs_info(info,
- "option alloc_start is obsolete, ignored");
- break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
info->sb->s_flags |= SB_POSIXACL;
@@ -680,6 +757,8 @@
break;
case Opt_norecovery:
case Opt_nologreplay:
+ btrfs_warn(info,
+ "'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
@@ -700,12 +779,26 @@
info->metadata_ratio);
break;
case Opt_discard:
- btrfs_set_and_info(info, DISCARD,
- "turning on discard");
+ case Opt_discard_mode:
+ if (token == Opt_discard ||
+ strcmp(args[0].from, "sync") == 0) {
+ btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
+ btrfs_set_and_info(info, DISCARD_SYNC,
+ "turning on sync discard");
+ } else if (strcmp(args[0].from, "async") == 0) {
+ btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
+ btrfs_set_and_info(info, DISCARD_ASYNC,
+ "turning on async discard");
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
break;
case Opt_nodiscard:
- btrfs_clear_and_info(info, DISCARD,
+ btrfs_clear_and_info(info, DISCARD_SYNC,
"turning off discard");
+ btrfs_clear_and_info(info, DISCARD_ASYNC,
+ "turning off async discard");
break;
case Opt_space_cache:
case Opt_space_cache_version:
@@ -739,6 +832,8 @@
}
break;
case Opt_inode_cache:
+ btrfs_warn(info,
+ "the 'inode_cache' option is deprecated and will have no effect from 5.11");
btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
"enabling inode map caching");
break;
@@ -768,10 +863,11 @@
"disabling auto defrag");
break;
case Opt_recovery:
- btrfs_warn(info,
- "'recovery' is deprecated, use 'usebackuproot' instead");
- /* fall through */
case Opt_usebackuproot:
+ btrfs_warn(info,
+ "'%s' is deprecated, use 'rescue=usebackuproot' instead",
+ token == Opt_recovery ? "recovery" :
+ "usebackuproot");
btrfs_info(info,
"trying to use backup root at mount time");
btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
@@ -836,6 +932,11 @@
}
info->commit_interval = intarg;
break;
+ case Opt_rescue:
+ ret = parse_rescue_options(info, args[0].from);
+ if (ret < 0)
+ goto out;
+ break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
btrfs_info(info, "fragmenting all space");
@@ -859,7 +960,7 @@
break;
#endif
case Opt_err:
- btrfs_info(info, "unrecognized mount option '%s'", p);
+ btrfs_err(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
goto out;
default:
@@ -997,9 +1098,6 @@
*subvol_objectid = subvolid;
break;
- case Opt_subvolrootid:
- pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
- break;
default:
break;
}
@@ -1014,7 +1112,7 @@
u64 subvol_objectid)
{
struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_root *fs_root;
+ struct btrfs_root *fs_root = NULL;
struct btrfs_root_ref *root_ref;
struct btrfs_inode_ref *inode_ref;
struct btrfs_key key;
@@ -1079,12 +1177,10 @@
dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
btrfs_release_path(path);
- key.objectid = subvol_objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
+ fs_root = NULL;
goto err;
}
@@ -1129,6 +1225,8 @@
ptr[0] = '/';
btrfs_release_path(path);
}
+ btrfs_put_root(fs_root);
+ fs_root = NULL;
}
btrfs_free_path(path);
@@ -1141,6 +1239,7 @@
return name;
err:
+ btrfs_put_root(fs_root);
btrfs_free_path(path);
kfree(name);
return ERR_PTR(ret);
@@ -1193,7 +1292,6 @@
{
struct inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_key key;
int err;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1221,10 +1319,7 @@
return err;
}
- key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
+ inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
@@ -1325,11 +1420,13 @@
if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
if (btrfs_test_opt(info, NOLOGREPLAY))
- seq_puts(seq, ",nologreplay");
+ seq_puts(seq, ",rescue=nologreplay");
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
- if (btrfs_test_opt(info, DISCARD))
+ if (btrfs_test_opt(info, DISCARD_SYNC))
seq_puts(seq, ",discard");
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ seq_puts(seq, ",discard=async");
if (!(info->sb->s_flags & SB_POSIXACL))
seq_puts(seq, ",noacl");
if (btrfs_test_opt(info, SPACE_CACHE))
@@ -1508,14 +1605,17 @@
/*
* Setup a dummy root and fs_info for test/set super. This is because
* we don't actually fill this stuff out until open_ctree, but we need
- * it for searching for existing supers, so this lets us do that and
- * then open_ctree will properly initialize everything later.
+ * then open_ctree will properly initialize the file system specific
+ * settings later. btrfs_init_fs_info initializes the static elements
+ * of the fs_info (locks and such) to make cleanup easier if we find a
+ * superblock with our given fs_devices later on at sget() time.
*/
fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
if (!fs_info) {
error = -ENOMEM;
goto error_sec_opts;
}
+ btrfs_init_fs_info(fs_info);
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
@@ -1561,7 +1661,7 @@
if (s->s_root) {
btrfs_close_devices(fs_devices);
- free_fs_info(fs_info);
+ btrfs_free_fs_info(fs_info);
if ((flags ^ s->s_flags) & SB_RDONLY)
error = -EBUSY;
} else {
@@ -1584,7 +1684,7 @@
error_close_devices:
btrfs_close_devices(fs_devices);
error_fs_info:
- free_fs_info(fs_info);
+ btrfs_free_fs_info(fs_info);
error_sec_opts:
security_free_mnt_opts(&new_sec_opts);
return ERR_PTR(error);
@@ -1680,7 +1780,6 @@
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
@@ -1694,11 +1793,6 @@
new_pool_size);
}
-static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
-{
- set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
-}
-
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
unsigned long old_opts, int flags)
{
@@ -1725,7 +1819,13 @@
btrfs_cleanup_defrag_inodes(fs_info);
}
- clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ /* If we toggled discard async */
+ if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
+ btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_resume(fs_info);
+ else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
+ !btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_cleanup(fs_info);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1741,7 +1841,7 @@
int ret;
sync_filesystem(sb);
- btrfs_remount_prepare(fs_info);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
if (data) {
void *new_sec_opts = NULL;
@@ -1771,6 +1871,9 @@
* the filesystem is busy.
*/
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
+
+ btrfs_discard_cleanup(fs_info);
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
@@ -1875,6 +1978,8 @@
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
return 0;
restore:
@@ -1889,6 +1994,8 @@
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
btrfs_remount_cleanup(fs_info, old_opts);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
return ret;
}
@@ -1963,6 +2070,10 @@
num_stripes = nr_devices;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_stripes = 2;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
+ num_stripes = 3;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
+ num_stripes = 4;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 4;
@@ -2049,7 +2160,6 @@
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
struct btrfs_super_block *disk_super = fs_info->super_copy;
- struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
@@ -2062,8 +2172,7 @@
u64 thresh = 0;
int mixed = 0;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
@@ -2092,8 +2201,6 @@
total_used += found->disk_used;
}
- rcu_read_unlock();
-
buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
buf->f_blocks >>= bits;
buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
@@ -2162,7 +2269,7 @@
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
kill_anon_super(sb);
- free_fs_info(fs_info);
+ btrfs_free_fs_info(fs_info);
}
static struct file_system_type btrfs_fs_type = {
@@ -2195,7 +2302,7 @@
}
/*
- * used by btrfsctl to scan devices when no FS is mounted
+ * Used by /dev/btrfs-control for devices ioctls.
*/
static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
@@ -2325,7 +2432,7 @@
static const struct file_operations btrfs_ctl_fops = {
.open = btrfs_control_open,
.unlocked_ioctl = btrfs_control_ioctl,
- .compat_ioctl = btrfs_control_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.owner = THIS_MODULE,
.llseek = noop_llseek,
};
@@ -2388,10 +2495,14 @@
if (err)
goto free_cachep;
- err = extent_map_init();
+ err = extent_state_cache_init();
if (err)
goto free_extent_io;
+ err = extent_map_init();
+ if (err)
+ goto free_extent_state_cache;
+
err = ordered_data_init();
if (err)
goto free_extent_map;
@@ -2450,6 +2561,8 @@
ordered_data_exit();
free_extent_map:
extent_map_exit();
+free_extent_state_cache:
+ extent_state_cache_exit();
free_extent_io:
extent_io_exit();
free_cachep:
@@ -2470,6 +2583,7 @@
btrfs_prelim_ref_exit();
ordered_data_exit();
extent_map_exit();
+ extent_state_cache_exit();
extent_io_exit();
btrfs_interface_exit();
btrfs_end_io_wq_exit();
@@ -2484,3 +2598,6 @@
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
+MODULE_SOFTDEP("pre: xxhash64");
+MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: blake2b-256");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5c299e1..3bb6b68 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -9,14 +9,18 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/bug.h>
+#include <crypto/hash.h>
#include "ctree.h"
+#include "discard.h"
#include "disk-io.h"
+#include "send.h"
#include "transaction.h"
#include "sysfs.h"
#include "volumes.h"
#include "space-info.h"
#include "block-group.h"
+#include "qgroup.h"
struct btrfs_feature_attr {
struct kobj_attribute kobj_attr;
@@ -153,7 +157,7 @@
} else
val = can_modify_feature(fa);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
@@ -258,6 +262,7 @@
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
+BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -272,6 +277,7 @@
BTRFS_FEAT_ATTR_PTR(no_holes),
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
+ BTRFS_FEAT_ATTR_PTR(raid1c34),
NULL
};
@@ -291,12 +297,42 @@
static ssize_t rmdir_subvol_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "0\n");
+ return scnprintf(buf, PAGE_SIZE, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
+static ssize_t supported_checksums_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < btrfs_get_num_csums(); i++) {
+ /*
+ * This "trick" only works as long as 'enum btrfs_csum_type' has
+ * no holes in it
+ */
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+
+ }
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
+
+static ssize_t send_stream_version_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+}
+BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
+ BTRFS_ATTR_PTR(static_feature, supported_checksums),
+ BTRFS_ATTR_PTR(static_feature, send_stream_version),
NULL
};
@@ -314,11 +350,177 @@
#ifdef CONFIG_BTRFS_DEBUG
/*
+ * Discard statistics and tunables
+ */
+#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent)
+
+static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ atomic64_read(&fs_info->discard_ctl.discardable_bytes));
+}
+BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
+
+static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ atomic_read(&fs_info->discard_ctl.discardable_extents));
+}
+BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
+
+static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ fs_info->discard_ctl.discard_bitmap_bytes);
+}
+BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
+
+static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
+}
+BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
+
+static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ fs_info->discard_ctl.discard_extent_bytes);
+}
+BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
+
+static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.iops_limit));
+}
+
+static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ u32 iops_limit;
+ int ret;
+
+ ret = kstrtou32(buf, 10, &iops_limit);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(discard_ctl->iops_limit, iops_limit);
+
+ return len;
+}
+BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
+ btrfs_discard_iops_limit_store);
+
+static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.kbps_limit));
+}
+
+static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ u32 kbps_limit;
+ int ret;
+
+ ret = kstrtou32(buf, 10, &kbps_limit);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);
+
+ return len;
+}
+BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
+ btrfs_discard_kbps_limit_store);
+
+static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ READ_ONCE(fs_info->discard_ctl.max_discard_size));
+}
+
+static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ u64 max_discard_size;
+ int ret;
+
+ ret = kstrtou64(buf, 10, &max_discard_size);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size);
+
+ return len;
+}
+BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
+ btrfs_discard_max_discard_size_store);
+
+static const struct attribute *discard_debug_attrs[] = {
+ BTRFS_ATTR_PTR(discard, discardable_bytes),
+ BTRFS_ATTR_PTR(discard, discardable_extents),
+ BTRFS_ATTR_PTR(discard, discard_bitmap_bytes),
+ BTRFS_ATTR_PTR(discard, discard_bytes_saved),
+ BTRFS_ATTR_PTR(discard, discard_extent_bytes),
+ BTRFS_ATTR_PTR(discard, iops_limit),
+ BTRFS_ATTR_PTR(discard, kbps_limit),
+ BTRFS_ATTR_PTR(discard, max_discard_size),
+ NULL,
+};
+
+/*
* Runtime debugging exported via sysfs
*
* /sys/fs/btrfs/debug - applies to module or all filesystems
* /sys/fs/btrfs/UUID - applies only to the given filesystem
*/
+static const struct attribute *btrfs_debug_mount_attrs[] = {
+ NULL,
+};
+
static struct attribute *btrfs_debug_feature_attrs[] = {
NULL
};
@@ -338,7 +540,7 @@
val = *value_ptr;
if (lock)
spin_unlock(lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}
static ssize_t global_rsv_size_show(struct kobject *kobj,
@@ -372,19 +574,19 @@
{
struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags);
u64 val = 0;
down_read(&sinfo->groups_sem);
list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes))
- val += block_group->key.offset;
+ val += block_group->length;
else
- val += btrfs_block_group_used(&block_group->item);
+ val += block_group->used;
}
up_read(&sinfo->groups_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}
static struct attribute *raid_attrs[] = {
@@ -421,7 +623,7 @@
{
struct btrfs_space_info *sinfo = to_space_info(kobj);
s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
- return snprintf(buf, PAGE_SIZE, "%lld\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%lld\n", val);
}
SPACE_INFO_ATTR(flags);
@@ -478,7 +680,7 @@
ssize_t ret;
spin_lock(&fs_info->super_lock);
- ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
spin_unlock(&fs_info->super_lock);
return ret;
@@ -526,7 +728,7 @@
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -536,8 +738,8 @@
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -547,8 +749,7 @@
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -560,7 +761,7 @@
int quota_override;
quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
- return snprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);
}
static ssize_t quota_override_store(struct kobject *kobj,
@@ -598,12 +799,61 @@
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%pU\n",
+ return scnprintf(buf, PAGE_SIZE, "%pU\n",
fs_info->fs_devices->metadata_uuid);
}
BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
+static ssize_t btrfs_checksum_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
+
+ return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
+}
+
+BTRFS_ATTR(, checksum, btrfs_checksum_show);
+
+static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ const char *str;
+
+ switch (READ_ONCE(fs_info->exclusive_operation)) {
+ case BTRFS_EXCLOP_NONE:
+ str = "none\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ str = "balance\n";
+ break;
+ case BTRFS_EXCLOP_DEV_ADD:
+ str = "device add\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REMOVE:
+ str = "device remove\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REPLACE:
+ str = "device replace\n";
+ break;
+ case BTRFS_EXCLOP_RESIZE:
+ str = "resize\n";
+ break;
+ case BTRFS_EXCLOP_SWAP_ACTIVATE:
+ str = "swap activate\n";
+ break;
+ default:
+ str = "UNKNOWN\n";
+ break;
+ }
+ return scnprintf(buf, PAGE_SIZE, "%s", str);
+}
+BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -611,6 +861,8 @@
BTRFS_ATTR_PTR(, clone_alignment),
BTRFS_ATTR_PTR(, quota_override),
BTRFS_ATTR_PTR(, metadata_uuid),
+ BTRFS_ATTR_PTR(, checksum),
+ BTRFS_ATTR_PTR(, exclusive_operation),
NULL,
};
@@ -695,10 +947,16 @@
static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
{
- if (fs_devs->device_dir_kobj) {
- kobject_del(fs_devs->device_dir_kobj);
- kobject_put(fs_devs->device_dir_kobj);
- fs_devs->device_dir_kobj = NULL;
+ if (fs_devs->devinfo_kobj) {
+ kobject_del(fs_devs->devinfo_kobj);
+ kobject_put(fs_devs->devinfo_kobj);
+ fs_devs->devinfo_kobj = NULL;
+ }
+
+ if (fs_devs->devices_kobj) {
+ kobject_del(fs_devs->devices_kobj);
+ kobject_put(fs_devs->devices_kobj);
+ fs_devs->devices_kobj = NULL;
}
if (fs_devs->fsid_kobj.state_initialized) {
@@ -723,19 +981,48 @@
}
}
+static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+ }
+}
+
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
- btrfs_reset_fs_info_ptr(fs_info);
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+
+ sysfs_remove_link(fsid_kobj, "bdi");
if (fs_info->space_info_kobj) {
sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
+#ifdef CONFIG_BTRFS_DEBUG
+ if (fs_info->discard_debug_kobj) {
+ sysfs_remove_files(fs_info->discard_debug_kobj,
+ discard_debug_attrs);
+ kobject_del(fs_info->discard_debug_kobj);
+ kobject_put(fs_info->discard_debug_kobj);
+ }
+ if (fs_info->debug_kobj) {
+ sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ kobject_del(fs_info->debug_kobj);
+ kobject_put(fs_info->debug_kobj);
+ }
+#endif
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
- sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
+ sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(fsid_kobj, btrfs_attrs);
+ btrfs_sysfs_remove_fs_devices(fs_info->fs_devices);
}
static const char * const btrfs_feature_set_names[FEAT_MAX] = {
@@ -744,7 +1031,7 @@
[FEAT_INCOMPAT] = "incompat",
};
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set)
+const char *btrfs_feature_set_name(enum btrfs_feature_set set)
{
return btrfs_feature_set_names[set];
}
@@ -767,7 +1054,7 @@
continue;
name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
- len += snprintf(str + len, bufsize - len, "%s%s",
+ len += scnprintf(str + len, bufsize - len, "%s%s",
len ? "," : "", name);
}
@@ -822,7 +1109,7 @@
* Create a sysfs entry for a given block group type at path
* /sys/fs/btrfs/UUID/allocation/data/TYPE
*/
-void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache)
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_space_info *space_info = cache->space_info;
@@ -850,17 +1137,38 @@
rkobj->flags = cache->flags;
kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ /*
+ * We call this either on mount, or if we've created a block group for a
+ * new index type while running (i.e. when restriping). The running
+ * case is tricky because we could race with other threads, so we need
+ * to have this check to make sure we didn't already init the kobject.
+ *
+ * We don't have to protect on the free side because it only happens on
+ * unmount.
+ */
+ spin_lock(&space_info->lock);
+ if (space_info->block_group_kobjs[index]) {
+ spin_unlock(&space_info->lock);
+ kobject_put(&rkobj->kobj);
+ return;
+ } else {
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+ spin_unlock(&space_info->lock);
+
ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
btrfs_bg_type_to_raid_name(rkobj->flags));
memalloc_nofs_restore(nofs_flag);
if (ret) {
+ spin_lock(&space_info->lock);
+ space_info->block_group_kobjs[index] = NULL;
+ spin_unlock(&space_info->lock);
kobject_put(&rkobj->kobj);
btrfs_warn(fs_info,
"failed to add kobject for block cache, ignoring");
return;
}
-
- space_info->block_group_kobjs[index] = &rkobj->kobj;
}
/*
@@ -922,83 +1230,184 @@
return 0;
}
-/* when one_device is NULL, it removes all device links */
-
-int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
+ struct kobject *devices_kobj;
- if (!fs_devices->device_dir_kobj)
- return -EINVAL;
+ /*
+ * Seed fs_devices devices_kobj aren't used, fetch kobject from the
+ * fs_info::fs_devices.
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ ASSERT(devices_kobj);
- if (one_device && one_device->bdev) {
- disk = one_device->bdev->bd_part;
+ if (device->bdev) {
+ disk = device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
-
- sysfs_remove_link(fs_devices->device_dir_kobj,
- disk_kobj->name);
+ sysfs_remove_link(devices_kobj, disk_kobj->name);
}
- if (one_device)
- return 0;
-
- list_for_each_entry(one_device,
- &fs_devices->devices, dev_list) {
- if (!one_device->bdev)
- continue;
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
-
- sysfs_remove_link(fs_devices->device_dir_kobj,
- disk_kobj->name);
+ if (device->devid_kobj.state_initialized) {
+ kobject_del(&device->devid_kobj);
+ kobject_put(&device->devid_kobj);
+ wait_for_completion(&device->kobj_unregister);
}
-
- return 0;
}
-int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
+static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
{
- if (!fs_devs->device_dir_kobj)
- fs_devs->device_dir_kobj = kobject_create_and_add("devices",
- &fs_devs->fsid_kobj);
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
- if (!fs_devs->device_dir_kobj)
- return -ENOMEM;
+ val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- return 0;
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
+
+static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
+
+static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
+
+static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
+
+static struct attribute *devid_attrs[] = {
+ BTRFS_ATTR_PTR(devid, in_fs_metadata),
+ BTRFS_ATTR_PTR(devid, missing),
+ BTRFS_ATTR_PTR(devid, replace_target),
+ BTRFS_ATTR_PTR(devid, writeable),
+ NULL
+};
+ATTRIBUTE_GROUPS(devid);
+
+static void btrfs_release_devid_kobj(struct kobject *kobj)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ memset(&device->devid_kobj, 0, sizeof(struct kobject));
+ complete(&device->kobj_unregister);
}
-int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+static struct kobj_type devid_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = devid_groups,
+ .release = btrfs_release_devid_kobj,
+};
+
+int btrfs_sysfs_add_device(struct btrfs_device *device)
{
- int error = 0;
- struct btrfs_device *dev;
+ int ret;
unsigned int nofs_flag;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
+
+ /*
+ * Make sure we use the fs_info::fs_devices to fetch the kobjects even
+ * for the seed fs_devices
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj;
+ ASSERT(devices_kobj);
+ ASSERT(devinfo_kobj);
nofs_flag = memalloc_nofs_save();
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+
+ if (device->bdev) {
struct hd_struct *disk;
struct kobject *disk_kobj;
- if (!dev->bdev)
- continue;
-
- if (one_device && one_device != dev)
- continue;
-
- disk = dev->bdev->bd_part;
+ disk = device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_devices->device_dir_kobj,
- disk_kobj, disk_kobj->name);
- if (error)
- break;
+ ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
+ if (ret) {
+ btrfs_warn(device->fs_info,
+ "creating sysfs device link for devid %llu failed: %d",
+ device->devid, ret);
+ goto out;
+ }
}
- memalloc_nofs_restore(nofs_flag);
- return error;
+ init_completion(&device->kobj_unregister);
+ ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype,
+ devinfo_kobj, "%llu", device->devid);
+ if (ret) {
+ kobject_put(&device->devid_kobj);
+ btrfs_warn(device->fs_info,
+ "devinfo init for devid %llu failed: %d",
+ device->devid, ret);
+ }
+
+out:
+ memalloc_nofs_restore(nofs_flag);
+ return ret;
+}
+
+static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ btrfs_sysfs_remove_fs_devices(fs_devices);
+ return ret;
}
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
@@ -1012,8 +1421,8 @@
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices)
+
{
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
@@ -1021,33 +1430,64 @@
* Sprouting changes fsid of the mounted filesystem, rename the fsid
* directory
*/
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid);
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid);
if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
btrfs_warn(fs_devices->fs_info,
"sysfs: failed to create fsid for sprout");
}
+void btrfs_sysfs_update_devid(struct btrfs_device *device)
+{
+ char tmp[24];
+
+ snprintf(tmp, sizeof(tmp), "%llu", device->devid);
+
+ if (kobject_rename(&device->devid_kobj, tmp))
+ btrfs_warn(device->fs_devices->fs_info,
+ "sysfs: failed to update devid for %llu",
+ device->devid);
+}
+
/* /sys/fs/btrfs/ entry */
static struct kset *btrfs_kset;
/*
+ * Creates:
+ * /sys/fs/btrfs/UUID
+ *
* Can be called by the device discovery thread.
- * And parent can be specified for seed device
*/
-int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
- struct kobject *parent)
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
{
int error;
init_completion(&fs_devs->kobj_unregister);
fs_devs->fsid_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->fsid_kobj,
- &btrfs_ktype, parent, "%pU", fs_devs->fsid);
+ error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_devs->fsid);
if (error) {
kobject_put(&fs_devs->fsid_kobj);
return error;
}
+ fs_devs->devices_kobj = kobject_create_and_add("devices",
+ &fs_devs->fsid_kobj);
+ if (!fs_devs->devices_kobj) {
+ btrfs_err(fs_devs->fs_info,
+ "failed to init sysfs device interface");
+ btrfs_sysfs_remove_fsid(fs_devs);
+ return -ENOMEM;
+ }
+
+ fs_devs->devinfo_kobj = kobject_create_and_add("devinfo",
+ &fs_devs->fsid_kobj);
+ if (!fs_devs->devinfo_kobj) {
+ btrfs_err(fs_devs->fs_info,
+ "failed to init sysfs devinfo kobject");
+ btrfs_sysfs_remove_fsid(fs_devs);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -1057,15 +1497,13 @@
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- btrfs_set_fs_info_ptr(fs_info);
-
- error = btrfs_sysfs_add_device_link(fs_devs, NULL);
+ error = btrfs_sysfs_add_fs_devices(fs_devs);
if (error)
return error;
error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_sysfs_rm_device_link(fs_devs, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_devs);
return error;
}
@@ -1075,8 +1513,26 @@
goto failure;
#ifdef CONFIG_BTRFS_DEBUG
- error = sysfs_create_group(fsid_kobj,
- &btrfs_debug_feature_attr_group);
+ fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj);
+ if (!fs_info->debug_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ if (error)
+ goto failure;
+
+ /* Discard directory */
+ fs_info->discard_debug_kobj = kobject_create_and_add("discard",
+ fs_info->debug_kobj);
+ if (!fs_info->discard_debug_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->discard_debug_kobj,
+ discard_debug_attrs);
if (error)
goto failure;
#endif
@@ -1085,6 +1541,10 @@
if (error)
goto failure;
+ error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
+ if (error)
+ goto failure;
+
fs_info->space_info_kobj = kobject_create_and_add("allocation",
fsid_kobj);
if (!fs_info->space_info_kobj) {
@@ -1102,6 +1562,155 @@
return error;
}
+static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj)
+{
+ return to_fs_info(kobj->parent->parent);
+}
+
+#define QGROUP_ATTR(_member, _show_name) \
+static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member)
+
+#define QGROUP_RSV_ATTR(_name, _type) \
+static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->rsv.values[_type], \
+ &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name)
+
+QGROUP_ATTR(rfer, referenced);
+QGROUP_ATTR(excl, exclusive);
+QGROUP_ATTR(max_rfer, max_referenced);
+QGROUP_ATTR(max_excl, max_exclusive);
+QGROUP_ATTR(lim_flags, limit_flags);
+QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
+QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
+QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
+
+static struct attribute *qgroup_attrs[] = {
+ BTRFS_ATTR_PTR(qgroup, referenced),
+ BTRFS_ATTR_PTR(qgroup, exclusive),
+ BTRFS_ATTR_PTR(qgroup, max_referenced),
+ BTRFS_ATTR_PTR(qgroup, max_exclusive),
+ BTRFS_ATTR_PTR(qgroup, limit_flags),
+ BTRFS_ATTR_PTR(qgroup, rsv_data),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroup);
+
+static void qgroup_release(struct kobject *kobj)
+{
+ struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj);
+
+ memset(&qgroup->kobj, 0, sizeof(*kobj));
+}
+
+static struct kobj_type qgroup_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = qgroup_release,
+ .default_groups = qgroup_groups,
+};
+
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ struct kobject *qgroups_kobj = fs_info->qgroups_kobj;
+ int ret;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+ if (qgroup->kobj.state_initialized)
+ return 0;
+ if (!qgroups_kobj)
+ return -EINVAL;
+
+ ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
+ "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid));
+ if (ret < 0)
+ kobject_put(&qgroup->kobj);
+
+ return ret;
+}
+
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node)
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ if (fs_info->qgroups_kobj) {
+ kobject_del(fs_info->qgroups_kobj);
+ kobject_put(fs_info->qgroups_kobj);
+ fs_info->qgroups_kobj = NULL;
+ }
+}
+
+/* Called when qgroups get initialized, thus there is no need for locking */
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+ int ret = 0;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+
+ ASSERT(fsid_kobj);
+ if (fs_info->qgroups_kobj)
+ return 0;
+
+ fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj);
+ if (!fs_info->qgroups_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node) {
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ if (ret < 0)
+ btrfs_sysfs_del_qgroups(fs_info);
+ return ret;
+}
+
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ if (qgroup->kobj.state_initialized) {
+ kobject_del(&qgroup->kobj);
+ kobject_put(&qgroup->kobj);
+ }
+}
/*
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
@@ -1112,12 +1721,16 @@
{
struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 features;
- int ret;
+ u64 __maybe_unused features;
+ int __maybe_unused ret;
if (!fs_info)
return;
+ /*
+ * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
+ * safe when called from some contexts (eg. balance)
+ */
features = get_features(fs_info, set);
ASSERT(bit & supported_feature_masks[set]);
@@ -1173,6 +1786,9 @@
sysfs_unmerge_group(&btrfs_kset->kobj,
&btrfs_static_feature_attr_group);
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+#ifdef CONFIG_BTRFS_DEBUG
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
+#endif
kset_unregister(btrfs_kset);
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 610e9c3..bacef43 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -13,17 +13,12 @@
};
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
-int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
-int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
-int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
- struct kobject *parent);
-int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
+const char *btrfs_feature_set_name(enum btrfs_feature_set set);
+int btrfs_sysfs_add_device(struct btrfs_device *device);
+void btrfs_sysfs_remove_device(struct btrfs_device *device);
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
u64 bit, enum btrfs_feature_set set);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
@@ -32,9 +27,17 @@
void __cold btrfs_exit_sysfs(void);
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache);
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache);
int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info);
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
+void btrfs_sysfs_update_devid(struct btrfs_device *device);
+
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 98f9684..999c14e 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -86,6 +86,27 @@
unregister_filesystem(&test_type);
}
+struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *dev;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ INIT_LIST_HEAD(&dev->dev_list);
+ list_add(&dev->dev_list, &fs_info->fs_devices->devices);
+
+ return dev;
+}
+
+static void btrfs_free_dummy_device(struct btrfs_device *dev)
+{
+ extent_io_tree_release(&dev->alloc_state);
+ kfree(dev);
+}
+
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
{
struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
@@ -99,6 +120,8 @@
kfree(fs_info);
return NULL;
}
+ INIT_LIST_HEAD(&fs_info->fs_devices->devices);
+
fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
GFP_KERNEL);
if (!fs_info->super_copy) {
@@ -107,37 +130,10 @@
return NULL;
}
+ btrfs_init_fs_info(fs_info);
+
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
-
- if (init_srcu_struct(&fs_info->subvol_srcu)) {
- kfree(fs_info->fs_devices);
- kfree(fs_info->super_copy);
- kfree(fs_info);
- return NULL;
- }
-
- spin_lock_init(&fs_info->buffer_lock);
- spin_lock_init(&fs_info->qgroup_lock);
- spin_lock_init(&fs_info->super_lock);
- spin_lock_init(&fs_info->fs_roots_radix_lock);
- mutex_init(&fs_info->qgroup_ioctl_lock);
- mutex_init(&fs_info->qgroup_rescan_lock);
- rwlock_init(&fs_info->tree_mod_log_lock);
- fs_info->running_transaction = NULL;
- fs_info->qgroup_tree = RB_ROOT;
- fs_info->qgroup_ulist = NULL;
- atomic64_set(&fs_info->tree_mod_seq, 0);
- INIT_LIST_HEAD(&fs_info->dirty_qgroups);
- INIT_LIST_HEAD(&fs_info->dead_roots);
- INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
- IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
- extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
- IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
- fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
@@ -149,6 +145,7 @@
{
struct radix_tree_iter iter;
void **slot;
+ struct btrfs_device *dev, *tmp;
if (!fs_info)
return;
@@ -179,10 +176,16 @@
}
spin_unlock(&fs_info->buffer_lock);
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
+ dev_list) {
+ btrfs_free_dummy_device(dev);
+ }
btrfs_free_qgroup_config(fs_info);
btrfs_free_fs_roots(fs_info);
- cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->super_copy);
+ btrfs_check_leaked_roots(fs_info);
+ btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->fs_devices);
kfree(fs_info);
}
@@ -194,18 +197,14 @@
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
- if (root->node) {
- /* One for allocate_extent_buffer */
- free_extent_buffer(root->node);
- }
- kfree(root);
+ btrfs_put_root(root);
}
-struct btrfs_block_group_cache *
+struct btrfs_block_group *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
unsigned long length)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = kzalloc(sizeof(*cache), GFP_KERNEL);
if (!cache)
@@ -217,9 +216,8 @@
return NULL;
}
- cache->key.objectid = 0;
- cache->key.offset = length;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->start = 0;
+ cache->length = length;
cache->full_stripe_len = fs_info->sectorsize;
cache->fs_info = fs_info;
@@ -232,7 +230,7 @@
return cache;
}
-void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
{
if (!cache)
return;
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index ee277bb..7a2d7ff 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -41,11 +41,12 @@
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_free_dummy_root(struct btrfs_root *root);
-struct btrfs_block_group_cache *
+struct btrfs_block_group *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length);
-void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
+struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info);
#else
static inline int btrfs_run_sanity_tests(void)
{
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index a1b9f9b..df54cdf 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -60,8 +60,7 @@
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, &key, &value_len, 1);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 4a7f796..57379e9 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -6,6 +6,9 @@
#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../block-group.h"
static void free_extent_map_tree(struct extent_map_tree *em_tree)
{
@@ -437,11 +440,153 @@
return ret;
}
+struct rmap_test_vector {
+ u64 raid_type;
+ u64 physical_start;
+ u64 data_stripe_size;
+ u64 num_data_stripes;
+ u64 num_stripes;
+ /* Assume we won't have more than 5 physical stripes */
+ u64 data_stripe_phys_start[5];
+ bool expected_mapped_addr;
+ /* Physical to logical addresses */
+ u64 mapped_logical[5];
+};
+
+static int test_rmap_block(struct btrfs_fs_info *fs_info,
+ struct rmap_test_vector *test)
+{
+ struct extent_map *em;
+ struct map_lookup *map = NULL;
+ u64 *logical = NULL;
+ int i, out_ndaddrs, out_stripe_len;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
+ if (!map) {
+ kfree(em);
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+ /* Start at 4GiB logical address */
+ em->start = SZ_4G;
+ em->len = test->data_stripe_size * test->num_data_stripes;
+ em->block_len = em->len;
+ em->orig_block_len = test->data_stripe_size;
+ em->map_lookup = map;
+
+ map->num_stripes = test->num_stripes;
+ map->stripe_len = BTRFS_STRIPE_LEN;
+ map->type = test->raid_type;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *dev = btrfs_alloc_dummy_device(fs_info);
+
+ if (IS_ERR(dev)) {
+ test_err("cannot allocate device");
+ ret = PTR_ERR(dev);
+ goto out;
+ }
+ map->stripes[i].dev = dev;
+ map->stripes[i].physical = test->data_stripe_phys_start[i];
+ }
+
+ write_lock(&fs_info->mapping_tree.lock);
+ ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
+ write_unlock(&fs_info->mapping_tree.lock);
+ if (ret) {
+ test_err("error adding block group mapping to mapping tree");
+ goto out_free;
+ }
+
+ ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ &logical, &out_ndaddrs, &out_stripe_len);
+ if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
+ test_err("didn't rmap anything but expected %d",
+ test->expected_mapped_addr);
+ goto out;
+ }
+
+ if (out_stripe_len != BTRFS_STRIPE_LEN) {
+ test_err("calculated stripe length doesn't match");
+ goto out;
+ }
+
+ if (out_ndaddrs != test->expected_mapped_addr) {
+ for (i = 0; i < out_ndaddrs; i++)
+ test_msg("mapped %llu", logical[i]);
+ test_err("unexpected number of mapped addresses: %d", out_ndaddrs);
+ goto out;
+ }
+
+ for (i = 0; i < out_ndaddrs; i++) {
+ if (logical[i] != test->mapped_logical[i]) {
+ test_err("unexpected logical address mapped");
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ write_lock(&fs_info->mapping_tree.lock);
+ remove_extent_mapping(&fs_info->mapping_tree, em);
+ write_unlock(&fs_info->mapping_tree.lock);
+ /* For us */
+ free_extent_map(em);
+out_free:
+ /* For the tree */
+ free_extent_map(em);
+ kfree(logical);
+ return ret;
+}
+
int btrfs_test_extent_map(void)
{
struct btrfs_fs_info *fs_info = NULL;
struct extent_map_tree *em_tree;
- int ret = 0;
+ int ret = 0, i;
+ struct rmap_test_vector rmap_tests[] = {
+ {
+ /*
+ * Test a chunk with 2 data stripes one of which
+ * interesects the physical address of the super block
+ * is correctly recognised.
+ */
+ .raid_type = BTRFS_BLOCK_GROUP_RAID1,
+ .physical_start = SZ_64M - SZ_4M,
+ .data_stripe_size = SZ_256M,
+ .num_data_stripes = 2,
+ .num_stripes = 2,
+ .data_stripe_phys_start =
+ {SZ_64M - SZ_4M, SZ_64M - SZ_4M + SZ_256M},
+ .expected_mapped_addr = true,
+ .mapped_logical= {SZ_4G + SZ_4M}
+ },
+ {
+ /*
+ * Test that out-of-range physical addresses are
+ * ignored
+ */
+
+ /* SINGLE chunk type */
+ .raid_type = 0,
+ .physical_start = SZ_4G,
+ .data_stripe_size = SZ_256M,
+ .num_data_stripes = 1,
+ .num_stripes = 1,
+ .data_stripe_phys_start = {SZ_256M},
+ .expected_mapped_addr = false,
+ .mapped_logical = {0}
+ }
+ };
test_msg("running extent_map tests");
@@ -474,6 +619,13 @@
goto out;
ret = test_case_4(fs_info, em_tree);
+ test_msg("running rmap tests");
+ for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) {
+ ret = test_rmap_block(fs_info, &rmap_tests[i]);
+ if (ret)
+ goto out;
+ }
+
out:
kfree(em_tree);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 43ec706..aebdf23 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -17,7 +17,7 @@
* entry and remove space from either end and the middle, and make sure we can
* remove space that covers adjacent extent entries.
*/
-static int test_extents(struct btrfs_block_group_cache *cache)
+static int test_extents(struct btrfs_block_group *cache)
{
int ret = 0;
@@ -87,8 +87,7 @@
return 0;
}
-static int test_bitmaps(struct btrfs_block_group_cache *cache,
- u32 sectorsize)
+static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize)
{
u64 next_bitmap_offset;
int ret;
@@ -156,7 +155,7 @@
}
/* This is the high grade jackassery */
-static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
+static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
u32 sectorsize)
{
u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
@@ -331,7 +330,7 @@
/* Used by test_steal_space_from_bitmap_to_extent(). */
static int
-check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
+check_num_extents_and_bitmaps(const struct btrfs_block_group *cache,
const int num_extents,
const int num_bitmaps)
{
@@ -351,7 +350,7 @@
}
/* Used by test_steal_space_from_bitmap_to_extent(). */
-static int check_cache_empty(struct btrfs_block_group_cache *cache)
+static int check_cache_empty(struct btrfs_block_group *cache)
{
u64 offset;
u64 max_extent_size;
@@ -393,7 +392,7 @@
* requests.
*/
static int
-test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
u32 sectorsize)
{
int ret;
@@ -829,7 +828,7 @@
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 6e774d0..2c783d2 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -18,7 +18,7 @@
static int __check_free_space_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct free_space_extent * const extents,
unsigned int num_extents)
@@ -48,7 +48,7 @@
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
if (path->slots[0] != 0)
goto invalid;
- end = cache->key.objectid + cache->key.offset;
+ end = cache->start + cache->length;
i = 0;
while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -60,8 +60,6 @@
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
- if (i >= num_extents)
- goto invalid;
if (i >= num_extents ||
extent_start != extents[i].start ||
offset - extent_start != extents[i].length)
@@ -107,7 +105,7 @@
static int check_free_space_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct free_space_extent * const extents,
unsigned int num_extents)
@@ -150,12 +148,12 @@
static int test_empty_block_group(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, cache->key.offset},
+ {cache->start, cache->length},
};
return check_free_space_extents(trans, fs_info, cache, path,
@@ -164,7 +162,7 @@
static int test_remove_all(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
@@ -172,8 +170,8 @@
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start,
+ cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -185,18 +183,17 @@
static int test_remove_beginning(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid + alignment,
- cache->key.offset - alignment},
+ {cache->start + alignment, cache->length - alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid, alignment);
+ cache->start, alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -209,19 +206,18 @@
static int test_remove_end(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, cache->key.offset - alignment},
+ {cache->start, cache->length - alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid +
- cache->key.offset - alignment,
- alignment);
+ cache->start + cache->length - alignment,
+ alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -233,19 +229,18 @@
static int test_remove_middle(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, alignment},
- {cache->key.objectid + 2 * alignment,
- cache->key.offset - 2 * alignment},
+ {cache->start, alignment},
+ {cache->start + 2 * alignment, cache->length - 2 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
+ cache->start + alignment,
alignment);
if (ret) {
test_err("could not remove free space");
@@ -258,24 +253,23 @@
static int test_merge_left(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, 2 * alignment},
+ {cache->start, 2 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
alignment);
if (ret) {
test_err("could not add free space");
@@ -283,7 +277,7 @@
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
+ cache->start + alignment,
alignment);
if (ret) {
test_err("could not add free space");
@@ -296,25 +290,24 @@
static int test_merge_right(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid + alignment, 2 * alignment},
+ {cache->start + alignment, 2 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 2 * alignment,
+ cache->start + 2 * alignment,
alignment);
if (ret) {
test_err("could not add free space");
@@ -322,7 +315,7 @@
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
+ cache->start + alignment,
alignment);
if (ret) {
test_err("could not add free space");
@@ -335,24 +328,23 @@
static int test_merge_both(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, 3 * alignment},
+ {cache->start, 3 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
alignment);
if (ret) {
test_err("could not add free space");
@@ -360,16 +352,14 @@
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 2 * alignment,
- alignment);
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
- alignment);
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -381,26 +371,25 @@
static int test_merge_none(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, alignment},
- {cache->key.objectid + 2 * alignment, alignment},
- {cache->key.objectid + 4 * alignment, alignment},
+ {cache->start, alignment},
+ {cache->start + 2 * alignment, alignment},
+ {cache->start + 4 * alignment, alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
alignment);
if (ret) {
test_err("could not add free space");
@@ -408,16 +397,14 @@
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 4 * alignment,
- alignment);
+ cache->start + 4 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 2 * alignment,
- alignment);
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -429,7 +416,7 @@
typedef int (*test_func_t)(struct btrfs_trans_handle *,
struct btrfs_fs_info *,
- struct btrfs_block_group_cache *,
+ struct btrfs_block_group *,
struct btrfs_path *,
u32 alignment);
@@ -438,7 +425,7 @@
{
struct btrfs_fs_info *fs_info;
struct btrfs_root *root = NULL;
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
struct btrfs_trans_handle trans;
struct btrfs_path *path = NULL;
int ret;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 09ecf7d..0402206 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,8 +33,7 @@
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,8 +63,7 @@
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
}
/*
@@ -263,7 +261,7 @@
/* First with no extents */
BTRFS_I(inode)->root = root;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize);
if (IS_ERR(em)) {
em = NULL;
test_err("got an error when we shouldn't have");
@@ -283,7 +281,7 @@
*/
setup_file_extents(root, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -305,7 +303,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -333,7 +331,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -356,7 +354,7 @@
free_extent_map(em);
/* Regular extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -384,7 +382,7 @@
free_extent_map(em);
/* The next 3 are split extents */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -413,7 +411,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -435,7 +433,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -469,7 +467,7 @@
free_extent_map(em);
/* Prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -498,7 +496,7 @@
free_extent_map(em);
/* The next 3 are a half written prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -528,7 +526,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -561,7 +559,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -596,7 +594,7 @@
free_extent_map(em);
/* Now for the compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -630,7 +628,7 @@
free_extent_map(em);
/* Split compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -665,7 +663,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -692,7 +690,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -727,8 +725,7 @@
free_extent_map(em);
/* A hole between regular extents but no hole extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6,
- sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -755,7 +752,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -788,7 +785,7 @@
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -872,7 +869,7 @@
insert_inode_item_key(root);
insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -894,8 +891,7 @@
}
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize,
- 2 * sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -953,11 +949,10 @@
}
BTRFS_I(inode)->root = root;
- btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
- ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
- NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
+ BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -970,7 +965,7 @@
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE,
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
0, NULL);
if (ret) {
@@ -988,7 +983,8 @@
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1001,7 +997,7 @@
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1)
+ sectorsize - 1,
0, NULL);
@@ -1019,7 +1015,7 @@
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
0, NULL);
@@ -1037,7 +1033,7 @@
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
@@ -1055,7 +1051,8 @@
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1071,7 +1068,7 @@
* Refill the hole again just for good measure, because I thought it
* might fail and I'd rather satisfy my paranoia at this point.
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
@@ -1087,7 +1084,8 @@
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1102,7 +1100,8 @@
out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ac035a6..ce1ca8e 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -507,6 +507,7 @@
test_err("couldn't insert fs root %d", ret);
goto out;
}
+ btrfs_put_root(tmp_root);
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
@@ -521,6 +522,7 @@
test_err("couldn't insert fs root %d", ret);
goto out;
}
+ btrfs_put_root(tmp_root);
test_msg("running qgroup tests");
ret = test_no_shared_qgroup(root, sectorsize, nodesize);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e6cb95b..8daa9e4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -25,6 +25,77 @@
#define BTRFS_ROOT_TRANS_TAG 0
+/*
+ * Transaction states and transitions
+ *
+ * No running transaction (fs tree blocks are not modified)
+ * |
+ * | To next stage:
+ * | Call start_transaction() variants. Except btrfs_join_transaction_nostart().
+ * V
+ * Transaction N [[TRANS_STATE_RUNNING]]
+ * |
+ * | New trans handles can be attached to transaction N by calling all
+ * | start_transaction() variants.
+ * |
+ * | To next stage:
+ * | Call btrfs_commit_transaction() on any trans handle attached to
+ * | transaction N
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_START]]
+ * |
+ * | Will wait for previous running transaction to completely finish if there
+ * | is one
+ * |
+ * | Then one of the following happes:
+ * | - Wait for all other trans handle holders to release.
+ * | The btrfs_commit_transaction() caller will do the commit work.
+ * | - Wait for current transaction to be committed by others.
+ * | Other btrfs_commit_transaction() caller will do the commit work.
+ * |
+ * | At this stage, only btrfs_join_transaction*() variants can attach
+ * | to this running transaction.
+ * | All other variants will wait for current one to finish and attach to
+ * | transaction N+1.
+ * |
+ * | To next stage:
+ * | Caller is chosen to commit transaction N, and all other trans handle
+ * | haven been released.
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_DOING]]
+ * |
+ * | The heavy lifting transaction work is started.
+ * | From running delayed refs (modifying extent tree) to creating pending
+ * | snapshots, running qgroups.
+ * | In short, modify supporting trees to reflect modifications of subvolume
+ * | trees.
+ * |
+ * | At this stage, all start_transaction() calls will wait for this
+ * | transaction to finish and attach to transaction N+1.
+ * |
+ * | To next stage:
+ * | Until all supporting trees are updated.
+ * V
+ * Transaction N [[TRANS_STATE_UNBLOCKED]]
+ * | Transaction N+1
+ * | All needed trees are modified, thus we only [[TRANS_STATE_RUNNING]]
+ * | need to write them back to disk and update |
+ * | super blocks. |
+ * | |
+ * | At this stage, new transaction is allowed to |
+ * | start. |
+ * | All new start_transaction() calls will be |
+ * | attached to transid N+1. |
+ * | |
+ * | To next stage: |
+ * | Until all tree blocks are super blocks are |
+ * | written to block devices |
+ * V |
+ * Transaction N [[TRANS_STATE_COMPLETED]] V
+ * All tree blocks and super blocks are written. Transaction N+1
+ * This transaction is finished and all its [[TRANS_STATE_COMMIT_START]]
+ * data structures will be cleaned up. | Life goes on
+ */
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
[TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
@@ -65,13 +136,13 @@
* discard the physical locations of the block groups.
*/
while (!list_empty(&transaction->deleted_bgs)) {
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = list_first_entry(&transaction->deleted_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
bg_list);
list_del_init(&cache->bg_list);
- btrfs_put_block_group_trimming(cache);
+ btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
}
WARN_ON(!list_empty(&transaction->dev_update_list));
@@ -84,6 +155,7 @@
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
+ struct btrfs_caching_control *caching_ctl, *next;
down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
@@ -109,6 +181,45 @@
spin_lock(&cur_trans->dropped_roots_lock);
}
spin_unlock(&cur_trans->dropped_roots_lock);
+
+ /*
+ * We have to update the last_byte_to_unpin under the commit_root_sem,
+ * at the same time we swap out the commit roots.
+ *
+ * This is because we must have a real view of the last spot the caching
+ * kthreads were while caching. Consider the following views of the
+ * extent tree for a block group
+ *
+ * commit root
+ * +----+----+----+----+----+----+----+
+ * |\\\\| |\\\\|\\\\| |\\\\|\\\\|
+ * +----+----+----+----+----+----+----+
+ * 0 1 2 3 4 5 6 7
+ *
+ * new commit root
+ * +----+----+----+----+----+----+----+
+ * | | | |\\\\| | |\\\\|
+ * +----+----+----+----+----+----+----+
+ * 0 1 2 3 4 5 6 7
+ *
+ * If the cache_ctl->progress was at 3, then we are only allowed to
+ * unpin [0,1) and [2,3], because the caching thread has already
+ * processed those extents. We are not allowed to unpin [5,6), because
+ * the caching thread will re-start it's search from 3, and thus find
+ * the hole from [4,6) to add to the free space cache.
+ */
+ list_for_each_entry_safe(caching_ctl, next,
+ &fs_info->caching_block_groups, list) {
+ struct btrfs_block_group *cache = caching_ctl->block_group;
+
+ if (btrfs_block_group_done(cache)) {
+ cache->last_byte_to_unpin = (u64)-1;
+ list_del_init(&caching_ctl->list);
+ btrfs_put_caching_control(caching_ctl);
+ } else {
+ cache->last_byte_to_unpin = caching_ctl->progress;
+ }
+ }
up_write(&fs_info->commit_root_sem);
}
@@ -151,7 +262,7 @@
WARN_ON_ONCE(!list_empty(&trans->new_bgs));
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
- trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved, NULL);
trans->chunk_bytes_reserved = 0;
}
@@ -221,6 +332,8 @@
}
cur_trans->fs_info = fs_info;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ init_waitqueue_head(&cur_trans->pending_wait);
atomic_set(&cur_trans->num_writers, 1);
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
@@ -266,6 +379,8 @@
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
+ extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
+ IO_TREE_FS_PINNED_EXTENTS, NULL);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -276,10 +391,10 @@
}
/*
- * this does all the record keeping required to make sure that a reference
- * counted root is properly recorded in a given transaction. This is required
- * to make sure the old root from before we joined the transaction is deleted
- * when the transaction commits
+ * This does all the record keeping required to make sure that a shareable root
+ * is properly recorded in a given transaction. This is required to make sure
+ * the old root from before we joined the transaction is deleted when the
+ * transaction commits.
*/
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -287,7 +402,7 @@
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
root->last_trans < trans->transid) || force) {
WARN_ON(root == fs_info->extent_root);
WARN_ON(!force && root->commit_root != root->node);
@@ -366,7 +481,7 @@
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
/*
@@ -431,7 +546,7 @@
struct btrfs_fs_info *fs_info = root->fs_info;
if (!fs_info->reloc_ctl ||
- !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
root->reloc_root)
return false;
@@ -630,7 +745,7 @@
alloc_fail:
if (num_bytes)
btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
- num_bytes);
+ num_bytes, NULL);
reserve_fail:
btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -657,7 +772,7 @@
true);
}
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
+struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
BTRFS_RESERVE_NO_FLUSH, true);
@@ -820,7 +935,7 @@
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
+ trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
}
@@ -983,7 +1098,7 @@
return werr;
}
-int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
+static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
bool errors = false;
@@ -1109,7 +1224,7 @@
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
- 0, &eb);
+ 0, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -1189,8 +1304,10 @@
struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
- if (list_empty(&root->root_list))
+ if (list_empty(&root->root_list)) {
+ btrfs_grab_root(root);
list_add_tail(&root->root_list, &fs_info->dead_roots);
+ }
spin_unlock(&fs_info->trans_lock);
}
@@ -1407,7 +1524,6 @@
u64 index = 0;
u64 objectid;
u64 root_flags;
- uuid_le new_uuid;
ASSERT(pending->path);
path = pending->path;
@@ -1500,8 +1616,7 @@
btrfs_set_root_generation_v2(new_root_item,
trans->transid);
- uuid_le_gen(&new_uuid);
- memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(new_root_item->uuid);
memcpy(new_root_item->parent_uuid, root->root_item.uuid,
BTRFS_UUID_SIZE);
if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
@@ -1517,7 +1632,8 @@
btrfs_set_root_otransid(new_root_item, trans->transid);
old = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
@@ -1563,9 +1679,10 @@
}
key.offset = (u64)-1;
- pending->snap = btrfs_read_fs_root_no_name(fs_info, &key);
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
+ pending->snap = NULL;
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1612,7 +1729,8 @@
btrfs_abort_transaction(trans, ret);
goto fail;
}
- ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL,
+ ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL,
objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1834,7 +1952,6 @@
*/
BUG_ON(list_empty(&cur_trans->list));
- list_del_init(&cur_trans->list);
if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
@@ -1843,6 +1960,17 @@
spin_lock(&fs_info->trans_lock);
}
+
+ /*
+ * Now that we know no one else is still using the transaction we can
+ * remove the transaction from the list of transactions. This avoids
+ * the transaction kthread from cleaning up the transaction while some
+ * other task is still using it, which could result in a use-after-free
+ * on things like log trees, as it forces the transaction kthread to
+ * wait for this transaction to be cleaned up by us.
+ */
+ list_del_init(&cur_trans->list);
+
spin_unlock(&fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, fs_info);
@@ -1873,7 +2001,7 @@
static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group *block_group, *tmp;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -1947,6 +2075,8 @@
struct btrfs_transaction *prev_trans = NULL;
int ret;
+ ASSERT(refcount_read(&trans->use_count) == 1);
+
/*
* Some places just start a transaction to commit it. We need to make
* sure that if this commit fails that the abort code actually marks the
@@ -2091,6 +2221,14 @@
btrfs_wait_delalloc_flush(trans);
+ /*
+ * Wait for all ordered extents started by a fast fsync that joined this
+ * transaction. Otherwise if this transaction commits before the ordered
+ * extents complete we lose logged data after a power failure.
+ */
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
+
btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
@@ -2120,10 +2258,8 @@
* core function of the snapshot creation.
*/
ret = create_pending_snapshots(trans);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
/*
* We insert the dir indexes of the snapshots and update the inode
@@ -2136,16 +2272,12 @@
* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
/*
* make sure none of the code above managed to slip in a
@@ -2171,11 +2303,8 @@
mutex_lock(&fs_info->tree_log_mutex);
ret = commit_fs_roots(trans);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_tree_log;
/*
* Since the transaction is done, we can apply the pending changes
@@ -2193,29 +2322,20 @@
* new delayed refs. Must handle them or qgroup can be wrong.
*/
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_tree_log;
/*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
ret = btrfs_qgroup_account_extents(trans);
- if (ret < 0) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret < 0)
+ goto unlock_tree_log;
ret = commit_cowonly_roots(trans);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_tree_log;
/*
* The tasks which save the space cache and inode cache may also
@@ -2223,13 +2343,9 @@
*/
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
+ goto unlock_tree_log;
}
- btrfs_prepare_extent_commit(fs_info);
-
cur_trans = fs_info->running_transaction;
btrfs_set_root_node(&fs_info->tree_root->root_item,
@@ -2272,6 +2388,10 @@
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction");
+ /*
+ * reloc_mutex has been unlocked, tree_log_mutex is still held
+ * but we can't jump to unlock_tree_log causing double unlock
+ */
mutex_unlock(&fs_info->tree_log_mutex);
goto scrub_continue;
}
@@ -2319,6 +2439,10 @@
return ret;
+unlock_tree_log:
+ mutex_unlock(&fs_info->tree_log_mutex);
+unlock_reloc:
+ mutex_unlock(&fs_info->reloc_mutex);
scrub_continue:
btrfs_scrub_continue(fs_info);
cleanup_transaction:
@@ -2362,13 +2486,18 @@
btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
+ if (root->ino_cache_inode) {
+ iput(root->ino_cache_inode);
+ root->ino_cache_inode = NULL;
+ }
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- ret = btrfs_drop_snapshot(root, NULL, 0, 0);
+ ret = btrfs_drop_snapshot(root, 0, 0);
else
- ret = btrfs_drop_snapshot(root, NULL, 1, 0);
+ ret = btrfs_drop_snapshot(root, 1, 0);
+ btrfs_put_root(root);
return (ret < 0) ? 0 : 1;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index cbede32..f73654d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -71,6 +71,7 @@
*/
struct list_head io_bgs;
struct list_head dropped_roots;
+ struct extent_io_tree pinned_extents;
/*
* we need to make sure block group deletion doesn't race with
@@ -84,6 +85,13 @@
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * Number of ordered extents the transaction must wait for before
+ * committing. These are ordered extents started by a fast fsync.
+ */
+ atomic_t pending_ordered;
+ wait_queue_head_t pending_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -104,6 +112,7 @@
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
+#define BTRFS_DIO_SYNC_STUB ((void *)2)
struct btrfs_trans_handle {
u64 transid;
@@ -150,18 +159,20 @@
struct btrfs_block_rsv block_rsv;
/* extra metadata reservation for relocation */
int error;
+ /* Preallocated anonymous block device number */
+ dev_t anon_dev;
bool readonly;
struct list_head list;
};
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_trans = trans->transaction->transid;
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans - 1;
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ inode->last_trans = trans->transaction->transid;
+ inode->last_sub_trans = inode->root->log_transid;
+ inode->last_log_commit = inode->last_sub_trans - 1;
+ spin_unlock(&inode->lock);
}
/*
@@ -194,7 +205,7 @@
struct btrfs_root *root,
unsigned int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
@@ -214,8 +225,6 @@
struct btrfs_root *root);
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark);
-int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *dirty_pages);
int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 368c43c..32f1b15 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -23,6 +23,7 @@
#include "disk-io.h"
#include "compression.h"
#include "volumes.h"
+#include "misc.h"
/*
* Error message should follow the following format:
@@ -124,6 +125,74 @@
return end;
}
+/*
+ * Customized report for dir_item, the only new important information is
+ * key->objectid, which represents inode number
+ */
+__printf(3, 4)
+__cold
+static void dir_item_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, &vaf);
+ va_end(args);
+}
+
+/*
+ * This functions checks prev_key->objectid, to ensure current key and prev_key
+ * share the same objectid as inode number.
+ *
+ * This is to detect missing INODE_ITEM in subvolume trees.
+ *
+ * Return true if everything is OK or we don't need to check.
+ * Return false if anything is wrong.
+ */
+static bool check_prev_ino(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
+{
+ /* No prev key, skip check */
+ if (slot == 0)
+ return true;
+
+ /* Only these key->types needs to be checked */
+ ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
+ key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_DIR_INDEX_KEY ||
+ key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_EXTENT_DATA_KEY);
+
+ /*
+ * Only subvolume trees along with their reloc trees need this check.
+ * Things like log tree doesn't follow this ino requirement.
+ */
+ if (!is_fstree(btrfs_header_owner(leaf)))
+ return true;
+
+ if (key->objectid == prev_key->objectid)
+ return true;
+
+ /* Error found */
+ dir_item_err(leaf, slot,
+ "invalid previous key objectid, have %llu expect %llu",
+ prev_key->objectid, key->objectid);
+ return false;
+}
static int check_extent_data_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot,
struct btrfs_key *prev_key)
@@ -141,13 +210,33 @@
return -EUCLEAN;
}
+ /*
+ * Previous key must have the same key->objectid (ino).
+ * It can be XATTR_ITEM, INODE_ITEM or just another EXTENT_DATA.
+ * But if objectids mismatch, it means we have a missing
+ * INODE_ITEM.
+ */
+ if (!check_prev_ino(leaf, key, slot, prev_key))
+ return -EUCLEAN;
+
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
+ /*
+ * Make sure the item contains at least inline header, so the file
+ * extent type is not some garbage.
+ */
+ if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) {
+ file_extent_err(leaf, slot,
+ "invalid item size, have %u expect [%zu, %u)",
+ item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START,
+ SZ_4K);
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) {
file_extent_err(leaf, slot,
"invalid type for file extent, have %u expect range [0, %u]",
btrfs_file_extent_type(leaf, fi),
- BTRFS_FILE_EXTENT_TYPES);
+ BTRFS_NR_FILE_EXTENT_TYPES - 1);
return -EUCLEAN;
}
@@ -155,11 +244,11 @@
* Support for new compression/encryption must introduce incompat flag,
* and must be caught in open_ctree().
*/
- if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
+ if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) {
file_extent_err(leaf, slot,
"invalid compression for file extent, have %u expect range [0, %u]",
btrfs_file_extent_compression(leaf, fi),
- BTRFS_COMPRESS_TYPES);
+ BTRFS_NR_COMPRESS_TYPES - 1);
return -EUCLEAN;
}
if (btrfs_file_extent_encryption(leaf, fi)) {
@@ -284,50 +373,125 @@
return 0;
}
-/*
- * Customized reported for dir_item, only important new info is key->objectid,
- * which represents inode number
- */
-__printf(3, 4)
-__cold
-static void dir_item_err(const struct extent_buffer *eb, int slot,
- const char *fmt, ...)
+/* Inode item error output has the same format as dir_item_err() */
+#define inode_item_err(eb, slot, fmt, ...) \
+ dir_item_err(eb, slot, fmt, __VA_ARGS__)
+
+static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
{
- const struct btrfs_fs_info *fs_info = eb->fs_info;
- struct btrfs_key key;
- struct va_format vaf;
- va_list args;
+ struct btrfs_key item_key;
+ bool is_inode_item;
- btrfs_item_key_to_cpu(eb, &key, slot);
- va_start(args, fmt);
+ btrfs_item_key_to_cpu(leaf, &item_key, slot);
+ is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY);
- vaf.fmt = fmt;
- vaf.va = &args;
+ /* For XATTR_ITEM, location key should be all 0 */
+ if (item_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (key->type != 0 || key->objectid != 0 || key->offset != 0)
+ return -EUCLEAN;
+ return 0;
+ }
- btrfs_crit(fs_info,
- "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
- btrfs_header_level(eb) == 0 ? "leaf" : "node",
- btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
- key.objectid, &vaf);
- va_end(args);
+ if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
+ key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
+ key->objectid != BTRFS_FREE_INO_OBJECTID) {
+ if (is_inode_item) {
+ generic_err(leaf, slot,
+ "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
+ key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
+ BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID,
+ BTRFS_FREE_INO_OBJECTID);
+ } else {
+ dir_item_err(leaf, slot,
+"invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
+ key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
+ BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID,
+ BTRFS_FREE_INO_OBJECTID);
+ }
+ return -EUCLEAN;
+ }
+ if (key->offset != 0) {
+ if (is_inode_item)
+ inode_item_err(leaf, slot,
+ "invalid key offset: has %llu expect 0",
+ key->offset);
+ else
+ dir_item_err(leaf, slot,
+ "invalid location key offset:has %llu expect 0",
+ key->offset);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
+{
+ struct btrfs_key item_key;
+ bool is_root_item;
+
+ btrfs_item_key_to_cpu(leaf, &item_key, slot);
+ is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);
+
+ /* No such tree id */
+ if (key->objectid == 0) {
+ if (is_root_item)
+ generic_err(leaf, slot, "invalid root id 0");
+ else
+ dir_item_err(leaf, slot,
+ "invalid location key root id 0");
+ return -EUCLEAN;
+ }
+
+ /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
+ if (!is_fstree(key->objectid) && !is_root_item) {
+ dir_item_err(leaf, slot,
+ "invalid location key objectid, have %llu expect [%llu, %llu]",
+ key->objectid, BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID);
+ return -EUCLEAN;
+ }
+
+ /*
+ * ROOT_ITEM with non-zero offset means this is a snapshot, created at
+ * @offset transid.
+ * Furthermore, for location key in DIR_ITEM, its offset is always -1.
+ *
+ * So here we only check offset for reloc tree whose key->offset must
+ * be a valid tree.
+ */
+ if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
+ generic_err(leaf, slot, "invalid root id 0 for reloc tree");
+ return -EUCLEAN;
+ }
+ return 0;
}
static int check_dir_item(struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_dir_item *di;
u32 item_size = btrfs_item_size_nr(leaf, slot);
u32 cur = 0;
+ if (!check_prev_ino(leaf, key, slot, prev_key))
+ return -EUCLEAN;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
while (cur < item_size) {
+ struct btrfs_key location_key;
u32 name_len;
u32 data_len;
u32 max_name_len;
u32 total_size;
u32 name_hash;
u8 dir_type;
+ int ret;
/* header itself should not cross item boundary */
if (cur + sizeof(*di) > item_size) {
@@ -337,6 +501,25 @@
return -EUCLEAN;
}
+ /* Location key check */
+ btrfs_dir_item_key_to_cpu(leaf, di, &location_key);
+ if (location_key.type == BTRFS_ROOT_ITEM_KEY) {
+ ret = check_root_key(leaf, &location_key, slot);
+ if (ret < 0)
+ return ret;
+ } else if (location_key.type == BTRFS_INODE_ITEM_KEY ||
+ location_key.type == 0) {
+ ret = check_inode_key(leaf, &location_key, slot);
+ if (ret < 0)
+ return ret;
+ } else {
+ dir_item_err(leaf, slot,
+ "invalid location key type, have %u, expect %u or %u",
+ location_key.type, BTRFS_ROOT_ITEM_KEY,
+ BTRFS_INODE_ITEM_KEY);
+ return -EUCLEAN;
+ }
+
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
if (dir_type >= BTRFS_FT_MAX) {
@@ -473,23 +656,23 @@
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
- if (btrfs_block_group_chunk_objectid(&bgi) !=
+ if (btrfs_stack_block_group_chunk_objectid(&bgi) !=
BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
block_group_err(leaf, slot,
"invalid block group chunk objectid, have %llu expect %llu",
- btrfs_block_group_chunk_objectid(&bgi),
+ btrfs_stack_block_group_chunk_objectid(&bgi),
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
return -EUCLEAN;
}
- if (btrfs_block_group_used(&bgi) > key->offset) {
+ if (btrfs_stack_block_group_used(&bgi) > key->offset) {
block_group_err(leaf, slot,
"invalid block group used, have %llu expect [0, %llu)",
- btrfs_block_group_used(&bgi), key->offset);
+ btrfs_stack_block_group_used(&bgi), key->offset);
return -EUCLEAN;
}
- flags = btrfs_block_group_flags(&bgi);
+ flags = btrfs_stack_block_group_flags(&bgi);
if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
block_group_err(leaf, slot,
"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
@@ -648,7 +831,7 @@
return -EUCLEAN;
}
- if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) {
chunk_err(leaf, chunk, logical,
"invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
@@ -764,6 +947,7 @@
struct btrfs_key *key, int slot)
{
struct btrfs_dev_item *ditem;
+ const u32 item_size = btrfs_item_size_nr(leaf, slot);
if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
dev_item_err(leaf, slot,
@@ -771,6 +955,13 @@
key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
return -EUCLEAN;
}
+
+ if (unlikely(item_size != sizeof(*ditem))) {
+ dev_item_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*ditem));
+ return -EUCLEAN;
+ }
+
ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
if (btrfs_device_id(leaf, ditem) != key->offset) {
dev_item_err(leaf, slot,
@@ -799,10 +990,6 @@
return 0;
}
-/* Inode item error output has the same format as dir_item_err() */
-#define inode_item_err(fs_info, eb, slot, fmt, ...) \
- dir_item_err(eb, slot, fmt, __VA_ARGS__)
-
static int check_inode_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
@@ -810,40 +997,34 @@
struct btrfs_inode_item *iitem;
u64 super_gen = btrfs_super_generation(fs_info->super_copy);
u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
+ const u32 item_size = btrfs_item_size_nr(leaf, slot);
u32 mode;
+ int ret;
- if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
- key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
- key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
- key->objectid != BTRFS_FREE_INO_OBJECTID) {
- generic_err(leaf, slot,
- "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
- key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
- BTRFS_FIRST_FREE_OBJECTID,
- BTRFS_LAST_FREE_OBJECTID,
- BTRFS_FREE_INO_OBJECTID);
+ ret = check_inode_key(leaf, key, slot);
+ if (ret < 0)
+ return ret;
+
+ if (unlikely(item_size != sizeof(*iitem))) {
+ generic_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*iitem));
return -EUCLEAN;
}
- if (key->offset != 0) {
- inode_item_err(fs_info, leaf, slot,
- "invalid key offset: has %llu expect 0",
- key->offset);
- return -EUCLEAN;
- }
+
iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
/* Here we use super block generation + 1 to handle log tree */
if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
- inode_item_err(fs_info, leaf, slot,
- "invalid inode transid: has %llu expect [0, %llu]",
+ inode_item_err(leaf, slot,
+ "invalid inode generation: has %llu expect (0, %llu]",
btrfs_inode_generation(leaf, iitem),
super_gen + 1);
return -EUCLEAN;
}
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
- inode_item_err(fs_info, leaf, slot,
- "invalid inode generation: has %llu expect [0, %llu]",
+ inode_item_err(leaf, slot,
+ "invalid inode transid: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
return -EUCLEAN;
}
@@ -855,33 +1036,33 @@
*/
mode = btrfs_inode_mode(leaf, iitem);
if (mode & ~valid_mask) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"unknown mode bit detected: 0x%x",
mode & ~valid_mask);
return -EUCLEAN;
}
/*
- * S_IFMT is not bit mapped so we can't completely rely on is_power_of_2,
- * but is_power_of_2() can save us from checking FIFO/CHR/DIR/REG.
- * Only needs to check BLK, LNK and SOCKS
+ * S_IFMT is not bit mapped so we can't completely rely on
+ * is_power_of_2/has_single_bit_set, but it can save us from checking
+ * FIFO/CHR/DIR/REG. Only needs to check BLK, LNK and SOCKS
*/
- if (!is_power_of_2(mode & S_IFMT)) {
+ if (!has_single_bit_set(mode & S_IFMT)) {
if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"invalid mode: has 0%o expect valid S_IF* bit(s)",
mode & S_IFMT);
return -EUCLEAN;
}
}
if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"invalid nlink: has %u expect no more than 1 for dir",
btrfs_inode_nlink(leaf, iitem));
return -EUCLEAN;
}
if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"unknown flags detected: 0x%llx",
btrfs_inode_flags(leaf, iitem) &
~BTRFS_INODE_FLAG_MASK);
@@ -897,22 +1078,11 @@
struct btrfs_root_item ri = { 0 };
const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
BTRFS_ROOT_SUBVOL_DEAD;
+ int ret;
- /* No such tree id */
- if (key->objectid == 0) {
- generic_err(leaf, slot, "invalid root id 0");
- return -EUCLEAN;
- }
-
- /*
- * Some older kernel may create ROOT_ITEM with non-zero offset, so here
- * we only check offset for reloc tree whose key->offset must be a
- * valid tree.
- */
- if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
- generic_err(leaf, slot, "invalid root id 0 for reloc tree");
- return -EUCLEAN;
- }
+ ret = check_root_key(leaf, key, slot);
+ if (ret < 0)
+ return ret;
if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
@@ -1095,8 +1265,8 @@
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (!is_power_of_2(flags & (BTRFS_EXTENT_FLAG_DATA |
- BTRFS_EXTENT_FLAG_TREE_BLOCK))) {
+ if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
+ BTRFS_EXTENT_FLAG_TREE_BLOCK))) {
extent_err(leaf, slot,
"invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx",
flags, BTRFS_EXTENT_FLAG_DATA |
@@ -1303,6 +1473,58 @@
return 0;
}
+#define inode_ref_err(eb, slot, fmt, args...) \
+ inode_item_err(eb, slot, fmt, ##args)
+static int check_inode_ref(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
+{
+ struct btrfs_inode_ref *iref;
+ unsigned long ptr;
+ unsigned long end;
+
+ if (!check_prev_ino(leaf, key, slot, prev_key))
+ return -EUCLEAN;
+ /* namelen can't be 0, so item_size == sizeof() is also invalid */
+ if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) {
+ inode_ref_err(leaf, slot,
+ "invalid item size, have %u expect (%zu, %u)",
+ btrfs_item_size_nr(leaf, slot),
+ sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ return -EUCLEAN;
+ }
+
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ end = ptr + btrfs_item_size_nr(leaf, slot);
+ while (ptr < end) {
+ u16 namelen;
+
+ if (ptr + sizeof(iref) > end) {
+ inode_ref_err(leaf, slot,
+ "inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
+ ptr, end, sizeof(iref));
+ return -EUCLEAN;
+ }
+
+ iref = (struct btrfs_inode_ref *)ptr;
+ namelen = btrfs_inode_ref_name_len(leaf, iref);
+ if (ptr + sizeof(*iref) + namelen > end) {
+ inode_ref_err(leaf, slot,
+ "inode ref overflow, ptr %lu end %lu namelen %u",
+ ptr, end, namelen);
+ return -EUCLEAN;
+ }
+
+ /*
+ * NOTE: In theory we should record all found index numbers
+ * to find any duplicated indexes, but that will be too time
+ * consuming for inodes with too many hard links.
+ */
+ ptr += sizeof(*iref) + namelen;
+ }
+ return 0;
+}
+
/*
* Common point to switch the item-specific validation.
*/
@@ -1323,7 +1545,10 @@
case BTRFS_DIR_ITEM_KEY:
case BTRFS_DIR_INDEX_KEY:
case BTRFS_XATTR_ITEM_KEY:
- ret = check_dir_item(leaf, key, slot);
+ ret = check_dir_item(leaf, key, prev_key, slot);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ ret = check_inode_ref(leaf, key, prev_key, slot);
break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
ret = check_block_group_item(leaf, key, slot);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 5f9e2dd..d3f28b8 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -35,7 +35,7 @@
goto out;
}
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
goto out;
path = btrfs_alloc_path();
@@ -133,10 +133,9 @@
ret = 0;
}
done:
- if (ret != -EAGAIN) {
+ if (ret != -EAGAIN)
memset(&root->defrag_progress, 0,
sizeof(root->defrag_progress));
- root->defrag_trans_start = trans->transid;
- }
+
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8ea4b3d..62784b9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,8 @@
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
+#include "block-group.h"
+#include "space-info.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -94,8 +96,6 @@
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -213,9 +213,7 @@
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
- mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
}
/*
@@ -315,7 +313,7 @@
}
if (wc->pin)
- ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
+ ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
eb->len);
if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
@@ -507,13 +505,8 @@
*/
if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
- ino_size != 0) {
- struct btrfs_map_token token;
-
- btrfs_init_map_token(&token, dst_eb);
- btrfs_set_token_inode_size(dst_eb, dst_item,
- ino_size, &token);
- }
+ ino_size != 0)
+ btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
}
@@ -557,13 +550,9 @@
static noinline struct inode *read_one_inode(struct btrfs_root *root,
u64 objectid)
{
- struct btrfs_key key;
struct inode *inode;
- key.objectid = objectid;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(root->fs_info->sb, objectid, root);
if (IS_ERR(inode))
inode = NULL;
return inode;
@@ -836,6 +825,11 @@
goto out;
}
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
+ extent_end - start);
+ if (ret)
+ goto out;
+
inode_add_bytes(inode, nbytes);
update_inode:
ret = btrfs_update_inode(trans, root, inode);
@@ -900,9 +894,11 @@
}
/*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
+ * exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
@@ -911,29 +907,35 @@
{
struct btrfs_dir_item *di;
struct btrfs_key location;
- int match = 0;
+ int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ if (PTR_ERR(di) != -ENOENT)
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
- } else
+ } else {
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- if (location.objectid != objectid)
- goto out;
- } else
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
- match = 1;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
out:
btrfs_release_path(path);
- return match;
+ return ret;
}
/*
@@ -952,54 +954,32 @@
const char *name, int namelen)
{
struct btrfs_path *path;
- struct btrfs_inode_ref *ref;
- unsigned long ptr;
- unsigned long ptr_end;
- unsigned long name_ptr;
- int found_name_len;
- int item_size;
int ret;
- int match = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
- if (ret != 0)
+ if (ret < 0) {
goto out;
+ } else if (ret == 1) {
+ ret = 0;
+ goto out;
+ }
- ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
-
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
- if (btrfs_find_name_in_ext_backref(path->nodes[0],
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0],
+ ref_objectid,
+ name, namelen);
+ else
+ ret = !!btrfs_find_name_in_backref(path->nodes[0],
path->slots[0],
- ref_objectid,
- name, namelen))
- match = 1;
-
- goto out;
- }
-
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- ref = (struct btrfs_inode_ref *)ptr;
- found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
- if (found_name_len == namelen) {
- name_ptr = (unsigned long)(ref + 1);
- ret = memcmp_extent_buffer(path->nodes[0], name,
- name_ptr, namelen);
- if (ret == 0) {
- match = 1;
- goto out;
- }
- }
- ptr = (unsigned long)(ref + 1) + found_name_len;
- }
+ name, namelen);
out:
btrfs_free_path(path);
- return match;
+ return ret;
}
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
@@ -1057,10 +1037,13 @@
(unsigned long)(victim_ref + 1),
victim_name_len);
- if (!backref_in_log(log_root, &search_key,
- parent_objectid,
- victim_name,
- victim_name_len)) {
+ ret = backref_in_log(log_root, &search_key,
+ parent_objectid, victim_name,
+ victim_name_len);
+ if (ret < 0) {
+ kfree(victim_name);
+ return ret;
+ } else if (!ret) {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
@@ -1122,10 +1105,13 @@
search_key.offset = btrfs_extref_hash(parent_objectid,
victim_name,
victim_name_len);
- ret = 0;
- if (!backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len)) {
+ ret = backref_in_log(log_root, &search_key,
+ parent_objectid, victim_name,
+ victim_name_len);
+ if (ret < 0) {
+ kfree(victim_name);
+ return ret;
+ } else if (!ret) {
ret = -ENOENT;
victim_parent = read_one_inode(root,
parent_objectid);
@@ -1160,7 +1146,10 @@
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ if (PTR_ERR(di) != -ENOENT)
+ return PTR_ERR(di);
+ } else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
@@ -1170,7 +1159,9 @@
/* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
@@ -1295,6 +1286,15 @@
inode, name, namelen);
kfree(name);
iput(dir);
+ /*
+ * Whenever we need to check if a name exists or not, we
+ * check the subvolume tree. So after an unlink we must
+ * run delayed items, so that future checks for a name
+ * during log replay see that the name does not exists
+ * anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
goto again;
@@ -1495,10 +1495,12 @@
if (ret)
goto out;
- /* if we already have a perfect match, we're done */
- if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen)) {
+ ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
+ btrfs_ino(BTRFS_I(inode)), ref_index,
+ name, namelen);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
@@ -1544,6 +1546,15 @@
*/
if (!ret && inode->i_nlink == 0)
inc_nlink(inode);
+ /*
+ * Whenever we need to check if a name exists or
+ * not, we check the subvolume tree. So after an
+ * unlink we must run delayed items, so that future
+ * checks for a name during log replay see that the
+ * name does not exists anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
}
if (ret < 0)
goto out;
@@ -1556,6 +1567,7 @@
btrfs_update_inode(trans, root, inode);
}
+ /* Else, ret == 1, we already have a perfect match, we're done. */
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
kfree(name);
@@ -1891,30 +1903,6 @@
}
/*
- * Return true if an inode reference exists in the log for the given name,
- * inode and parent inode.
- */
-static bool name_in_log_ref(struct btrfs_root *log_root,
- const char *name, const int name_len,
- const u64 dirid, const u64 ino)
-{
- struct btrfs_key search_key;
-
- search_key.objectid = ino;
- search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = dirid;
- if (backref_in_log(log_root, &search_key, dirid, name, name_len))
- return true;
-
- search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = btrfs_extref_hash(dirid, name, name_len);
- if (backref_in_log(log_root, &search_key, dirid, name, name_len))
- return true;
-
- return false;
-}
-
-/*
* take a single entry in a log directory item and replay it into
* the subvolume.
*
@@ -1944,8 +1932,8 @@
struct btrfs_key log_key;
struct inode *dir;
u8 log_type;
- int exists;
- int ret = 0;
+ bool exists;
+ int ret;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
bool name_added = false;
@@ -1965,12 +1953,12 @@
name_len);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- if (exists == 0)
- exists = 1;
- else
- exists = 0;
+ ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+ exists = (ret == 0);
+ ret = 0;
if (key->type == BTRFS_DIR_ITEM_KEY) {
dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1985,7 +1973,14 @@
ret = -EINVAL;
goto out;
}
- if (IS_ERR_OR_NULL(dst_di)) {
+
+ if (dst_di == ERR_PTR(-ENOENT))
+ dst_di = NULL;
+
+ if (IS_ERR(dst_di)) {
+ ret = PTR_ERR(dst_di);
+ goto out;
+ } else if (!dst_di) {
/* we need a sequence number to insert, so we only
* do inserts for the BTRFS_DIR_INDEX_KEY types
*/
@@ -2030,8 +2025,31 @@
return ret;
insert:
- if (name_in_log_ref(root->log_root, name, name_len,
- key->objectid, log_key.objectid)) {
+ /*
+ * Check if the inode reference exists in the log for the given name,
+ * inode and parent inode
+ */
+ found_key.objectid = log_key.objectid;
+ found_key.type = BTRFS_INODE_REF_KEY;
+ found_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
+
+ found_key.objectid = log_key.objectid;
+ found_key.type = BTRFS_INODE_EXTREF_KEY;
+ found_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
+ name_len);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
/* The dentry will be added later. */
ret = 0;
update_size = false;
@@ -2467,7 +2485,9 @@
else {
ret = find_dir_range(log, path, dirid, key_type,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
break;
}
@@ -2682,29 +2702,45 @@
return ret;
}
+/*
+ * Correctly adjust the reserved bytes occupied by a log tree extent buffer
+ */
+static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+{
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, start);
+ if (!cache) {
+ btrfs_err(fs_info, "unable to find block group for %llu", start);
+ return;
+ }
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->reserved -= fs_info->nodesize;
+ cache->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ btrfs_put_block_group(cache);
+}
+
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 root_owner;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
- struct extent_buffer *parent;
u32 blocksize;
int ret = 0;
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
while (*level > 0) {
struct btrfs_key first_key;
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
cur = path->nodes[*level];
WARN_ON(btrfs_header_level(cur) != *level);
@@ -2718,9 +2754,6 @@
btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
- parent = path->nodes[*level];
- root_owner = btrfs_header_owner(parent);
-
next = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(next))
return PTR_ERR(next);
@@ -2748,19 +2781,16 @@
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ bytenr, blocksize);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
- }
-
- WARN_ON(root_owner !=
- BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(
- fs_info, bytenr,
- blocksize);
- if (ret) {
- free_extent_buffer(next);
- return ret;
+ unaccount_log_buffer(fs_info, bytenr);
}
}
free_extent_buffer(next);
@@ -2772,7 +2802,6 @@
return ret;
}
- WARN_ON(*level <= 0);
if (path->nodes[*level-1])
free_extent_buffer(path->nodes[*level-1]);
path->nodes[*level-1] = next;
@@ -2780,9 +2809,6 @@
path->slots[*level] = 0;
cond_resched();
}
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
cond_resched();
@@ -2795,7 +2821,6 @@
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 root_owner;
int i;
int slot;
int ret;
@@ -2808,13 +2833,6 @@
WARN_ON(*level == 0);
return 0;
} else {
- struct extent_buffer *parent;
- if (path->nodes[*level] == root->node)
- parent = path->nodes[*level];
- else
- parent = path->nodes[*level + 1];
-
- root_owner = btrfs_header_owner(parent);
ret = wc->process_func(root, path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
@@ -2832,18 +2850,18 @@
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ path->nodes[*level]->start,
+ path->nodes[*level]->len);
+ if (ret)
+ return ret;
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
- }
- WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(
- fs_info,
- path->nodes[*level]->start,
- path->nodes[*level]->len);
- if (ret)
- return ret;
+ unaccount_log_buffer(fs_info,
+ path->nodes[*level]->start);
+ }
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
@@ -2875,7 +2893,7 @@
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
- extent_buffer_get(log->node);
+ atomic_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
@@ -2914,17 +2932,15 @@
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ next->start, next->len);
+ if (ret)
+ goto out;
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
+ unaccount_log_buffer(fs_info, next->start);
}
-
- WARN_ON(log->root_key.objectid !=
- BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(fs_info,
- next->start, next->len);
- if (ret)
- goto out;
}
}
@@ -3305,8 +3321,8 @@
clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
- free_extent_buffer(log->node);
- kfree(log);
+ extent_io_tree_release(&log->log_csum_range);
+ btrfs_put_root(log);
}
/*
@@ -3834,8 +3850,7 @@
found_key.offset = 0;
found_key.type = 0;
- ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
- &start_slot);
+ ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
if (ret < 0)
break;
@@ -3871,44 +3886,41 @@
* just to say 'this inode exists' and a logging
* to say 'update this inode with these values'
*/
- btrfs_set_token_inode_generation(leaf, item, 0, &token);
- btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
+ btrfs_set_token_inode_generation(&token, item, 0);
+ btrfs_set_token_inode_size(&token, item, logged_isize);
} else {
- btrfs_set_token_inode_generation(leaf, item,
- BTRFS_I(inode)->generation,
- &token);
- btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
+ btrfs_set_token_inode_generation(&token, item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_token_inode_size(&token, item, inode->i_size);
}
- btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
- btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
- btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
- btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+ btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+ btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+ btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+ btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
- btrfs_set_token_timespec_sec(leaf, &item->atime,
- inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->atime,
- inode->i_atime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(&token, &item->atime,
+ inode->i_atime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->atime,
+ inode->i_atime.tv_nsec);
- btrfs_set_token_timespec_sec(leaf, &item->mtime,
- inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->mtime,
- inode->i_mtime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(&token, &item->mtime,
+ inode->i_mtime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->mtime,
+ inode->i_mtime.tv_nsec);
- btrfs_set_token_timespec_sec(leaf, &item->ctime,
- inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->ctime,
- inode->i_ctime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(&token, &item->ctime,
+ inode->i_ctime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->ctime,
+ inode->i_ctime.tv_nsec);
- btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
- &token);
+ btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
- btrfs_set_token_inode_sequence(leaf, item,
- inode_peek_iversion(inode), &token);
- btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
- btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
- btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
- btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+ btrfs_set_token_inode_transid(&token, item, trans->transid);
+ btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ btrfs_set_token_inode_block_group(&token, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
@@ -3931,12 +3943,33 @@
}
static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
struct btrfs_root *log_root,
struct btrfs_ordered_sum *sums)
{
+ const u64 lock_end = sums->bytenr + sums->len - 1;
+ struct extent_state *cached_state = NULL;
int ret;
/*
+ * If this inode was not used for reflink operations in the current
+ * transaction with new extents, then do the fast path, no need to
+ * worry about logging checksum items with overlapping ranges.
+ */
+ if (inode->last_reflink_trans < trans->transid)
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+
+ /*
+ * Serialize logging for checksums. This is to avoid racing with the
+ * same checksum being logged by another task that is logging another
+ * file which happens to refer to the same extent as well. Such races
+ * can leave checksum items in the log with overlapping ranges.
+ */
+ ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
+ lock_end, &cached_state);
+ if (ret)
+ return ret;
+ /*
* Due to extent cloning, we might have logged a csum item that covers a
* subrange of a cloned extent, and later we can end up logging a csum
* item for a larger subrange of the same extent or the entire range.
@@ -3946,10 +3979,13 @@
* trim and adjust) any existing csum items in the log for this range.
*/
ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
- if (ret)
- return ret;
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log_root, sums);
- return btrfs_csum_file_blocks(trans, log_root, sums);
+ unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
+
+ return ret;
}
static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -4070,7 +4106,7 @@
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = log_csums(trans, log, sums);
+ ret = log_csums(trans, inode, log, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4078,7 +4114,8 @@
return ret;
}
-static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int extent_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct extent_map *em1, *em2;
@@ -4095,10 +4132,14 @@
static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
- const struct extent_map *em)
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
u64 csum_offset;
u64 csum_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
LIST_HEAD(ordered_sums);
int ret = 0;
@@ -4107,13 +4148,71 @@
em->block_start == EXTENT_MAP_HOLE)
return 0;
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
+
+ if (mod_len == 0)
+ break;
+
+ if (ordered_end <= mod_start)
+ continue;
+ if (mod_end <= ordered->file_offset)
+ break;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered_end >= mod_end)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
+ continue;
+
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
+ return 0;
+
/* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
- csum_offset = em->mod_start - em->start;
- csum_len = em->mod_len;
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
@@ -4129,7 +4228,7 @@
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = log_csums(trans, log_root, sums);
+ ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4153,11 +4252,11 @@
int ret;
int extent_inserted = 0;
- ret = log_extent_csums(trans, inode, log, em);
+ ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
- ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
em->start + em->len, NULL, 0, 1,
sizeof(*fi), &extent_inserted);
if (ret)
@@ -4178,43 +4277,35 @@
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
- &token);
+ btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_PREALLOC,
- &token);
+ btrfs_set_token_file_extent_type(&token, fi,
+ BTRFS_FILE_EXTENT_PREALLOC);
else
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_REG,
- &token);
+ btrfs_set_token_file_extent_type(&token, fi,
+ BTRFS_FILE_EXTENT_REG);
block_len = max(em->block_len, em->orig_block_len);
if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start,
- &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
+ btrfs_set_token_file_extent_disk_bytenr(&token, fi,
+ em->block_start);
+ btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ btrfs_set_token_file_extent_disk_bytenr(&token, fi,
em->block_start -
- extent_offset, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
+ extent_offset);
+ btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
} else {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
- &token);
+ btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
+ btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
}
- btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
- btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
- btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
- btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
- &token);
- btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
+ btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
+ btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
+ btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
+ btrfs_set_token_file_extent_encryption(&token, fi, 0);
+ btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -4224,7 +4315,7 @@
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
@@ -4350,12 +4441,9 @@
}
}
}
- if (ins_nr > 0) {
+ if (ins_nr > 0)
ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
- if (ret > 0)
- ret = 0;
- }
out:
btrfs_release_path(path);
btrfs_free_path(dst_path);
@@ -4366,10 +4454,10 @@
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- const u64 start,
- const u64 end)
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -4383,23 +4471,6 @@
test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
- /*
- * Skip extents outside our logging range. It's important to do
- * it for correctness because if we don't ignore them, we may
- * log them before their ordered extent completes, and therefore
- * we could log them without logging their respective checksums
- * (the checksum items are added to the csum tree at the very
- * end of btrfs_finish_ordered_io()). Also leave such extents
- * outside of our range in the list, since we may have another
- * ranged fsync in the near future that needs them. If an extent
- * outside our range corresponds to a hole, log it to avoid
- * leaving gaps between extents (fsck will complain when we are
- * not using the NO_HOLES feature).
- */
- if ((em->start > end || em->start + em->len <= start) &&
- em->block_start != EXTENT_MAP_HOLE)
- continue;
-
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4458,8 +4529,32 @@
btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extents completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock_irq(&inode->ordered_tree.lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&inode->ordered_tree.lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
}
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
@@ -4522,6 +4617,10 @@
const u64 ino = btrfs_ino(inode);
int ins_nr = 0;
int start_slot = 0;
+ bool found_xattrs = false;
+
+ if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
+ return 0;
key.objectid = ino;
key.type = BTRFS_XATTR_ITEM_KEY;
@@ -4560,6 +4659,7 @@
start_slot = slot;
ins_nr++;
path->slots[0]++;
+ found_xattrs = true;
cond_resched();
}
if (ins_nr > 0) {
@@ -4569,6 +4669,9 @@
return ret;
}
+ if (!found_xattrs)
+ set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
+
return 0;
}
@@ -4605,9 +4708,7 @@
return ret;
while (true) {
- struct btrfs_file_extent_item *extent;
struct extent_buffer *leaf = path->nodes[0];
- u64 len;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
@@ -4656,18 +4757,7 @@
leaf = path->nodes[0];
}
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(leaf, extent);
- prev_extent_end = ALIGN(key.offset + len,
- fs_info->sectorsize);
- } else {
- len = btrfs_file_extent_num_bytes(leaf, extent);
- prev_extent_end = key.offset + len;
- }
-
+ prev_extent_end = btrfs_file_extent_end(path);
path->slots[0]++;
cond_resched();
}
@@ -4862,10 +4952,7 @@
btrfs_release_path(path);
- key.objectid = ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
@@ -4874,16 +4961,14 @@
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
if (ret == -ENOENT) {
- key.objectid = parent;
- inode = btrfs_iget(fs_info->sb, &key, root,
- NULL);
+ inode = btrfs_iget(fs_info->sb, parent, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
ret = btrfs_log_inode(trans, root,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
- 0, LLONG_MAX, ctx);
+ ctx);
btrfs_add_delayed_iput(inode);
}
}
@@ -4941,7 +5026,7 @@
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5017,6 +5102,7 @@
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
@@ -5037,13 +5123,21 @@
if (min_key->type > max_key->type)
break;
- if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
-
- if ((min_key->type == BTRFS_INODE_REF_KEY ||
- min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5074,10 +5168,8 @@
btrfs_release_path(path);
goto next_key;
}
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
@@ -5130,9 +5222,21 @@
break;
}
}
- if (ins_nr)
+ if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ }
return ret;
}
@@ -5154,8 +5258,6 @@
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
@@ -5334,7 +5436,7 @@
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx, start, end);
+ ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -5343,31 +5445,8 @@
struct extent_map *em, *n;
write_lock(&em_tree->lock);
- /*
- * We can't just remove every em if we're called for a ranged
- * fsync - that is, one that doesn't cover the whole possible
- * file range (0 to LLONG_MAX). This is because we can have
- * em's that fall outside the range we're logging and therefore
- * their ordered operations haven't completed yet
- * (btrfs_finish_ordered_io() not invoked yet). This means we
- * didn't get their respective file extent item in the fs/subvol
- * tree yet, and need to let the next fast fsync (one which
- * consults the list of modified extent maps) find the em so
- * that it logs a matching file extent item and waits for the
- * respective ordered operation to complete (if it's still
- * running).
- *
- * Removing every em outside the range we're logging would make
- * the next fast fsync not log their matching file extent items,
- * therefore making us lose data after a log replay.
- */
- list_for_each_entry_safe(em, n, &em_tree->modified_extents,
- list) {
- const u64 mod_end = em->mod_start + em->mod_len - 1;
-
- if (em->mod_start >= start && mod_end <= end)
- list_del_init(&em->list);
- }
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
+ list_del_init(&em->list);
write_unlock(&em_tree->lock);
}
@@ -5633,7 +5712,7 @@
continue;
btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
+ di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
goto next_dir_inode;
@@ -5648,7 +5727,7 @@
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
- log_mode, 0, LLONG_MAX, ctx);
+ log_mode, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
@@ -5759,8 +5838,8 @@
cur_offset = item_size;
}
- dir_inode = btrfs_iget(fs_info->sb, &inode_key,
- root, NULL);
+ dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
+ root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -5792,7 +5871,7 @@
if (ctx)
ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
- LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ LOG_INODE_ALL, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
ret = 1;
@@ -5827,21 +5906,23 @@
int slot = path->slots[0];
struct btrfs_key search_key;
struct inode *inode;
+ u64 ino;
int ret = 0;
btrfs_release_path(path);
+ ino = found_key.offset;
+
search_key.objectid = found_key.offset;
search_key.type = BTRFS_INODE_ITEM_KEY;
search_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &search_key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
if (BTRFS_I(inode)->generation > last_committed)
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_INODE_EXISTS,
- 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -5896,7 +5977,7 @@
if (inode->generation > fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
if (ret)
break;
}
@@ -6004,8 +6085,6 @@
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- const loff_t start,
- const loff_t end,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -6048,7 +6127,8 @@
* (since logging them is pointless, a link count of 0 means they
* will never be accessible).
*/
- if (btrfs_inode_in_log(inode, trans->transid) ||
+ if ((btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents)) ||
inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
@@ -6058,7 +6138,7 @@
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6154,15 +6234,13 @@
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
- start, end, LOG_INODE_ALL, ctx);
+ LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -6179,7 +6257,6 @@
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_key tmp_key;
struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
@@ -6233,7 +6310,7 @@
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_fs_root(log_root_tree, &found_key);
+ log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
btrfs_handle_fs_error(fs_info, ret,
@@ -6241,11 +6318,8 @@
goto error;
}
- tmp_key.objectid = found_key.offset;
- tmp_key.type = BTRFS_ROOT_ITEM_KEY;
- tmp_key.offset = (u64)-1;
-
- wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+ wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
+ true);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
@@ -6261,12 +6335,10 @@
* each subsequent pass.
*/
if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(fs_info,
+ ret = btrfs_pin_extent_for_log_replay(trans,
log->node->start,
log->node->len);
- free_extent_buffer(log->node);
- free_extent_buffer(log->commit_root);
- kfree(log);
+ btrfs_put_root(log);
if (!ret)
goto next;
@@ -6302,9 +6374,8 @@
}
wc.replay_dest->log_root = NULL;
- free_extent_buffer(log->node);
- free_extent_buffer(log->commit_root);
- kfree(log);
+ btrfs_put_root(wc.replay_dest);
+ btrfs_put_root(log);
if (ret)
goto error;
@@ -6335,10 +6406,9 @@
if (ret)
return ret;
- free_extent_buffer(log_root_tree->node);
log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
- kfree(log_root_tree);
+ btrfs_put_root(log_root_tree);
return 0;
error:
@@ -6465,7 +6535,6 @@
* we don't need to worry about getting a log committed that has an
* inconsistent state after a rename operation.
*/
- btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx);
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index ddfc678..731bd9c 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,8 @@
bool logging_new_name;
struct inode *inode;
struct list_head list;
+ /* Only used for fast fsyncs. */
+ struct list_head ordered_extents;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -30,6 +32,20 @@
ctx->logging_new_name = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+}
+
+static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
}
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
@@ -51,8 +67,6 @@
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 76b84f2..28525ad 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -246,9 +246,49 @@
return ret;
}
-int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
- int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
- u64))
+/*
+ * Check if there's an matching subvolume for given UUID
+ *
+ * Return:
+ * 0 check succeeded, the entry is not outdated
+ * > 0 if the check failed, the caller should remove the entry
+ * < 0 if an error occurred
+ */
+static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
+ u8 *uuid, u8 type, u64 subvolid)
+{
+ int ret = 0;
+ struct btrfs_root *subvol_root;
+
+ if (type != BTRFS_UUID_KEY_SUBVOL &&
+ type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+ goto out;
+
+ subvol_root = btrfs_get_fs_root(fs_info, subvolid, true);
+ if (IS_ERR(subvol_root)) {
+ ret = PTR_ERR(subvol_root);
+ if (ret == -ENOENT)
+ ret = 1;
+ goto out;
+ }
+
+ switch (type) {
+ case BTRFS_UUID_KEY_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.received_uuid,
+ BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ }
+ btrfs_put_root(subvol_root);
+out:
+ return ret;
+}
+
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->uuid_root;
struct btrfs_key key;
@@ -278,6 +318,10 @@
}
while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ ret = -EINTR;
+ goto out;
+ }
cond_resched();
leaf = path->nodes[0];
slot = path->slots[0];
@@ -305,7 +349,8 @@
read_extent_buffer(leaf, &subid_le, offset,
sizeof(subid_le));
subid_cpu = le64_to_cpu(subid_le);
- ret = check_func(fs_info, uuid, key.type, subid_cpu);
+ ret = btrfs_check_uuid_tree_entry(fs_info, uuid,
+ key.type, subid_cpu);
if (ret < 0)
goto out;
if (ret > 0) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8deee49..e462de9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7,7 +7,6 @@
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
@@ -15,6 +14,7 @@
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
+#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
@@ -31,6 +31,7 @@
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
+#include "discard.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
@@ -59,6 +60,32 @@
.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
},
+ [BTRFS_RAID_RAID1C3] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 3,
+ .devs_min = 3,
+ .tolerated_failures = 2,
+ .devs_increment = 3,
+ .ncopies = 3,
+ .nparity = 0,
+ .raid_name = "raid1c3",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
+ },
+ [BTRFS_RAID_RAID1C4] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 4,
+ .devs_min = 4,
+ .tolerated_failures = 3,
+ .devs_increment = 4,
+ .ncopies = 4,
+ .nparity = 0,
+ .raid_name = "raid1c4",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
+ },
[BTRFS_RAID_DUP] = {
.sub_stripes = 1,
.dev_stripes = 2,
@@ -260,14 +287,13 @@
* ============
*
* uuid_mutex
- * volume_mutex
- * device_list_mutex
- * chunk_mutex
- * balance_mutex
+ * device_list_mutex
+ * chunk_mutex
+ * balance_mutex
*
*
- * Exclusive operations, BTRFS_FS_EXCL_OP
- * ======================================
+ * Exclusive operations
+ * ====================
*
* Maintains the exclusivity of the following operations that apply to the
* whole filesystem and cannot run in parallel.
@@ -293,17 +319,17 @@
* - system power-cycle and filesystem mounted as read-only
* - filesystem or device errors leading to forced read-only
*
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
* A device operation in Paused or Running state can be canceled or resumed
* either by ioctl (Balance only) or when remounted as read-write.
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * The exclusive status is cleared when the device operation is canceled or
* completed.
*/
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
-struct list_head *btrfs_get_fs_uuids(void)
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
return &fs_uuids;
}
@@ -331,6 +357,7 @@
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
+ INIT_LIST_HEAD(&fs_devs->seed_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
@@ -381,7 +408,7 @@
* Returned struct is not linked onto any lists and must be destroyed using
* btrfs_free_device.
*/
-static struct btrfs_device *__alloc_device(void)
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
struct btrfs_device *dev;
@@ -403,14 +430,13 @@
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
- spin_lock_init(&dev->io_lock);
-
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
return dev;
}
@@ -422,39 +448,6 @@
ASSERT(fsid);
- if (metadata_fsid) {
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by first scanning
- * a device which didn't have its fsid/metadata_uuid changed
- * at all and the CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change &&
- memcmp(metadata_fsid, fs_devices->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0) {
- return fs_devices;
- }
- }
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by a device that
- * has an outdated pair of fsid/metadata_uuid and
- * CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change &&
- memcmp(fs_devices->metadata_uuid,
- fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
- memcmp(metadata_fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0) {
- return fs_devices;
- }
- }
- }
-
/* Handle non-split brain cases */
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (metadata_fsid) {
@@ -470,10 +463,51 @@
return NULL;
}
+static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
+ struct btrfs_super_block *disk_super)
+{
+
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by first scanning
+ * a device which didn't have its fsid/metadata_uuid changed
+ * at all and the CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(disk_super->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
+ }
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by a device that
+ * has an outdated pair of fsid/metadata_uuid and
+ * CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(fs_devices->metadata_uuid,
+ fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
+ memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
+ }
+
+ return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
+}
+
+
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
int flush, struct block_device **bdev,
- struct buffer_head **bh)
+ struct btrfs_super_block **disk_super)
{
int ret;
@@ -492,9 +526,9 @@
goto error;
}
invalidate_bdev(*bdev);
- *bh = btrfs_read_dev_super(*bdev);
- if (IS_ERR(*bh)) {
- ret = PTR_ERR(*bh);
+ *disk_super = btrfs_read_dev_super(*bdev);
+ if (IS_ERR(*disk_super)) {
+ ret = PTR_ERR(*disk_super);
blkdev_put(*bdev, flags);
goto error;
}
@@ -503,216 +537,9 @@
error:
*bdev = NULL;
- *bh = NULL;
return ret;
}
-static void requeue_list(struct btrfs_pending_bios *pending_bios,
- struct bio *head, struct bio *tail)
-{
-
- struct bio *old_head;
-
- old_head = pending_bios->head;
- pending_bios->head = head;
- if (pending_bios->tail)
- tail->bi_next = old_head;
- else
- pending_bios->tail = tail;
-}
-
-/*
- * we try to collect pending bios for a device so we don't get a large
- * number of procs sending bios down to the same device. This greatly
- * improves the schedulers ability to collect and merge the bios.
- *
- * But, it also turns into a long list of bios to process and that is sure
- * to eventually make the worker thread block. The solution here is to
- * make some progress and then put this work struct back at the end of
- * the list if the block device is congested. This way, multiple devices
- * can make progress from a single worker thread.
- */
-static noinline void run_scheduled_bios(struct btrfs_device *device)
-{
- struct btrfs_fs_info *fs_info = device->fs_info;
- struct bio *pending;
- struct backing_dev_info *bdi;
- struct btrfs_pending_bios *pending_bios;
- struct bio *tail;
- struct bio *cur;
- int again = 0;
- unsigned long num_run;
- unsigned long batch_run = 0;
- unsigned long last_waited = 0;
- int force_reg = 0;
- int sync_pending = 0;
- struct blk_plug plug;
-
- /*
- * this function runs all the bios we've collected for
- * a particular device. We don't want to wander off to
- * another device without first sending all of these down.
- * So, setup a plug here and finish it off before we return
- */
- blk_start_plug(&plug);
-
- bdi = device->bdev->bd_bdi;
-
-loop:
- spin_lock(&device->io_lock);
-
-loop_lock:
- num_run = 0;
-
- /* take all the bios off the list at once and process them
- * later on (without the lock held). But, remember the
- * tail and other pointers so the bios can be properly reinserted
- * into the list if we hit congestion
- */
- if (!force_reg && device->pending_sync_bios.head) {
- pending_bios = &device->pending_sync_bios;
- force_reg = 1;
- } else {
- pending_bios = &device->pending_bios;
- force_reg = 0;
- }
-
- pending = pending_bios->head;
- tail = pending_bios->tail;
- WARN_ON(pending && !tail);
-
- /*
- * if pending was null this time around, no bios need processing
- * at all and we can stop. Otherwise it'll loop back up again
- * and do an additional check so no bios are missed.
- *
- * device->running_pending is used to synchronize with the
- * schedule_bio code.
- */
- if (device->pending_sync_bios.head == NULL &&
- device->pending_bios.head == NULL) {
- again = 0;
- device->running_pending = 0;
- } else {
- again = 1;
- device->running_pending = 1;
- }
-
- pending_bios->head = NULL;
- pending_bios->tail = NULL;
-
- spin_unlock(&device->io_lock);
-
- while (pending) {
-
- rmb();
- /* we want to work on both lists, but do more bios on the
- * sync list than the regular list
- */
- if ((num_run > 32 &&
- pending_bios != &device->pending_sync_bios &&
- device->pending_sync_bios.head) ||
- (num_run > 64 && pending_bios == &device->pending_sync_bios &&
- device->pending_bios.head)) {
- spin_lock(&device->io_lock);
- requeue_list(pending_bios, pending, tail);
- goto loop_lock;
- }
-
- cur = pending;
- pending = pending->bi_next;
- cur->bi_next = NULL;
-
- BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
-
- /*
- * if we're doing the sync list, record that our
- * plug has some sync requests on it
- *
- * If we're doing the regular list and there are
- * sync requests sitting around, unplug before
- * we add more
- */
- if (pending_bios == &device->pending_sync_bios) {
- sync_pending = 1;
- } else if (sync_pending) {
- blk_finish_plug(&plug);
- blk_start_plug(&plug);
- sync_pending = 0;
- }
-
- btrfsic_submit_bio(cur);
- num_run++;
- batch_run++;
-
- cond_resched();
-
- /*
- * we made progress, there is more work to do and the bdi
- * is now congested. Back off and let other work structs
- * run instead
- */
- if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
- fs_info->fs_devices->open_devices > 1) {
- struct io_context *ioc;
-
- ioc = current->io_context;
-
- /*
- * the main goal here is that we don't want to
- * block if we're going to be able to submit
- * more requests without blocking.
- *
- * This code does two great things, it pokes into
- * the elevator code from a filesystem _and_
- * it makes assumptions about how batching works.
- */
- if (ioc && ioc->nr_batch_requests > 0 &&
- time_before(jiffies, ioc->last_waited + HZ/50UL) &&
- (last_waited == 0 ||
- ioc->last_waited == last_waited)) {
- /*
- * we want to go through our batch of
- * requests and stop. So, we copy out
- * the ioc->last_waited time and test
- * against it before looping
- */
- last_waited = ioc->last_waited;
- cond_resched();
- continue;
- }
- spin_lock(&device->io_lock);
- requeue_list(pending_bios, pending, tail);
- device->running_pending = 1;
-
- spin_unlock(&device->io_lock);
- btrfs_queue_work(fs_info->submit_workers,
- &device->work);
- goto done;
- }
- }
-
- cond_resched();
- if (again)
- goto loop;
-
- spin_lock(&device->io_lock);
- if (device->pending_bios.head || device->pending_sync_bios.head)
- goto loop_lock;
- spin_unlock(&device->io_lock);
-
-done:
- blk_finish_plug(&plug);
-}
-
-static void pending_bios_fn(struct btrfs_work *work)
-{
- struct btrfs_device *device;
-
- device = container_of(work, struct btrfs_device, work);
- run_scheduled_bios(device);
-}
-
static bool device_path_matched(const char *path, struct btrfs_device *device)
{
int found;
@@ -742,6 +569,8 @@
struct btrfs_device *device, *tmp_device;
int ret = 0;
+ lockdep_assert_held(&uuid_mutex);
+
if (path)
ret = -ENOENT;
@@ -769,8 +598,6 @@
btrfs_free_device(device);
ret = 0;
- if (fs_devices->num_devices == 0)
- break;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -795,7 +622,6 @@
{
struct request_queue *q;
struct block_device *bdev;
- struct buffer_head *bh;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -806,17 +632,16 @@
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &bh);
+ &bdev, &disk_super);
if (ret)
return ret;
- disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
if (devid != device->devid)
- goto error_brelse;
+ goto error_free_page;
if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
- goto error_brelse;
+ goto error_free_page;
device->generation = btrfs_super_generation(disk_super);
@@ -825,11 +650,11 @@
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
pr_err(
"BTRFS: Invalid seeding and uuid-changed device detected\n");
- goto error_brelse;
+ goto error_free_page;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- fs_devices->seeding = 1;
+ fs_devices->seeding = true;
} else {
if (bdev_read_only(bdev))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
@@ -839,7 +664,7 @@
q = bdev_get_queue(bdev);
if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
+ fs_devices->rotating = true;
device->bdev = bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -851,12 +676,12 @@
fs_devices->rw_devices++;
list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
}
- brelse(bh);
+ btrfs_release_disk_super(disk_super);
return 0;
-error_brelse:
- brelse(bh);
+error_free_page:
+ btrfs_release_disk_super(disk_super);
blkdev_put(bdev, flags);
return -EINVAL;
@@ -864,7 +689,9 @@
/*
* Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
- * being created with a disk that has already completed its fsid change.
+ * being created with a disk that has already completed its fsid change. Such
+ * disk can belong to an fs which has its FSID changed or to one which doesn't.
+ * Handle both cases here.
*/
static struct btrfs_fs_devices *find_fsid_inprogress(
struct btrfs_super_block *disk_super)
@@ -880,7 +707,7 @@
}
}
- return NULL;
+ return find_fsid(disk_super->fsid, NULL);
}
@@ -966,22 +793,12 @@
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
if (fsid_change_in_progress) {
- if (!has_metadata_uuid) {
- /*
- * When we have an image which has CHANGING_FSID_V2 set
- * it might belong to either a filesystem which has
- * disks with completed fsid change or it might belong
- * to fs with no UUID changes in effect, handle both.
- */
+ if (!has_metadata_uuid)
fs_devices = find_fsid_inprogress(disk_super);
- if (!fs_devices)
- fs_devices = find_fsid(disk_super->fsid, NULL);
- } else {
+ else
fs_devices = find_fsid_changed(disk_super);
- }
} else if (has_metadata_uuid) {
- fs_devices = find_fsid(disk_super->fsid,
- disk_super->metadata_uuid);
+ fs_devices = find_fsid_with_metadata_uuid(disk_super);
} else {
fs_devices = find_fsid_reverted_metadata(disk_super);
if (!fs_devices)
@@ -1061,11 +878,15 @@
*new_device_added = true;
if (disk_super->label[0])
- pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
- disk_super->label, devid, found_transid, path);
+ pr_info(
+ "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+ disk_super->label, devid, found_transid, path,
+ current->comm, task_pid_nr(current));
else
- pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
- disk_super->fsid, devid, found_transid, path);
+ pr_info(
+ "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+ disk_super->fsid, devid, found_transid, path,
+ current->comm, task_pid_nr(current));
} else if (!device->name || strcmp(device->name->str, path)) {
/*
@@ -1181,11 +1002,12 @@
struct btrfs_device *orig_dev;
int ret = 0;
+ lockdep_assert_held(&uuid_mutex);
+
fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
- mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
@@ -1217,36 +1039,27 @@
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
- mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
- mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(ret);
}
-/*
- * After we have read the system tree and know devids belonging to
- * this filesystem, remove the device which does not belong there.
- */
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+ int step, struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
- struct btrfs_device *latest_dev = NULL;
- mutex_lock(&uuid_mutex);
-again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state) &&
+ &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state) &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
- latest_dev = device;
+ (!*latest_dev ||
+ device->generation > (*latest_dev)->generation)) {
+ *latest_dev = device;
}
continue;
}
@@ -1273,10 +1086,22 @@
btrfs_free_device(device);
}
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+{
+ struct btrfs_device *latest_dev = NULL;
+ struct btrfs_fs_devices *seed_dev;
+
+ mutex_lock(&uuid_mutex);
+ __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+ __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
@@ -1299,11 +1124,6 @@
static void btrfs_close_one_device(struct btrfs_device *device)
{
struct btrfs_fs_devices *fs_devices = device->fs_devices;
- struct btrfs_device *new_device;
- struct rcu_string *name;
-
- if (device->bdev)
- fs_devices->open_devices--;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -1314,70 +1134,78 @@
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
-
- btrfs_close_bdev(device);
-
- new_device = btrfs_alloc_device(NULL, &device->devid,
- device->uuid);
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
- /* Safe because we are under uuid_mutex */
- if (device->name) {
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
- BUG_ON(!name); /* -ENOMEM */
- rcu_assign_pointer(new_device->name, name);
}
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
- new_device->fs_devices = device->fs_devices;
+ btrfs_close_bdev(device);
+ if (device->bdev) {
+ fs_devices->open_devices--;
+ device->bdev = NULL;
+ }
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- synchronize_rcu();
- btrfs_free_device(device);
+ device->fs_info = NULL;
+ atomic_set(&device->dev_stats_ccnt, 0);
+ extent_io_tree_release(&device->alloc_state);
+
+ /*
+ * Reset the flush error record. We might have a transient flush error
+ * in this mount, and if so we aborted the current transaction and set
+ * the fs to an error state, guaranteeing no super blocks can be further
+ * committed. However that error might be transient and if we unmount the
+ * filesystem and mount it again, we should allow the mount to succeed
+ * (btrfs_check_rw_degradable() should not fail) - if after mounting the
+ * filesystem again we still get flush errors, then we will again abort
+ * any transaction and set the error state, guaranteeing no commits of
+ * unsafe super blocks.
+ */
+ device->last_flush_error = 0;
+
+ /* Verify the device is back in a pristine state */
+ ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+ ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ ASSERT(list_empty(&device->dev_alloc_list));
+ ASSERT(list_empty(&device->post_commit_list));
+ ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
- if (--fs_devices->opened > 0)
- return 0;
+ lockdep_assert_held(&uuid_mutex);
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
+ if (--fs_devices->opened > 0)
+ return;
+
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
btrfs_close_one_device(device);
- }
- mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
- fs_devices->seeding = 0;
-
- return 0;
+ fs_devices->seeding = false;
+ fs_devices->fs_info = NULL;
}
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_devices *seed_devices = NULL;
- int ret;
+ LIST_HEAD(list);
+ struct btrfs_fs_devices *tmp;
mutex_lock(&uuid_mutex);
- ret = close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
- seed_devices = fs_devices->seed;
- fs_devices->seed = NULL;
- }
- mutex_unlock(&uuid_mutex);
+ close_fs_devices(fs_devices);
+ if (!fs_devices->opened)
+ list_splice_init(&fs_devices->seed_list, &list);
- while (seed_devices) {
- fs_devices = seed_devices;
- seed_devices = fs_devices->seed;
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
close_fs_devices(fs_devices);
+ list_del(&fs_devices->seed_list);
free_fs_devices(fs_devices);
}
- return ret;
+ mutex_unlock(&uuid_mutex);
}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
@@ -1385,31 +1213,37 @@
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
- int ret = 0;
+ struct btrfs_device *tmp_device;
flags |= FMODE_EXCL;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- /* Just open everything we can; ignore failures here */
- if (btrfs_open_one_device(fs_devices, device, flags, holder))
- continue;
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
+ dev_list) {
+ int ret;
- if (!latest_dev ||
- device->generation > latest_dev->generation)
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret == 0 &&
+ (!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
+ } else if (ret == -ENODATA) {
+ fs_devices->num_devices--;
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+ }
}
- if (fs_devices->open_devices == 0) {
- ret = -EINVAL;
- goto out;
- }
+ if (fs_devices->open_devices == 0)
+ return -EINVAL;
+
fs_devices->opened = 1;
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
-out:
- return ret;
+ fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+
+ return 0;
}
-static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int devid_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct btrfs_device *dev1, *dev2;
@@ -1448,55 +1282,55 @@
return ret;
}
-static void btrfs_release_disk_super(struct page *page)
+void btrfs_release_disk_super(struct btrfs_super_block *super)
{
- kunmap(page);
+ struct page *page = virt_to_page(super);
+
put_page(page);
}
-static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
- struct page **page,
- struct btrfs_super_block **disk_super)
+static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ u64 bytenr)
{
+ struct btrfs_super_block *disk_super;
+ struct page *page;
void *p;
pgoff_t index;
/* make sure our super fits in the device */
if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
- return 1;
+ return ERR_PTR(-EINVAL);
/* make sure our super fits in the page */
- if (sizeof(**disk_super) > PAGE_SIZE)
- return 1;
+ if (sizeof(*disk_super) > PAGE_SIZE)
+ return ERR_PTR(-EINVAL);
/* make sure our super doesn't straddle pages on disk */
index = bytenr >> PAGE_SHIFT;
- if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
- return 1;
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
+ return ERR_PTR(-EINVAL);
/* pull in the page with our super */
- *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
- index, GFP_KERNEL);
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
- if (IS_ERR_OR_NULL(*page))
- return 1;
+ if (IS_ERR(page))
+ return ERR_CAST(page);
- p = kmap(*page);
+ p = page_address(page);
/* align our pointer to the offset of the super block */
- *disk_super = p + offset_in_page(bytenr);
+ disk_super = p + offset_in_page(bytenr);
- if (btrfs_super_bytenr(*disk_super) != bytenr ||
- btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(*page);
- return 1;
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(p);
+ return ERR_PTR(-EINVAL);
}
- if ((*disk_super)->label[0] &&
- (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
- (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
+ if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
- return 0;
+ return disk_super;
}
int btrfs_forget_devices(const char *path)
@@ -1522,7 +1356,6 @@
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct block_device *bdev;
- struct page *page;
u64 bytenr;
lockdep_assert_held(&uuid_mutex);
@@ -1540,8 +1373,9 @@
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
- device = ERR_PTR(-EINVAL);
+ disk_super = btrfs_read_disk_super(bdev, bytenr);
+ if (IS_ERR(disk_super)) {
+ device = ERR_CAST(disk_super);
goto error_bdev_put;
}
@@ -1551,7 +1385,7 @@
btrfs_free_stale_devices(path, device);
}
- btrfs_release_disk_super(page);
+ btrfs_release_disk_super(disk_super);
error_bdev_put:
blkdev_put(bdev, flags);
@@ -1584,6 +1418,59 @@
return false;
}
+static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
+{
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /*
+ * We don't want to overwrite the superblock on the drive nor
+ * any area used by the boot loader (grub for example), so we
+ * make sure to start at an offset of at least 1MB.
+ */
+ return max_t(u64, start, SZ_1M);
+ default:
+ BUG();
+ }
+}
+
+/**
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
+ * @device: the device which we have the hole
+ * @hole_start: starting position of the hole
+ * @hole_size: the size of the hole
+ * @num_bytes: the size of the free space that we need
+ *
+ * This function may modify @hole_start and @hole_end to reflect the suitable
+ * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
+ */
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
+ u64 *hole_size, u64 num_bytes)
+{
+ bool changed = false;
+ u64 hole_end = *hole_start + *hole_size;
+
+ /*
+ * Check before we set max_hole_start, otherwise we could end up
+ * sending back this offset anyway.
+ */
+ if (contains_pending_extent(device, hole_start, *hole_size)) {
+ if (hole_end >= *hole_start)
+ *hole_size = hole_end - *hole_start;
+ else
+ *hole_size = 0;
+ changed = true;
+ }
+
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /* No extra check */
+ break;
+ default:
+ BUG();
+ }
+
+ return changed;
+}
/*
* find_free_dev_extent_start - find free space in the specified device
@@ -1630,12 +1517,7 @@
int slot;
struct extent_buffer *l;
- /*
- * We don't want to overwrite the superblock on the drive nor any area
- * used by the boot loader (grub for example), so we make sure to start
- * at an offset of at least 1MB.
- */
- search_start = max_t(u64, search_start, SZ_1M);
+ search_start = dev_extent_search_start(device, search_start);
path = btrfs_alloc_path();
if (!path)
@@ -1693,18 +1575,8 @@
if (key.offset > search_start) {
hole_size = key.offset - search_start;
-
- /*
- * Have to check before we set max_hole_start, otherwise
- * we could end up sending back this offset anyway.
- */
- if (contains_pending_extent(device, &search_start,
- hole_size)) {
- if (key.offset >= search_start)
- hole_size = key.offset - search_start;
- else
- hole_size = 0;
- }
+ dev_extent_hole_check(device, &search_start, &hole_size,
+ num_bytes);
if (hole_size > max_hole_size) {
max_hole_start = search_start;
@@ -1743,8 +1615,8 @@
*/
if (search_end > search_start) {
hole_size = search_end - search_start;
-
- if (contains_pending_extent(device, &search_start, hole_size)) {
+ if (dev_extent_hole_check(device, &search_start, &hole_size,
+ num_bytes)) {
btrfs_release_path(path);
goto again;
}
@@ -2000,16 +1872,22 @@
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
*/
-static void update_dev_time(const char *path_name)
+static void update_dev_time(const char *device_path)
{
- struct file *filp;
+ struct path path;
+ struct timespec64 now;
+ int ret;
- filp = filp_open(path_name, O_RDWR, 0);
- if (IS_ERR(filp))
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+ if (ret)
return;
- file_update_time(filp);
- filp_close(filp, NULL);
+
+ now = current_time(d_inode(path.dentry));
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+ path_put(&path);
}
static int btrfs_rm_dev_item(struct btrfs_device *device)
@@ -2111,17 +1989,14 @@
* where this function called, there should be always be another device (or
* this_dev) which is active.
*/
-void btrfs_assign_next_active_device(struct btrfs_device *device,
- struct btrfs_device *this_dev)
+void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
+ struct btrfs_device *next_device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_device *next_device;
- if (this_dev)
- next_device = this_dev;
- else
+ if (!next_device)
next_device = btrfs_find_next_active_device(fs_info->fs_devices,
- device);
+ device);
ASSERT(next_device);
if (fs_info->sb->s_bdev &&
@@ -2150,8 +2025,48 @@
return num_devices;
}
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path)
+{
+ struct btrfs_super_block *disk_super;
+ int copy_num;
+
+ if (!bdev)
+ return;
+
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
+ struct page *page;
+ int ret;
+
+ disk_super = btrfs_read_dev_one_super(bdev, copy_num);
+ if (IS_ERR(disk_super))
+ continue;
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+
+ page = virt_to_page(disk_super);
+ set_page_dirty(page);
+ lock_page(page);
+ /* write_on_page() unlocks the page */
+ ret = write_one_page(page);
+ if (ret)
+ btrfs_warn(fs_info,
+ "error clearing superblock number %d (%d)",
+ copy_num, ret);
+ btrfs_release_disk_super(disk_super);
+
+ }
+
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
+}
+
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid)
+ u64 devid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2159,8 +2074,11 @@
u64 num_devices;
int ret = 0;
- mutex_lock(&uuid_mutex);
-
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
@@ -2204,11 +2122,9 @@
mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
if (!ret)
btrfs_reada_remove_dev(device);
- mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
@@ -2257,7 +2173,7 @@
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_rm_device_link(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -2270,27 +2186,20 @@
* supers and free the device.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
- btrfs_scratch_superblocks(device->bdev, device->name->str);
+ btrfs_scratch_superblocks(fs_info, device->bdev,
+ device->name->str);
btrfs_close_bdev(device);
synchronize_rcu();
btrfs_free_device(device);
if (cur_devices->open_devices == 0) {
- while (fs_devices) {
- if (fs_devices->seed == cur_devices) {
- fs_devices->seed = cur_devices->seed;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- cur_devices->seed = NULL;
+ list_del_init(&cur_devices->seed_list);
close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
out:
- mutex_unlock(&uuid_mutex);
return ret;
error_undo:
@@ -2334,13 +2243,9 @@
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
- struct btrfs_fs_info *fs_info = srcdev->fs_info;
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
- }
+ mutex_lock(&uuid_mutex);
btrfs_close_bdev(srcdev);
synchronize_rcu();
@@ -2348,8 +2253,6 @@
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
- struct btrfs_fs_devices *tmp_fs_devices;
-
/*
* On a mounted FS, num_devices can't be zero unless it's a
* seed. In case of a seed device being replaced, the replace
@@ -2358,28 +2261,20 @@
*/
ASSERT(fs_devices->seeding);
- tmp_fs_devices = fs_info->fs_devices;
- while (tmp_fs_devices) {
- if (tmp_fs_devices->seed == fs_devices) {
- tmp_fs_devices->seed = fs_devices->seed;
- break;
- }
- tmp_fs_devices = tmp_fs_devices->seed;
- }
- fs_devices->seed = NULL;
+ list_del_init(&fs_devices->seed_list);
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
+ mutex_unlock(&uuid_mutex);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
- WARN_ON(!tgtdev);
mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
+ btrfs_sysfs_remove_device(tgtdev);
if (tgtdev->bdev)
fs_devices->open_devices--;
@@ -2399,7 +2294,8 @@
* is already out of device list, so we don't have to hold
* the device_list_mutex lock.
*/
- btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
+ tgtdev->name->str);
btrfs_close_bdev(tgtdev);
synchronize_rcu();
@@ -2414,14 +2310,13 @@
u64 devid;
u8 *dev_uuid;
struct block_device *bdev;
- struct buffer_head *bh;
struct btrfs_device *device;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
- fs_info->bdev_holder, 0, &bdev, &bh);
+ fs_info->bdev_holder, 0, &bdev, &disk_super);
if (ret)
return ERR_PTR(ret);
- disk_super = (struct btrfs_super_block *)bh->b_data;
+
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
@@ -2431,7 +2326,7 @@
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
disk_super->fsid, true);
- brelse(bh);
+ btrfs_release_disk_super(disk_super);
if (!device)
device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
@@ -2488,10 +2383,20 @@
if (!fs_devices->seeding)
return -EINVAL;
+ /*
+ * Private copy of the seed devices, anchored at
+ * fs_info->fs_devices->seed_list
+ */
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
+ /*
+ * It's necessary to retain a copy of the original seed fs_devices in
+ * fs_uuids so that filesystems which have been seeded can successfully
+ * reference the seed device from open_seed_devices. This also supports
+ * multiple fs seed.
+ */
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
@@ -2512,16 +2417,12 @@
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
- mutex_lock(&fs_info->chunk_mutex);
- list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- mutex_unlock(&fs_info->chunk_mutex);
-
- fs_devices->seeding = 0;
+ fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
- fs_devices->rotating = 0;
- fs_devices->seed = seed_devices;
+ fs_devices->rotating = false;
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -2624,7 +2525,7 @@
u64 orig_super_num_devices;
int seeding_dev = 0;
int ret = 0;
- bool unlocked = false;
+ bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
@@ -2638,20 +2539,20 @@
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
+ locked = true;
}
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ sync_blockdev(bdev);
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
- mutex_unlock(
- &fs_devices->device_list_mutex);
+ rcu_read_unlock();
goto error;
}
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
@@ -2715,7 +2616,7 @@
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
+ fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
btrfs_set_super_total_bytes(fs_info->super_copy,
@@ -2735,7 +2636,7 @@
mutex_unlock(&fs_info->chunk_mutex);
/* Add sysfs device entry */
- btrfs_sysfs_add_device_link(fs_devices, device);
+ btrfs_sysfs_add_device(device);
mutex_unlock(&fs_devices->device_list_mutex);
@@ -2762,8 +2663,11 @@
goto error_sysfs;
}
- btrfs_sysfs_update_sprout_fsid(fs_devices,
- fs_info->fs_devices->fsid);
+ /*
+ * fs_devices now represents the newly sprouted filesystem and
+ * its fsid has been changed by btrfs_prepare_sprout
+ */
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
}
ret = btrfs_commit_transaction(trans);
@@ -2771,7 +2675,7 @@
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
- unlocked = true;
+ locked = false;
if (ret) /* transaction commit */
return ret;
@@ -2806,7 +2710,7 @@
return ret;
error_sysfs:
- btrfs_sysfs_rm_device_link(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
@@ -2832,7 +2736,7 @@
btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
- if (seeding_dev && !unlocked) {
+ if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
@@ -3132,6 +3036,7 @@
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
+ struct btrfs_block_group *block_group;
int ret;
/*
@@ -3155,6 +3060,12 @@
if (ret)
return ret;
+ block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
+ if (!block_group)
+ return -ENOENT;
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+ btrfs_put_block_group(block_group);
+
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@@ -3254,7 +3165,7 @@
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
u64 chunk_offset)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 bytes_used;
u64 chunk_type;
@@ -3263,27 +3174,28 @@
chunk_type = cache->flags;
btrfs_put_block_group(cache);
- if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
- spin_lock(&fs_info->data_sinfo->lock);
- bytes_used = fs_info->data_sinfo->bytes_used;
- spin_unlock(&fs_info->data_sinfo->lock);
+ if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
- if (!bytes_used) {
- struct btrfs_trans_handle *trans;
- int ret;
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used;
+ spin_unlock(&fs_info->data_sinfo->lock);
- trans = btrfs_join_transaction(fs_info->tree_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (!bytes_used) {
+ struct btrfs_trans_handle *trans;
+ int ret;
- ret = btrfs_force_chunk_alloc(trans,
- BTRFS_BLOCK_GROUP_DATA);
- btrfs_end_transaction(trans);
- if (ret < 0)
- return ret;
- return 1;
- }
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans);
+ if (ret < 0)
+ return ret;
+ return 1;
}
+
return 0;
}
@@ -3303,7 +3215,7 @@
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
+ trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
@@ -3353,7 +3265,7 @@
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
@@ -3462,28 +3374,28 @@
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- chunk_used = btrfs_block_group_used(&cache->item);
+ chunk_used = cache->used;
if (bargs->usage_min == 0)
user_thresh_min = 0;
else
- user_thresh_min = div_factor_fine(cache->key.offset,
- bargs->usage_min);
+ user_thresh_min = div_factor_fine(cache->length,
+ bargs->usage_min);
if (bargs->usage_max == 0)
user_thresh_max = 1;
else if (bargs->usage_max > 100)
- user_thresh_max = cache->key.offset;
+ user_thresh_max = cache->length;
else
- user_thresh_max = div_factor_fine(cache->key.offset,
- bargs->usage_max);
+ user_thresh_max = div_factor_fine(cache->length,
+ bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
ret = 0;
@@ -3495,20 +3407,19 @@
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
u64 chunk_offset, struct btrfs_balance_args *bargs)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- chunk_used = btrfs_block_group_used(&cache->item);
+ chunk_used = cache->used;
if (bargs->usage_min == 0)
user_thresh = 1;
else if (bargs->usage > 100)
- user_thresh = cache->key.offset;
+ user_thresh = cache->length;
else
- user_thresh = div_factor_fine(cache->key.offset,
- bargs->usage);
+ user_thresh = div_factor_fine(cache->length, bargs->usage);
if (chunk_used < user_thresh)
ret = 0;
@@ -3921,12 +3832,7 @@
if (flags == 0)
return !extended; /* "0" is valid for usual profiles */
- /* true if exactly one bit set */
- /*
- * Don't use is_power_of_2(unsigned long) because it won't work
- * for the single profile (1ULL << 48) on 32-bit CPUs.
- */
- return flags != 0 && (flags & (flags - 1)) == 0;
+ return has_single_bit_set(flags);
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -3937,13 +3843,25 @@
atomic_read(&fs_info->balance_cancel_req) == 0);
}
-/* Non-zero return value signifies invalidity */
-static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
- u64 allowed)
+/*
+ * Validate target profile against allowed profiles and return true if it's OK.
+ * Otherwise print the error message and return false.
+ */
+static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
+ const struct btrfs_balance_args *bargs,
+ u64 allowed, const char *type)
{
- return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl_arg->target, 1) ||
- (bctl_arg->target & ~allowed)));
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return true;
+
+ /* Profile is valid and does not have bits outside of the allowed set */
+ if (alloc_profile_is_valid(bargs->target, 1) &&
+ (bargs->target & ~allowed) == 0)
+ return true;
+
+ btrfs_err(fs_info, "balance: invalid convert %s profile %s",
+ type, btrfs_bg_type_to_raid_name(bargs->target));
+ return false;
}
/*
@@ -4113,12 +4031,12 @@
int ret;
u64 num_devices;
unsigned seq;
- bool reducing_integrity;
+ bool reducing_redundancy;
int i;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
- atomic_read(&fs_info->balance_cancel_req)) {
+ btrfs_should_cancel_balance(fs_info)) {
ret = -EINVAL;
goto out;
}
@@ -4145,7 +4063,7 @@
/*
* rw_devices will not change at the moment, device add/delete/replace
- * are excluded by EXCL_OP
+ * are exclusive
*/
num_devices = fs_info->fs_devices->rw_devices;
@@ -4159,24 +4077,9 @@
if (num_devices >= btrfs_raid_array[i].devs_min)
allowed |= btrfs_raid_array[i].bg_flag;
- if (validate_convert_profile(&bctl->data, allowed)) {
- btrfs_err(fs_info,
- "balance: invalid convert data profile %s",
- btrfs_bg_type_to_raid_name(bctl->data.target));
- ret = -EINVAL;
- goto out;
- }
- if (validate_convert_profile(&bctl->meta, allowed)) {
- btrfs_err(fs_info,
- "balance: invalid convert metadata profile %s",
- btrfs_bg_type_to_raid_name(bctl->meta.target));
- ret = -EINVAL;
- goto out;
- }
- if (validate_convert_profile(&bctl->sys, allowed)) {
- btrfs_err(fs_info,
- "balance: invalid convert system profile %s",
- btrfs_bg_type_to_raid_name(bctl->sys.target));
+ if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
+ !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
+ !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
ret = -EINVAL;
goto out;
}
@@ -4200,9 +4103,9 @@
((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_metadata_alloc_bits & allowed) &&
!(bctl->meta.target & allowed)))
- reducing_integrity = true;
+ reducing_redundancy = true;
else
- reducing_integrity = false;
+ reducing_redundancy = false;
/* if we're not converting, the target field is uninitialized */
meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
@@ -4211,13 +4114,13 @@
bctl->data.target : fs_info->avail_data_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
- if (reducing_integrity) {
+ if (reducing_redundancy) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
btrfs_info(fs_info,
- "balance: force reducing metadata integrity");
+ "balance: force reducing metadata redundancy");
} else {
btrfs_err(fs_info,
- "balance: reduces metadata integrity, use --force if you want this");
+ "balance: reduces metadata redundancy, use --force if you want this");
ret = -EINVAL;
goto out;
}
@@ -4296,7 +4199,7 @@
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
wake_up(&fs_info->balance_wait_q);
@@ -4307,7 +4210,7 @@
reset_balance_state(fs_info);
else
kfree(bctl);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -4409,7 +4312,7 @@
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
@@ -4493,7 +4396,7 @@
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
btrfs_info(fs_info, "balance: canceled");
}
}
@@ -4505,7 +4408,7 @@
return 0;
}
-static int btrfs_uuid_scan_kthread(void *data)
+int btrfs_uuid_scan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
struct btrfs_root *root = fs_info->tree_root;
@@ -4517,6 +4420,7 @@
struct btrfs_root_item root_item;
u32 item_size;
struct btrfs_trans_handle *trans = NULL;
+ bool closing = false;
path = btrfs_alloc_path();
if (!path) {
@@ -4529,6 +4433,10 @@
key.offset = 0;
while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ closing = true;
+ break;
+ }
ret = btrfs_search_forward(root, &key, path,
BTRFS_OLDEST_GENERATION);
if (ret) {
@@ -4629,76 +4537,12 @@
btrfs_end_transaction(trans);
if (ret)
btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
- else
+ else if (!closing)
set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
up(&fs_info->uuid_tree_rescan_sem);
return 0;
}
-/*
- * Callback for btrfs_uuid_tree_iterate().
- * returns:
- * 0 check succeeded, the entry is not outdated.
- * < 0 if an error occurred.
- * > 0 if the check failed, which means the caller shall remove the entry.
- */
-static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
- u8 *uuid, u8 type, u64 subid)
-{
- struct btrfs_key key;
- int ret = 0;
- struct btrfs_root *subvol_root;
-
- if (type != BTRFS_UUID_KEY_SUBVOL &&
- type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
- goto out;
-
- key.objectid = subid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(subvol_root)) {
- ret = PTR_ERR(subvol_root);
- if (ret == -ENOENT)
- ret = 1;
- goto out;
- }
-
- switch (type) {
- case BTRFS_UUID_KEY_SUBVOL:
- if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
- ret = 1;
- break;
- case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
- if (memcmp(uuid, subvol_root->root_item.received_uuid,
- BTRFS_UUID_SIZE))
- ret = 1;
- break;
- }
-
-out:
- return ret;
-}
-
-static int btrfs_uuid_rescan_kthread(void *data)
-{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
- int ret;
-
- /*
- * 1st step is to iterate through the existing UUID tree and
- * to delete all entries that contain outdated data.
- * 2nd step is to add all missing entries to the UUID tree.
- */
- ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
- if (ret < 0) {
- btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
- up(&fs_info->uuid_tree_rescan_sem);
- return ret;
- }
- return btrfs_uuid_scan_kthread(data);
-}
-
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
@@ -4741,22 +4585,6 @@
return 0;
}
-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
-{
- struct task_struct *task;
-
- down(&fs_info->uuid_tree_rescan_sem);
- task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
- if (IS_ERR(task)) {
- /* fs_info->update_uuid_tree_gen remains 0 in all error case */
- btrfs_warn(fs_info, "failed to start uuid_rescan task");
- up(&fs_info->uuid_tree_rescan_sem);
- return PTR_ERR(task);
- }
-
- return 0;
-}
-
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -5005,96 +4833,119 @@
btrfs_set_fs_incompat(info, RAID56);
}
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- u64 start, u64 type)
+static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
- struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct btrfs_device *device;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct btrfs_device_info *devices_info = NULL;
- u64 total_avail;
- int num_stripes; /* total number of stripes to allocate */
- int data_stripes; /* number of stripes that count for
- block group size */
- int sub_stripes; /* sub_stripes info for map */
- int dev_stripes; /* stripes per dev */
- int devs_max; /* max devs to use */
- int devs_min; /* min devs needed */
- int devs_increment; /* ndevs has to be a multiple of this */
- int ncopies; /* how many copies to data has */
- int nparity; /* number of stripes worth of bytes to
- store parity information */
- int ret;
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
+ return;
+
+ btrfs_set_fs_incompat(info, RAID1C34);
+}
+
+/*
+ * Structure used internally for __btrfs_alloc_chunk() function.
+ * Wraps needed parameters.
+ */
+struct alloc_chunk_ctl {
+ u64 start;
+ u64 type;
+ /* Total number of stripes to allocate */
+ int num_stripes;
+ /* sub_stripes info for map */
+ int sub_stripes;
+ /* Stripes per device */
+ int dev_stripes;
+ /* Maximum number of devices to use */
+ int devs_max;
+ /* Minimum number of devices to use */
+ int devs_min;
+ /* ndevs has to be a multiple of this */
+ int devs_increment;
+ /* Number of copies */
+ int ncopies;
+ /* Number of stripes worth of bytes to store parity information */
+ int nparity;
u64 max_stripe_size;
u64 max_chunk_size;
+ u64 dev_extent_min;
u64 stripe_size;
u64 chunk_size;
int ndevs;
- int i;
- int j;
- int index;
+};
- BUG_ON(!alloc_profile_is_valid(type, 0));
-
- if (list_empty(&fs_devices->alloc_list)) {
- if (btrfs_test_opt(info, ENOSPC_DEBUG))
- btrfs_debug(info, "%s: no writable device", __func__);
- return -ENOSPC;
- }
-
- index = btrfs_bg_flags_to_raid_index(type);
-
- sub_stripes = btrfs_raid_array[index].sub_stripes;
- dev_stripes = btrfs_raid_array[index].dev_stripes;
- devs_max = btrfs_raid_array[index].devs_max;
- if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info);
- devs_min = btrfs_raid_array[index].devs_min;
- devs_increment = btrfs_raid_array[index].devs_increment;
- ncopies = btrfs_raid_array[index].ncopies;
- nparity = btrfs_raid_array[index].nparity;
+static void init_alloc_chunk_ctl_policy_regular(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ u64 type = ctl->type;
if (type & BTRFS_BLOCK_GROUP_DATA) {
- max_stripe_size = SZ_1G;
- max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
+ ctl->max_stripe_size = SZ_1G;
+ ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- /* for larger filesystems, use larger metadata chunks */
+ /* For larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
- max_stripe_size = SZ_1G;
+ ctl->max_stripe_size = SZ_1G;
else
- max_stripe_size = SZ_256M;
- max_chunk_size = max_stripe_size;
+ ctl->max_stripe_size = SZ_256M;
+ ctl->max_chunk_size = ctl->max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- max_stripe_size = SZ_32M;
- max_chunk_size = 2 * max_stripe_size;
- devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
+ ctl->max_stripe_size = SZ_32M;
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+ ctl->devs_max = min_t(int, ctl->devs_max,
+ BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
- btrfs_err(info, "invalid chunk type 0x%llx requested",
- type);
BUG();
}
/* We don't want a chunk larger than 10% of writable space */
- max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
- max_chunk_size);
+ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ ctl->max_chunk_size);
+ ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
+}
- devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
- GFP_NOFS);
- if (!devices_info)
- return -ENOMEM;
+static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ int index = btrfs_bg_flags_to_raid_index(ctl->type);
+
+ ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
+ ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
+ ctl->devs_max = btrfs_raid_array[index].devs_max;
+ if (!ctl->devs_max)
+ ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
+ ctl->devs_min = btrfs_raid_array[index].devs_min;
+ ctl->devs_increment = btrfs_raid_array[index].devs_increment;
+ ctl->ncopies = btrfs_raid_array[index].ncopies;
+ ctl->nparity = btrfs_raid_array[index].nparity;
+ ctl->ndevs = 0;
+
+ switch (fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int gather_device_info(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = fs_devices->fs_info;
+ struct btrfs_device *device;
+ u64 total_avail;
+ u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
+ int ret;
+ int ndevs = 0;
+ u64 max_avail;
+ u64 dev_offset;
/*
* in the first pass through the devices list, we gather information
* about the available holes on each device.
*/
- ndevs = 0;
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
- u64 max_avail;
- u64 dev_offset;
-
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
WARN(1, KERN_ERR
"BTRFS: read-only device in alloc_list\n");
@@ -5112,24 +4963,23 @@
total_avail = 0;
/* If there is no space on this device, skip it. */
- if (total_avail == 0)
+ if (total_avail < ctl->dev_extent_min)
continue;
- ret = find_free_dev_extent(device,
- max_stripe_size * dev_stripes,
- &dev_offset, &max_avail);
+ ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
+ &max_avail);
if (ret && ret != -ENOSPC)
- goto error;
+ return ret;
if (ret == 0)
- max_avail = max_stripe_size * dev_stripes;
+ max_avail = dev_extent_want;
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
+ if (max_avail < ctl->dev_extent_min) {
if (btrfs_test_opt(info, ENOSPC_DEBUG))
btrfs_debug(info,
- "%s: devid %llu has no free space, have=%llu want=%u",
+ "%s: devid %llu has no free space, have=%llu want=%llu",
__func__, device->devid, max_avail,
- BTRFS_STRIPE_LEN * dev_stripes);
+ ctl->dev_extent_min);
continue;
}
@@ -5144,6 +4994,7 @@
devices_info[ndevs].dev = device;
++ndevs;
}
+ ctl->ndevs = ndevs;
/*
* now sort the devices by hole size / available space
@@ -5151,20 +5002,14 @@
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
btrfs_cmp_device_info, NULL);
- /* round down to number of usable stripes */
- ndevs = round_down(ndevs, devs_increment);
+ return 0;
+}
- if (ndevs < devs_min) {
- ret = -ENOSPC;
- if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
- btrfs_debug(info,
- "%s: not enough devices with free space: have=%d minimum required=%d",
- __func__, ndevs, devs_min);
- }
- goto error;
- }
-
- ndevs = min(ndevs, devs_max);
+static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ /* Number of stripes that count for block group size */
+ int data_stripes;
/*
* The primary goal is to maximize the number of stripes, so use as
@@ -5173,73 +5018,116 @@
* The DUP profile stores more than one stripe per device, the
* max_avail is the total size so we have to adjust.
*/
- stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
- num_stripes = ndevs * dev_stripes;
+ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
+ ctl->dev_stripes);
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+
+ /* This will have to be fixed for RAID1 and RAID10 over more drives */
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
/*
- * this will have to be fixed for RAID1 and RAID10 over
- * more drives
+ * Use the number of data stripes to figure out how big this chunk is
+ * really going to be in terms of logical address space, and compare
+ * that answer with the max chunk size. If it's higher, we try to
+ * reduce stripe_size.
*/
- data_stripes = (num_stripes - nparity) / ncopies;
-
- /*
- * Use the number of data stripes to figure out how big this chunk
- * is really going to be in terms of logical address space,
- * and compare that answer with the max chunk size. If it's higher,
- * we try to reduce stripe_size.
- */
- if (stripe_size * data_stripes > max_chunk_size) {
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
/*
* Reduce stripe_size, round it up to a 16MB boundary again and
* then use it, unless it ends up being even bigger than the
* previous value we had already.
*/
- stripe_size = min(round_up(div_u64(max_chunk_size,
- data_stripes), SZ_16M),
- stripe_size);
+ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
+ data_stripes), SZ_16M),
+ ctl->stripe_size);
}
- /* align to BTRFS_STRIPE_LEN */
- stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
+ /* Align to BTRFS_STRIPE_LEN */
+ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map) {
- ret = -ENOMEM;
- goto error;
+ return 0;
+}
+
+static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = fs_devices->fs_info;
+
+ /*
+ * Round down to number of usable stripes, devs_increment can be any
+ * number so we can't use round_down() that requires power of 2, while
+ * rounddown is safe.
+ */
+ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
+
+ if (ctl->ndevs < ctl->devs_min) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
+ btrfs_debug(info,
+ "%s: not enough devices with free space: have=%d minimum required=%d",
+ __func__, ctl->ndevs, ctl->devs_min);
+ }
+ return -ENOSPC;
}
- map->num_stripes = num_stripes;
- for (i = 0; i < ndevs; ++i) {
- for (j = 0; j < dev_stripes; ++j) {
- int s = i * dev_stripes + j;
+ ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
+
+ switch (fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ return decide_stripe_size_regular(ctl, devices_info);
+ default:
+ BUG();
+ }
+}
+
+static int create_chunk(struct btrfs_trans_handle *trans,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ u64 start = ctl->start;
+ u64 type = ctl->type;
+ int ret;
+ int i;
+ int j;
+
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ if (!map)
+ return -ENOMEM;
+ map->num_stripes = ctl->num_stripes;
+
+ for (i = 0; i < ctl->ndevs; ++i) {
+ for (j = 0; j < ctl->dev_stripes; ++j) {
+ int s = i * ctl->dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
- j * stripe_size;
+ j * ctl->stripe_size;
}
}
map->stripe_len = BTRFS_STRIPE_LEN;
map->io_align = BTRFS_STRIPE_LEN;
map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
- map->sub_stripes = sub_stripes;
+ map->sub_stripes = ctl->sub_stripes;
- chunk_size = stripe_size * data_stripes;
-
- trace_btrfs_chunk_alloc(info, map, start, chunk_size);
+ trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
em = alloc_extent_map();
if (!em) {
kfree(map);
- ret = -ENOMEM;
- goto error;
+ return -ENOMEM;
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
em->start = start;
- em->len = chunk_size;
+ em->len = ctl->chunk_size;
em->block_start = 0;
em->block_len = em->len;
- em->orig_block_len = stripe_size;
+ em->orig_block_len = ctl->stripe_size;
em_tree = &info->mapping_tree;
write_lock(&em_tree->lock);
@@ -5247,29 +5135,31 @@
if (ret) {
write_unlock(&em_tree->lock);
free_extent_map(em);
- goto error;
+ return ret;
}
write_unlock(&em_tree->lock);
- ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
+ ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
if (ret)
goto error_del_extent;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
- btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size);
+ btrfs_device_set_bytes_used(dev,
+ dev->bytes_used + ctl->stripe_size);
if (list_empty(&dev->post_commit_list))
list_add_tail(&dev->post_commit_list,
&trans->transaction->dev_update_list);
}
- atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
+ atomic64_sub(ctl->stripe_size * map->num_stripes,
+ &info->free_chunk_space);
free_extent_map(em);
check_raid56_incompat_flag(info, type);
+ check_raid1c34_incompat_flag(info, type);
- kfree(devices_info);
return 0;
error_del_extent:
@@ -5281,11 +5171,68 @@
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
-error:
+
+ return ret;
+}
+
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct btrfs_device_info *devices_info = NULL;
+ struct alloc_chunk_ctl ctl;
+ int ret;
+
+ lockdep_assert_held(&info->chunk_mutex);
+
+ if (!alloc_profile_is_valid(type, 0)) {
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ if (list_empty(&fs_devices->alloc_list)) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info, "%s: no writable device", __func__);
+ return -ENOSPC;
+ }
+
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ ctl.start = find_next_chunk(info);
+ ctl.type = type;
+ init_alloc_chunk_ctl(fs_devices, &ctl);
+
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
+ if (ret < 0)
+ goto out;
+
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
+ if (ret < 0)
+ goto out;
+
+ ret = create_chunk(trans, &ctl, devices_info);
+
+out:
kfree(devices_info);
return ret;
}
+/*
+ * Chunk allocation falls into two parts. The first part does work
+ * that makes the new allocated chunk usable, but does not do any operation
+ * that modifies the chunk tree. The second part does the work that
+ * requires modifying the chunk tree. This division is important for the
+ * bootstrap process of adding storage to a seed btrfs.
+ */
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size)
{
@@ -5384,39 +5331,19 @@
return ret;
}
-/*
- * Chunk allocation falls into two parts. The first part does work
- * that makes the new allocated chunk usable, but does not do any operation
- * that modifies the chunk tree. The second part does the work that
- * requires modifying the chunk tree. This division is important for the
- * bootstrap process of adding storage to a seed btrfs.
- */
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
-{
- u64 chunk_offset;
-
- lockdep_assert_held(&trans->fs_info->chunk_mutex);
- chunk_offset = find_next_chunk(trans->fs_info);
- return __btrfs_alloc_chunk(trans, chunk_offset, type);
-}
-
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- u64 chunk_offset;
- u64 sys_chunk_offset;
u64 alloc_profile;
int ret;
- chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
if (ret)
return ret;
- sys_chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
return ret;
}
@@ -5613,31 +5540,19 @@
return preferred_mirror;
}
-static inline int parity_smaller(u64 a, u64 b)
-{
- return a > b;
-}
-
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
- struct btrfs_bio_stripe s;
int i;
- u64 l;
int again = 1;
while (again) {
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
- if (parity_smaller(bbio->raid_map[i],
- bbio->raid_map[i+1])) {
- s = bbio->stripes[i];
- l = bbio->raid_map[i];
- bbio->stripes[i] = bbio->stripes[i+1];
- bbio->raid_map[i] = bbio->raid_map[i+1];
- bbio->stripes[i+1] = s;
- bbio->raid_map[i+1] = l;
-
+ /* Swap if parity is on a smaller index */
+ if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
+ swap(bbio->stripes[i], bbio->stripes[i + 1]);
+ swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
again = 1;
}
}
@@ -5663,6 +5578,9 @@
atomic_set(&bbio->error, 0);
refcount_set(&bbio->refs, 1);
+ bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
+ bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+
return bbio;
}
@@ -6138,10 +6056,7 @@
struct btrfs_io_geometry geom;
ASSERT(bbio_ret);
-
- if (op == BTRFS_MAP_DISCARD)
- return __btrfs_map_block_for_discard(fs_info, logical,
- length, bbio_ret);
+ ASSERT(op != BTRFS_MAP_DISCARD);
ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
if (ret < 0)
@@ -6288,8 +6203,13 @@
ret = -ENOMEM;
goto out;
}
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
+
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ stripe_index++;
+ }
/* build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
@@ -6297,11 +6217,6 @@
u64 tmp;
unsigned rot;
- bbio->raid_map = (u64 *)((void *)bbio->stripes +
- sizeof(struct btrfs_bio_stripe) *
- num_alloc_stripes +
- sizeof(int) * tgtdev_indexes);
-
/* Work out the disk rotation on this stripe-set */
div_u64_rem(stripe_nr, num_stripes, &rot);
@@ -6315,25 +6230,13 @@
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
bbio->raid_map[(i+rot+1) % num_stripes] =
RAID6_Q_STRIPE;
- }
-
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset +
- stripe_nr * map->stripe_len;
- bbio->stripes[i].dev =
- map->stripes[stripe_index].dev;
- stripe_index++;
+ sort_parity_stripes(bbio, num_stripes);
}
if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
- if (bbio->raid_map)
- sort_parity_stripes(bbio, num_stripes);
-
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
@@ -6371,6 +6274,10 @@
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num)
{
+ if (op == BTRFS_MAP_DISCARD)
+ return __btrfs_map_block_for_discard(fs_info, logical,
+ length, bbio_ret);
+
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
mirror_num, 0);
}
@@ -6383,75 +6290,6 @@
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
-{
- struct extent_map *em;
- struct map_lookup *map;
- u64 *buf;
- u64 bytenr;
- u64 length;
- u64 stripe_nr;
- u64 rmap_len;
- int i, j, nr = 0;
-
- em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
- if (IS_ERR(em))
- return -EIO;
-
- map = em->map_lookup;
- length = em->len;
- rmap_len = map->stripe_len;
-
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- length = div_u64(length, map->num_stripes / map->sub_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- length = div_u64(length, map->num_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- length = div_u64(length, nr_data_stripes(map));
- rmap_len = map->stripe_len * nr_data_stripes(map);
- }
-
- buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
- BUG_ON(!buf); /* -ENOMEM */
-
- for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].physical > physical ||
- map->stripes[i].physical + length <= physical)
- continue;
-
- stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
-
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- stripe_nr = stripe_nr * map->num_stripes + i;
- stripe_nr = div_u64(stripe_nr, map->sub_stripes);
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = stripe_nr * map->num_stripes + i;
- } /* else if RAID[56], multiply by nr_data_stripes().
- * Alternatively, just use rmap_len below instead of
- * map->stripe_len */
-
- bytenr = chunk_start + stripe_nr * rmap_len;
- WARN_ON(nr >= map->num_stripes);
- for (j = 0; j < nr; j++) {
- if (buf[j] == bytenr)
- break;
- }
- if (j == nr) {
- WARN_ON(nr >= map->num_stripes);
- buf[nr++] = bytenr;
- }
- }
-
- *logical = buf;
- *naddrs = nr;
- *stripe_len = rmap_len;
-
- free_extent_map(em);
- return 0;
-}
-
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
bio->bi_private = bbio->private;
@@ -6470,23 +6308,18 @@
atomic_inc(&bbio->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- unsigned int stripe_index =
- btrfs_io_bio(bio)->stripe_index;
- struct btrfs_device *dev;
+ struct btrfs_device *dev = btrfs_io_bio(bio)->device;
- BUG_ON(stripe_index >= bbio->num_stripes);
- dev = bbio->stripes[stripe_index].dev;
- if (dev->bdev) {
- if (bio_op(bio) == REQ_OP_WRITE)
- btrfs_dev_stat_inc_and_print(dev,
+ ASSERT(dev->bdev);
+ if (bio_op(bio) == REQ_OP_WRITE)
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev,
+ else if (!(bio->bi_opf & REQ_RAHEAD))
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
- if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev,
+ if (bio->bi_opf & REQ_PREFLUSH)
+ btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
- }
}
}
@@ -6521,73 +6354,25 @@
}
}
-/*
- * see run_scheduled_bios for a description of why bios are collected for
- * async submit.
- *
- * This will add one bio to the pending list for a device and make sure
- * the work struct is scheduled.
- */
-static noinline void btrfs_schedule_bio(struct btrfs_device *device,
- struct bio *bio)
-{
- struct btrfs_fs_info *fs_info = device->fs_info;
- int should_queue = 1;
- struct btrfs_pending_bios *pending_bios;
-
- /* don't bother with additional async steps for reads, right now */
- if (bio_op(bio) == REQ_OP_READ) {
- btrfsic_submit_bio(bio);
- return;
- }
-
- WARN_ON(bio->bi_next);
- bio->bi_next = NULL;
-
- spin_lock(&device->io_lock);
- if (op_is_sync(bio->bi_opf))
- pending_bios = &device->pending_sync_bios;
- else
- pending_bios = &device->pending_bios;
-
- if (pending_bios->tail)
- pending_bios->tail->bi_next = bio;
-
- pending_bios->tail = bio;
- if (!pending_bios->head)
- pending_bios->head = bio;
- if (device->running_pending)
- should_queue = 0;
-
- spin_unlock(&device->io_lock);
-
- if (should_queue)
- btrfs_queue_work(fs_info->submit_workers, &device->work);
-}
-
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
- u64 physical, int dev_nr, int async)
+ u64 physical, struct btrfs_device *dev)
{
- struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
struct btrfs_fs_info *fs_info = bbio->fs_info;
bio->bi_private = bbio;
- btrfs_io_bio(bio)->stripe_index = dev_nr;
+ btrfs_io_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
btrfs_debug_in_rcu(fs_info,
"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
- (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
- bio->bi_iter.bi_size);
+ (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
+ dev->devid, bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
btrfs_bio_counter_inc_noblocked(fs_info);
- if (async)
- btrfs_schedule_bio(dev, bio);
- else
- btrfsic_submit_bio(bio);
+ btrfsic_submit_bio(bio);
}
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
@@ -6608,7 +6393,7 @@
}
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, int async_submit)
+ int mirror_num)
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
@@ -6676,8 +6461,7 @@
else
bio = first_bio;
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
- dev_nr, async_submit);
+ submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
@@ -6697,11 +6481,21 @@
bool seed)
{
struct btrfs_device *device;
+ struct btrfs_fs_devices *seed_devs;
- while (fs_devices) {
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
+ }
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
if (!fsid ||
- !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &fs_devices->devices,
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &seed_devs->devices,
dev_list) {
if (device->devid == devid &&
(!uuid || memcmp(device->uuid, uuid,
@@ -6709,11 +6503,8 @@
return device;
}
}
- if (seed)
- fs_devices = fs_devices->seed;
- else
- return NULL;
}
+
return NULL;
}
@@ -6768,7 +6559,7 @@
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device();
+ dev = __alloc_device(fs_info);
if (IS_ERR(dev))
return dev;
@@ -6790,8 +6581,6 @@
else
generate_random_uuid(dev->uuid);
- btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
-
return dev;
}
@@ -6810,19 +6599,14 @@
{
int index = btrfs_bg_flags_to_raid_index(type);
int ncopies = btrfs_raid_array[index].ncopies;
+ const int nparity = btrfs_raid_array[index].nparity;
int data_stripes;
- switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
- case BTRFS_BLOCK_GROUP_RAID5:
- data_stripes = num_stripes - 1;
- break;
- case BTRFS_BLOCK_GROUP_RAID6:
- data_stripes = num_stripes - 2;
- break;
- default:
+ if (nparity)
+ data_stripes = num_stripes - nparity;
+ else
data_stripes = num_stripes / ncopies;
- break;
- }
+
return div_u64(chunk_len, data_stripes);
}
@@ -6971,13 +6755,11 @@
lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
- fs_devices = fs_info->fs_devices->seed;
- while (fs_devices) {
+ /* This will match only for multi-device seed fs */
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
return fs_devices;
- fs_devices = fs_devices->seed;
- }
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
@@ -6988,11 +6770,15 @@
if (IS_ERR(fs_devices))
return fs_devices;
- fs_devices->seeding = 1;
+ fs_devices->seeding = true;
fs_devices->opened = 1;
return fs_devices;
}
+ /*
+ * Upon first call for a seed fs fsid, just create a private copy of the
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
+ */
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -7000,20 +6786,17 @@
ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
}
if (!fs_devices->seeding) {
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(-EINVAL);
- goto out;
+ return ERR_PTR(-EINVAL);
}
- fs_devices->seed = fs_info->fs_devices->seed;
- fs_info->fs_devices->seed = fs_devices;
-out:
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
+
return fs_devices;
}
@@ -7177,48 +6960,49 @@
sb_array_offset += len;
cur_offset += len;
- if (key.type == BTRFS_CHUNK_ITEM_KEY) {
- chunk = (struct btrfs_chunk *)sb_array_offset;
- /*
- * At least one btrfs_chunk with one stripe must be
- * present, exact stripe count check comes afterwards
- */
- len = btrfs_chunk_item_size(1);
- if (cur_offset + len > array_size)
- goto out_short_read;
-
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
- if (!num_stripes) {
- btrfs_err(fs_info,
- "invalid number of stripes %u in sys_array at offset %u",
- num_stripes, cur_offset);
- ret = -EIO;
- break;
- }
-
- type = btrfs_chunk_type(sb, chunk);
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
- btrfs_err(fs_info,
- "invalid chunk type %llu in sys_array at offset %u",
- type, cur_offset);
- ret = -EIO;
- break;
- }
-
- len = btrfs_chunk_item_size(num_stripes);
- if (cur_offset + len > array_size)
- goto out_short_read;
-
- ret = read_one_chunk(&key, sb, chunk);
- if (ret)
- break;
- } else {
+ if (key.type != BTRFS_CHUNK_ITEM_KEY) {
btrfs_err(fs_info,
"unexpected item type %u in sys_array at offset %u",
(u32)key.type, cur_offset);
ret = -EIO;
break;
}
+
+ chunk = (struct btrfs_chunk *)sb_array_offset;
+ /*
+ * At least one btrfs_chunk with one stripe must be present,
+ * exact stripe count check comes afterwards
+ */
+ len = btrfs_chunk_item_size(1);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ if (!num_stripes) {
+ btrfs_err(fs_info,
+ "invalid number of stripes %u in sys_array at offset %u",
+ num_stripes, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
+ type = btrfs_chunk_type(sb, chunk);
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
+ btrfs_err(fs_info,
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
+ len = btrfs_chunk_item_size(num_stripes);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ ret = read_one_chunk(&key, sb, chunk);
+ if (ret)
+ break;
+
array_ptr += len;
sb_array_offset += len;
cur_offset += len;
@@ -7300,6 +7084,19 @@
return ret;
}
+static void readahead_tree_node_children(struct extent_buffer *node)
+{
+ int i;
+ const int nr_items = btrfs_header_nritems(node);
+
+ for (i = 0; i < nr_items; i++) {
+ u64 start;
+
+ start = btrfs_node_blockptr(node, i);
+ readahead_tree_block(node->fs_info, start);
+ }
+}
+
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->chunk_root;
@@ -7310,6 +7107,7 @@
int ret;
int slot;
u64 total_dev = 0;
+ u64 last_ra_node = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7342,6 +7140,8 @@
if (ret < 0)
goto error;
while (1) {
+ struct extent_buffer *node;
+
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
@@ -7352,6 +7152,17 @@
goto error;
break;
}
+ /*
+ * The nodes on level 1 are not locked but we don't need to do
+ * that during mount time as nothing else can access the tree
+ */
+ node = path->nodes[1];
+ if (node) {
+ if (last_ra_node != node->start) {
+ readahead_tree_node_children(node);
+ last_ra_node = node->start;
+ }
+ }
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
@@ -7404,17 +7215,22 @@
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
- while (fs_devices) {
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
- device->fs_info = fs_info;
- mutex_unlock(&fs_devices->device_list_mutex);
+ fs_devices->fs_info = fs_info;
- fs_devices = fs_devices->seed;
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->fs_info = fs_info;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
+ device->fs_info = fs_info;
+
+ seed_devs->fs_info = fs_info;
}
+ mutex_unlock(&fs_devices->device_list_mutex);
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7440,17 +7256,53 @@
sizeof(val));
}
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
+ struct btrfs_path *path)
+{
+ struct btrfs_dev_stats_item *ptr;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ int item_size;
+ int i, ret, slot;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
+ if (ret) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_set(device, i, 0);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_set(device, i, 0);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+
+ return 0;
+}
+
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
- struct btrfs_key key;
- struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- struct extent_buffer *eb;
- int slot;
- int ret = 0;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
- int i;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7458,43 +7310,22 @@
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- int item_size;
- struct btrfs_dev_stats_item *ptr;
-
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
- key.offset = device->devid;
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
- if (ret) {
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
- btrfs_dev_stat_set(device, i, 0);
- device->dev_stats_valid = 1;
- btrfs_release_path(path);
- continue;
- }
- slot = path->slots[0];
- eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
-
- ptr = btrfs_item_ptr(eb, slot,
- struct btrfs_dev_stats_item);
-
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
- if (item_size >= (1 + i) * sizeof(__le64))
- btrfs_dev_stat_set(device, i,
- btrfs_dev_stats_value(eb, ptr, i));
- else
- btrfs_dev_stat_set(device, i, 0);
- }
-
- device->dev_stats_valid = 1;
- btrfs_dev_stat_print_on_load(device);
- btrfs_release_path(path);
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
}
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
+ }
+ }
+out:
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_free_path(path);
- return ret < 0 ? ret : 0;
+ return ret;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
@@ -7680,36 +7511,6 @@
return 0;
}
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
-{
- struct buffer_head *bh;
- struct btrfs_super_block *disk_super;
- int copy_num;
-
- if (!bdev)
- return;
-
- for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
- copy_num++) {
-
- if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
- continue;
-
- disk_super = (struct btrfs_super_block *)bh->b_data;
-
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- brelse(bh);
- }
-
- /* Notify udev that device has changed */
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
-
- /* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
-}
-
/*
* Update the size and bytes used for each device where it changed. This is
* delayed since we would otherwise get errors while writing out the
@@ -7741,24 +7542,6 @@
mutex_unlock(&trans->fs_info->chunk_mutex);
}
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = fs_info;
- fs_devices = fs_devices->seed;
- }
-}
-
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = NULL;
- fs_devices = fs_devices->seed;
- }
-}
-
/*
* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
*/
@@ -7839,8 +7622,11 @@
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
- NULL, false);
+ struct btrfs_fs_devices *devs;
+
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
+ struct btrfs_fs_devices, seed_list);
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aa6a6d7..f217726 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,12 +17,6 @@
#define BTRFS_STRIPE_LEN SZ_64K
-struct buffer_head;
-struct btrfs_pending_bios {
- struct bio *head;
- struct bio *tail;
-};
-
struct btrfs_io_geometry {
/* remaining bytes before crossing a stripe */
u64 len;
@@ -65,17 +59,10 @@
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string *name;
+ struct rcu_string __rcu *name;
u64 generation;
- spinlock_t io_lock ____cacheline_aligned;
- int running_pending;
- /* regular prio bios */
- struct btrfs_pending_bios pending_bios;
- /* sync bios */
- struct btrfs_pending_bios pending_sync_bios;
-
struct block_device *bdev;
/* the mode sent to blkdev_get */
@@ -132,8 +119,6 @@
/* per-device scrub information */
struct scrub_ctx *scrub_ctx;
- struct btrfs_work work;
-
/* readahead state */
atomic_t reada_in_flight;
u64 reada_next;
@@ -150,6 +135,10 @@
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
struct extent_io_tree alloc_state;
+
+ struct completion kobj_unregister;
+ /* For sysfs/FSID/devinfo/devid/ */
+ struct kobject devid_kobj;
};
/*
@@ -180,7 +169,7 @@
write_seqcount_end(&dev->data_seqcount); \
preempt_enable(); \
}
-#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
#define BTRFS_DEVICE_GETSET_FUNCS(name) \
static inline u64 \
btrfs_device_get_##name(const struct btrfs_device *dev) \
@@ -219,6 +208,10 @@
BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+enum btrfs_chunk_allocation_policy {
+ BTRFS_CHUNK_ALLOC_REGULAR,
+};
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
u8 metadata_uuid[BTRFS_FSID_SIZE];
@@ -254,21 +247,24 @@
*/
struct list_head alloc_list;
- struct btrfs_fs_devices *seed;
- int seeding;
+ struct list_head seed_list;
+ bool seeding;
int opened;
/* set when we find or add a device that doesn't have the
* nonrot flag set
*/
- int rotating;
+ bool rotating;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
struct kobject fsid_kobj;
- struct kobject *device_dir_kobj;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
struct completion kobj_unregister;
+
+ enum btrfs_chunk_allocation_policy chunk_alloc_policy;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -293,7 +289,7 @@
*/
struct btrfs_io_bio {
unsigned int mirror_num;
- unsigned int stripe_index;
+ struct btrfs_device *device;
u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
@@ -413,7 +409,7 @@
return BTRFS_MAP_WRITE;
default:
WARN_ON_ONCE(1);
- /* fall through */
+ fallthrough;
case REQ_OP_READ:
return BTRFS_MAP_READ;
}
@@ -429,20 +425,18 @@
struct btrfs_bio **bbio_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, int async_submit);
+ int mirror_num);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
int btrfs_forget_devices(const char *path);
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
@@ -472,7 +466,7 @@
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_uuid_scan_kthread(void *data);
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
@@ -485,7 +479,6 @@
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
@@ -495,6 +488,7 @@
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
+void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
@@ -557,6 +551,10 @@
return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
+ return BTRFS_RAID_RAID1C3;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
+ return BTRFS_RAID_RAID1C4;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
@@ -571,11 +569,12 @@
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
-struct list_head *btrfs_get_fs_uuids(void);
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 4885851..f1a60bc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -213,9 +213,11 @@
}
out:
btrfs_free_path(path);
- if (!ret)
+ if (!ret) {
set_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags);
+ }
return ret;
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index df1aace..05615a1 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -20,28 +20,22 @@
#include <linux/refcount.h>
#include "compression.h"
+/* workspace buffer size for s390 zlib hardware support */
+#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE)
+
struct workspace {
z_stream strm;
char *buf;
+ unsigned int buf_size;
struct list_head list;
int level;
};
static struct workspace_manager wsm;
-static void zlib_init_workspace_manager(void)
+struct list_head *zlib_get_workspace(unsigned int level)
{
- btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
-}
-
-static void zlib_cleanup_workspace_manager(void)
-{
- btrfs_cleanup_workspace_manager(&wsm);
-}
-
-static struct list_head *zlib_get_workspace(unsigned int level)
-{
- struct list_head *ws = btrfs_get_workspace(&wsm, level);
+ struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level);
struct workspace *workspace = list_entry(ws, struct workspace, list);
workspace->level = level;
@@ -49,12 +43,7 @@
return ws;
}
-static void zlib_put_workspace(struct list_head *ws)
-{
- btrfs_put_workspace(&wsm, ws);
-}
-
-static void zlib_free_workspace(struct list_head *ws)
+void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -63,7 +52,7 @@
kfree(workspace);
}
-static struct list_head *zlib_alloc_workspace(unsigned int level)
+struct list_head *zlib_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
int workspacesize;
@@ -76,7 +65,21 @@
zlib_inflate_workspacesize());
workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
workspace->level = level;
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ workspace->buf = NULL;
+ /*
+ * In case of s390 zlib hardware support, allocate lager workspace
+ * buffer. If allocator fails, fall back to a single page buffer.
+ */
+ if (zlib_deflate_dfltcc_enabled()) {
+ workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE,
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN | GFP_NOIO);
+ workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE;
+ }
+ if (!workspace->buf) {
+ workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ workspace->buf_size = PAGE_SIZE;
+ }
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -88,13 +91,9 @@
return ERR_PTR(-ENOMEM);
}
-static int zlib_compress_pages(struct list_head *ws,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
@@ -104,6 +103,7 @@
struct page *in_page = NULL;
struct page *out_page = NULL;
unsigned long bytes_left;
+ unsigned int in_buf_pages;
unsigned long len = *total_out;
unsigned long nr_dest_pages = *out_pages;
const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
@@ -121,9 +121,6 @@
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
-
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
ret = -ENOMEM;
@@ -133,12 +130,51 @@
pages[0] = out_page;
nr_pages = 1;
- workspace->strm.next_in = data_in;
+ workspace->strm.next_in = workspace->buf;
+ workspace->strm.avail_in = 0;
workspace->strm.next_out = cpage_out;
workspace->strm.avail_out = PAGE_SIZE;
- workspace->strm.avail_in = min(len, PAGE_SIZE);
while (workspace->strm.total_in < len) {
+ /*
+ * Get next input pages and copy the contents to
+ * the workspace buffer if required.
+ */
+ if (workspace->strm.avail_in == 0) {
+ bytes_left = len - workspace->strm.total_in;
+ in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
+ workspace->buf_size / PAGE_SIZE);
+ if (in_buf_pages > 1) {
+ int i;
+
+ for (i = 0; i < in_buf_pages; i++) {
+ if (in_page) {
+ kunmap(in_page);
+ put_page(in_page);
+ }
+ in_page = find_get_page(mapping,
+ start >> PAGE_SHIFT);
+ data_in = kmap(in_page);
+ memcpy(workspace->buf + i * PAGE_SIZE,
+ data_in, PAGE_SIZE);
+ start += PAGE_SIZE;
+ }
+ workspace->strm.next_in = workspace->buf;
+ } else {
+ if (in_page) {
+ kunmap(in_page);
+ put_page(in_page);
+ }
+ in_page = find_get_page(mapping,
+ start >> PAGE_SHIFT);
+ data_in = kmap(in_page);
+ start += PAGE_SIZE;
+ workspace->strm.next_in = data_in;
+ }
+ workspace->strm.avail_in = min(bytes_left,
+ (unsigned long) workspace->buf_size);
+ }
+
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
if (ret != Z_OK) {
pr_debug("BTRFS: deflate in loop returned %d\n",
@@ -180,33 +216,43 @@
/* we're all done */
if (workspace->strm.total_in >= len)
break;
-
- /* we've read in a full page, get a new one */
- if (workspace->strm.avail_in == 0) {
- if (workspace->strm.total_out > max_out)
- break;
-
- bytes_left = len - workspace->strm.total_in;
- kunmap(in_page);
- put_page(in_page);
-
- start += PAGE_SIZE;
- in_page = find_get_page(mapping,
- start >> PAGE_SHIFT);
- data_in = kmap(in_page);
- workspace->strm.avail_in = min(bytes_left,
- PAGE_SIZE);
- workspace->strm.next_in = data_in;
- }
+ if (workspace->strm.total_out > max_out)
+ break;
}
workspace->strm.avail_in = 0;
- ret = zlib_deflate(&workspace->strm, Z_FINISH);
- zlib_deflateEnd(&workspace->strm);
-
- if (ret != Z_STREAM_END) {
- ret = -EIO;
- goto out;
+ /*
+ * Call deflate with Z_FINISH flush parameter providing more output
+ * space but no more input data, until it returns with Z_STREAM_END.
+ */
+ while (ret != Z_STREAM_END) {
+ ret = zlib_deflate(&workspace->strm, Z_FINISH);
+ if (ret == Z_STREAM_END)
+ break;
+ if (ret != Z_OK && ret != Z_BUF_ERROR) {
+ zlib_deflateEnd(&workspace->strm);
+ ret = -EIO;
+ goto out;
+ } else if (workspace->strm.avail_out == 0) {
+ /* get another page for the stream end */
+ kunmap(out_page);
+ if (nr_pages == nr_dest_pages) {
+ out_page = NULL;
+ ret = -E2BIG;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = kmap(out_page);
+ pages[nr_pages] = out_page;
+ nr_pages++;
+ workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.next_out = cpage_out;
+ }
}
+ zlib_deflateEnd(&workspace->strm);
if (workspace->strm.total_out >= workspace->strm.total_in) {
ret = -E2BIG;
@@ -228,7 +274,7 @@
return ret;
}
-static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
@@ -250,7 +296,7 @@
workspace->strm.total_out = 0;
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = workspace->buf_size;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
@@ -289,7 +335,7 @@
}
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = workspace->buf_size;
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
@@ -319,10 +365,9 @@
return ret;
}
-static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
@@ -340,7 +385,7 @@
workspace->strm.total_in = 0;
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = workspace->buf_size;
workspace->strm.total_out = 0;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
@@ -384,7 +429,7 @@
buf_offset = 0;
bytes = min(PAGE_SIZE - pg_offset,
- PAGE_SIZE - buf_offset);
+ PAGE_SIZE - (buf_offset % PAGE_SIZE));
bytes = min(bytes, bytes_left);
kaddr = kmap_atomic(dest_page);
@@ -395,7 +440,7 @@
bytes_left -= bytes;
next:
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = workspace->buf_size;
}
if (ret != Z_STREAM_END && bytes_left != 0)
@@ -419,15 +464,7 @@
}
const struct btrfs_compress_op btrfs_zlib_compress = {
- .init_workspace_manager = zlib_init_workspace_manager,
- .cleanup_workspace_manager = zlib_cleanup_workspace_manager,
- .get_workspace = zlib_get_workspace,
- .put_workspace = zlib_put_workspace,
- .alloc_workspace = zlib_alloc_workspace,
- .free_workspace = zlib_free_workspace,
- .compress_pages = zlib_compress_pages,
- .decompress_bio = zlib_decompress_bio,
- .decompress = zlib_decompress,
+ .workspace_manager = &wsm,
.max_level = 9,
.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
};
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 764d47b..9a48716 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -91,9 +91,8 @@
return container_of(list, struct workspace, list);
}
-static void zstd_free_workspace(struct list_head *ws);
-static struct list_head *zstd_alloc_workspace(unsigned int level);
-
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_alloc_workspace(unsigned int level);
/*
* zstd_reclaim_timer_fn - reclaim timer
* @t: timer
@@ -168,7 +167,7 @@
}
}
-static void zstd_init_workspace_manager(void)
+void zstd_init_workspace_manager(void)
{
struct list_head *ws;
int i;
@@ -194,7 +193,7 @@
}
}
-static void zstd_cleanup_workspace_manager(void)
+void zstd_cleanup_workspace_manager(void)
{
struct workspace *workspace;
int i;
@@ -261,7 +260,7 @@
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-static struct list_head *zstd_get_workspace(unsigned int level)
+struct list_head *zstd_get_workspace(unsigned int level)
{
struct list_head *ws;
unsigned int nofs_flag;
@@ -302,7 +301,7 @@
* isn't set, it is also set here. Only the max level workspace tries and wakes
* up waiting workspaces.
*/
-static void zstd_put_workspace(struct list_head *ws)
+void zstd_put_workspace(struct list_head *ws)
{
struct workspace *workspace = list_to_workspace(ws);
@@ -332,7 +331,7 @@
cond_wake_up(&wsm.wait);
}
-static void zstd_free_workspace(struct list_head *ws)
+void zstd_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -341,7 +340,7 @@
kfree(workspace);
}
-static struct list_head *zstd_alloc_workspace(unsigned int level)
+struct list_head *zstd_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -367,13 +366,9 @@
return ERR_PTR(-ENOMEM);
}
-static int zstd_compress_pages(struct list_head *ws,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
ZSTD_CStream *stream;
@@ -548,7 +543,7 @@
return ret;
}
-static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct page **pages_in = cb->compressed_pages;
@@ -626,10 +621,9 @@
return ret;
}
-static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
ZSTD_DStream *stream;
@@ -712,15 +706,8 @@
}
const struct btrfs_compress_op btrfs_zstd_compress = {
- .init_workspace_manager = zstd_init_workspace_manager,
- .cleanup_workspace_manager = zstd_cleanup_workspace_manager,
- .get_workspace = zstd_get_workspace,
- .put_workspace = zstd_put_workspace,
- .alloc_workspace = zstd_alloc_workspace,
- .free_workspace = zstd_free_workspace,
- .compress_pages = zstd_compress_pages,
- .decompress_bio = zstd_decompress_bio,
- .decompress = zstd_decompress,
+ /* ZSTD uses own workspace manager */
+ .workspace_manager = NULL,
.max_level = ZSTD_BTRFS_MAX_LEVEL,
.default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
};