Blame - fs/btrfs/extent_io.c - hafnium/third_party/linux.git

blob: 5fc65a780f83bae66b3035dcec8b2be7cbf2114d [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
				3	#include <linux/bitops.h>
				4	#include <linux/slab.h>
				5	#include <linux/bio.h>
				6	#include <linux/mm.h>
				7	#include <linux/pagemap.h>
				8	#include <linux/page-flags.h>
				9	#include <linux/spinlock.h>
				10	#include <linux/blkdev.h>
				11	#include <linux/swap.h>
				12	#include <linux/writeback.h>
				13	#include <linux/pagevec.h>
				14	#include <linux/prefetch.h>
				15	#include <linux/cleancache.h>
				16	#include "extent_io.h"
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	17	#include "extent-io-tree.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	18	#include "extent_map.h"
				19	#include "ctree.h"
				20	#include "btrfs_inode.h"
				21	#include "volumes.h"
				22	#include "check-integrity.h"
				23	#include "locking.h"
				24	#include "rcu-string.h"
				25	#include "backref.h"
				26	#include "disk-io.h"
				27
				28	static struct kmem_cache *extent_state_cache;
				29	static struct kmem_cache *extent_buffer_cache;
				30	static struct bio_set btrfs_bioset;
				31
				32	static inline bool extent_state_in_tree(const struct extent_state *state)
				33	{
				34	return !RB_EMPTY_NODE(&state->rb_node);
				35	}
				36
				37	#ifdef CONFIG_BTRFS_DEBUG
/* Debug-only list that alloc_extent_state() links every new state onto. */
static LIST_HEAD(states);
/* Lock passed to the leak helpers when they touch the 'states' list. */
static DEFINE_SPINLOCK(leak_lock);
				40
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	41	static inline void btrfs_leak_debug_add(spinlock_t *lock,
				42	struct list_head *new,
				43	struct list_head *head)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	44	{
				45	unsigned long flags;
				46
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	47	spin_lock_irqsave(lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	48	list_add(new, head);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	49	spin_unlock_irqrestore(lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	50	}
				51
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	52	static inline void btrfs_leak_debug_del(spinlock_t *lock,
				53	struct list_head *entry)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	54	{
				55	unsigned long flags;
				56
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	57	spin_lock_irqsave(lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	58	list_del(entry);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	59	spin_unlock_irqrestore(lock, flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	60	}
				61
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	62	void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
				63	{
				64	struct extent_buffer *eb;
				65	unsigned long flags;
				66
				67	/*
				68	* If we didn't get into open_ctree our allocated_ebs will not be
				69	* initialized, so just skip this.
				70	*/
				71	if (!fs_info->allocated_ebs.next)
				72	return;
				73
				74	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
				75	while (!list_empty(&fs_info->allocated_ebs)) {
				76	eb = list_first_entry(&fs_info->allocated_ebs,
				77	struct extent_buffer, leak_list);
				78	pr_err(
				79	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
				80	eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
				81	btrfs_header_owner(eb));
				82	list_del(&eb->leak_list);
				83	kmem_cache_free(extent_buffer_cache, eb);
				84	}
				85	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
				86	}
				87
				88	static inline void btrfs_extent_state_leak_debug_check(void)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	89	{
				90	struct extent_state *state;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	91
				92	while (!list_empty(&states)) {
				93	state = list_entry(states.next, struct extent_state, leak_list);
				94	pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
				95	state->start, state->end, state->state,
				96	extent_state_in_tree(state),
				97	refcount_read(&state->refs));
				98	list_del(&state->leak_list);
				99	kmem_cache_free(extent_state_cache, state);
				100	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	101	}
				102
				103	#define btrfs_debug_check_extent_io_range(tree, start, end) \
				104	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
				105	static inline void __btrfs_debug_check_extent_io_range(const char *caller,
				106	struct extent_io_tree *tree, u64 start, u64 end)
				107	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	108	struct inode *inode = tree->private_data;
				109	u64 isize;
				110
				111	if (!inode \|\| !is_data_inode(inode))
				112	return;
				113
				114	isize = i_size_read(inode);
				115	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
				116	btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
				117	"%s: ino %llu isize %llu odd range [%llu,%llu]",
				118	caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
				119	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	120	}
				121	#else
/* Without CONFIG_BTRFS_DEBUG the leak/range checks expand to empty statements. */
#define btrfs_leak_debug_add(lock, new, head)		do {} while (0)
#define btrfs_leak_debug_del(lock, entry)		do {} while (0)
#define btrfs_extent_state_leak_debug_check()		do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
				126	#endif
				127
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	128	struct tree_entry {
				129	u64 start;
				130	u64 end;
				131	struct rb_node rb_node;
				132	};
				133
				134	struct extent_page_data {
				135	struct bio *bio;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	136	/* tells writepage not to lock the state bits for this range
				137	* it still does the unlocking
				138	*/
				139	unsigned int extent_locked:1;
				140
				141	/* tells the submit_bio code to use REQ_SYNC */
				142	unsigned int sync_io:1;
				143	};
				144
				145	static int add_extent_changeset(struct extent_state *state, unsigned bits,
				146	struct extent_changeset *changeset,
				147	int set)
				148	{
				149	int ret;
				150
				151	if (!changeset)
				152	return 0;
				153	if (set && (state->state & bits) == bits)
				154	return 0;
				155	if (!set && (state->state & bits) == 0)
				156	return 0;
				157	changeset->bytes_changed += state->end - state->start + 1;
				158	ret = ulist_add(&changeset->range_changed, state->start, state->end,
				159	GFP_ATOMIC);
				160	return ret;
				161	}
				162
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	163	int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				164	unsigned long bio_flags)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	165	{
				166	blk_status_t ret = 0;
				167	struct extent_io_tree *tree = bio->bi_private;
				168
				169	bio->bi_private = NULL;
				170
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	171	if (is_data_inode(tree->private_data))
				172	ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
				173	bio_flags);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	174	else
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	175	ret = btrfs_submit_metadata_bio(tree->private_data, bio,
				176	mirror_num, bio_flags);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	177
				178	return blk_status_to_errno(ret);
				179	}
				180
				181	/* Cleanup unsubmitted bios */
				182	static void end_write_bio(struct extent_page_data *epd, int ret)
				183	{
				184	if (epd->bio) {
				185	epd->bio->bi_status = errno_to_blk_status(ret);
				186	bio_endio(epd->bio);
				187	epd->bio = NULL;
				188	}
				189	}
				190
				191	/*
				192	* Submit bio from extent page data via submit_one_bio
				193	*
				194	* Return 0 if everything is OK.
				195	* Return <0 for error.
				196	*/
				197	static int __must_check flush_write_bio(struct extent_page_data *epd)
				198	{
				199	int ret = 0;
				200
				201	if (epd->bio) {
				202	ret = submit_one_bio(epd->bio, 0, 0);
				203	/*
				204	* Clean up of epd->bio is handled by its endio function.
				205	* And endio is either triggered by successful bio execution
				206	* or the error handler of submit bio hook.
				207	* So at this point, no matter what happened, we don't need
				208	* to clean up epd->bio.
				209	*/
				210	epd->bio = NULL;
				211	}
				212	return ret;
				213	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	214
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	215	int __init extent_state_cache_init(void)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	216	{
				217	extent_state_cache = kmem_cache_create("btrfs_extent_state",
				218	sizeof(struct extent_state), 0,
				219	SLAB_MEM_SPREAD, NULL);
				220	if (!extent_state_cache)
				221	return -ENOMEM;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	222	return 0;
				223	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	224
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	225	int __init extent_io_init(void)
				226	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	227	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
				228	sizeof(struct extent_buffer), 0,
				229	SLAB_MEM_SPREAD, NULL);
				230	if (!extent_buffer_cache)
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	231	return -ENOMEM;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	232
				233	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
				234	offsetof(struct btrfs_io_bio, bio),
				235	BIOSET_NEED_BVECS))
				236	goto free_buffer_cache;
				237
				238	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
				239	goto free_bioset;
				240
				241	return 0;
				242
				243	free_bioset:
				244	bioset_exit(&btrfs_bioset);
				245
				246	free_buffer_cache:
				247	kmem_cache_destroy(extent_buffer_cache);
				248	extent_buffer_cache = NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	249	return -ENOMEM;
				250	}
				251
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	252	void __cold extent_state_cache_exit(void)
				253	{
				254	btrfs_extent_state_leak_debug_check();
				255	kmem_cache_destroy(extent_state_cache);
				256	}
				257
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	258	void __cold extent_io_exit(void)
				259	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	260	/*
				261	* Make sure all delayed rcu free are flushed before we
				262	* destroy caches.
				263	*/
				264	rcu_barrier();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	265	kmem_cache_destroy(extent_buffer_cache);
				266	bioset_exit(&btrfs_bioset);
				267	}
				268
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	269	/*
				270	* For the file_extent_tree, we want to hold the inode lock when we lookup and
				271	* update the disk_i_size, but lockdep will complain because our io_tree we hold
				272	* the tree lock and get the inode lock when setting delalloc. These two things
				273	* are unrelated, so make a class for the file_extent_tree so we don't get the
				274	* two locking patterns mixed up.
				275	*/
				276	static struct lock_class_key file_extent_tree_class;
				277
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	278	void extent_io_tree_init(struct btrfs_fs_info *fs_info,
				279	struct extent_io_tree *tree, unsigned int owner,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	280	void *private_data)
				281	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	282	tree->fs_info = fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	283	tree->state = RB_ROOT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	284	tree->dirty_bytes = 0;
				285	spin_lock_init(&tree->lock);
				286	tree->private_data = private_data;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	287	tree->owner = owner;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	288	if (owner == IO_TREE_INODE_FILE_EXTENT)
				289	lockdep_set_class(&tree->lock, &file_extent_tree_class);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	290	}
				291
				292	void extent_io_tree_release(struct extent_io_tree *tree)
				293	{
				294	spin_lock(&tree->lock);
				295	/*
				296	* Do a single barrier for the waitqueue_active check here, the state
				297	* of the waitqueue should not change once extent_io_tree_release is
				298	* called.
				299	*/
				300	smp_mb();
				301	while (!RB_EMPTY_ROOT(&tree->state)) {
				302	struct rb_node *node;
				303	struct extent_state *state;
				304
				305	node = rb_first(&tree->state);
				306	state = rb_entry(node, struct extent_state, rb_node);
				307	rb_erase(&state->rb_node, &tree->state);
				308	RB_CLEAR_NODE(&state->rb_node);
				309	/*
				310	* btree io trees aren't supposed to have tasks waiting for
				311	* changes in the flags of extent states ever.
				312	*/
				313	ASSERT(!waitqueue_active(&state->wq));
				314	free_extent_state(state);
				315
				316	cond_resched_lock(&tree->lock);
				317	}
				318	spin_unlock(&tree->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	319	}
				320
				321	static struct extent_state *alloc_extent_state(gfp_t mask)
				322	{
				323	struct extent_state *state;
				324
				325	/*
				326	* The given mask might be not appropriate for the slab allocator,
				327	* drop the unsupported bits
				328	*/
				329	mask &= ~(__GFP_DMA32\|__GFP_HIGHMEM);
				330	state = kmem_cache_alloc(extent_state_cache, mask);
				331	if (!state)
				332	return state;
				333	state->state = 0;
				334	state->failrec = NULL;
				335	RB_CLEAR_NODE(&state->rb_node);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	336	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	337	refcount_set(&state->refs, 1);
				338	init_waitqueue_head(&state->wq);
				339	trace_alloc_extent_state(state, mask, _RET_IP_);
				340	return state;
				341	}
				342
				343	void free_extent_state(struct extent_state *state)
				344	{
				345	if (!state)
				346	return;
				347	if (refcount_dec_and_test(&state->refs)) {
				348	WARN_ON(extent_state_in_tree(state));
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	349	btrfs_leak_debug_del(&leak_lock, &state->leak_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	350	trace_free_extent_state(state, _RET_IP_);
				351	kmem_cache_free(extent_state_cache, state);
				352	}
				353	}
				354
				355	static struct rb_node tree_insert(struct rb_root root,
				356	struct rb_node *search_start,
				357	u64 offset,
				358	struct rb_node *node,
				359	struct rb_node ***p_in,
				360	struct rb_node **parent_in)
				361	{
				362	struct rb_node **p;
				363	struct rb_node *parent = NULL;
				364	struct tree_entry *entry;
				365
				366	if (p_in && parent_in) {
				367	p = *p_in;
				368	parent = *parent_in;
				369	goto do_insert;
				370	}
				371
				372	p = search_start ? &search_start : &root->rb_node;
				373	while (*p) {
				374	parent = *p;
				375	entry = rb_entry(parent, struct tree_entry, rb_node);
				376
				377	if (offset < entry->start)
				378	p = &(*p)->rb_left;
				379	else if (offset > entry->end)
				380	p = &(*p)->rb_right;
				381	else
				382	return parent;
				383	}
				384
				385	do_insert:
				386	rb_link_node(node, parent, p);
				387	rb_insert_color(node, root);
				388	return NULL;
				389	}
				390
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	391	/**
				392	* __etree_search - searche @tree for an entry that contains @offset. Such
				393	* entry would have entry->start <= offset && entry->end >= offset.
				394	*
				395	* @tree - the tree to search
				396	* @offset - offset that should fall within an entry in @tree
				397	* @next_ret - pointer to the first entry whose range ends after @offset
				398	* @prev - pointer to the first entry whose range begins before @offset
				399	* @p_ret - pointer where new node should be anchored (used when inserting an
				400	* entry in the tree)
				401	* @parent_ret - points to entry which would have been the parent of the entry,
				402	* containing @offset
				403	*
				404	* This function returns a pointer to the entry that contains @offset byte
				405	* address. If no such entry exists, then NULL is returned and the other
				406	* pointer arguments to the function are filled, otherwise the found entry is
				407	* returned and other pointers are left untouched.
				408	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	409	static struct rb_node __etree_search(struct extent_io_tree tree, u64 offset,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	410	struct rb_node **next_ret,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	411	struct rb_node **prev_ret,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	412	struct rb_node ***p_ret,
				413	struct rb_node **parent_ret)
				414	{
				415	struct rb_root *root = &tree->state;
				416	struct rb_node **n = &root->rb_node;
				417	struct rb_node *prev = NULL;
				418	struct rb_node *orig_prev = NULL;
				419	struct tree_entry *entry;
				420	struct tree_entry *prev_entry = NULL;
				421
				422	while (*n) {
				423	prev = *n;
				424	entry = rb_entry(prev, struct tree_entry, rb_node);
				425	prev_entry = entry;
				426
				427	if (offset < entry->start)
				428	n = &(*n)->rb_left;
				429	else if (offset > entry->end)
				430	n = &(*n)->rb_right;
				431	else
				432	return *n;
				433	}
				434
				435	if (p_ret)
				436	*p_ret = n;
				437	if (parent_ret)
				438	*parent_ret = prev;
				439
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	440	if (next_ret) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	441	orig_prev = prev;
				442	while (prev && offset > prev_entry->end) {
				443	prev = rb_next(prev);
				444	prev_entry = rb_entry(prev, struct tree_entry, rb_node);
				445	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	446	*next_ret = prev;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	447	prev = orig_prev;
				448	}
				449
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	450	if (prev_ret) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	451	prev_entry = rb_entry(prev, struct tree_entry, rb_node);
				452	while (prev && offset < prev_entry->start) {
				453	prev = rb_prev(prev);
				454	prev_entry = rb_entry(prev, struct tree_entry, rb_node);
				455	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	456	*prev_ret = prev;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	457	}
				458	return NULL;
				459	}
				460
				461	static inline struct rb_node *
				462	tree_search_for_insert(struct extent_io_tree *tree,
				463	u64 offset,
				464	struct rb_node ***p_ret,
				465	struct rb_node **parent_ret)
				466	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	467	struct rb_node *next= NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	468	struct rb_node *ret;
				469
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	470	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	471	if (!ret)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	472	return next;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	473	return ret;
				474	}
				475
				476	static inline struct rb_node tree_search(struct extent_io_tree tree,
				477	u64 offset)
				478	{
				479	return tree_search_for_insert(tree, offset, NULL, NULL);
				480	}
				481
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	482	/*
				483	* utility function to look for merge candidates inside a given range.
				484	* Any extents with matching state are merged together into a single
				485	* extent in the tree. Extents with EXTENT_IO in their state field
				486	* are not merged because the end_io handlers need to be able to do
				487	* operations on them without sleeping (or doing allocations/splits).
				488	*
				489	* This should be called with the tree lock held.
				490	*/
				491	static void merge_state(struct extent_io_tree *tree,
				492	struct extent_state *state)
				493	{
				494	struct extent_state *other;
				495	struct rb_node *other_node;
				496
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	497	if (state->state & (EXTENT_LOCKED \| EXTENT_BOUNDARY))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	498	return;
				499
				500	other_node = rb_prev(&state->rb_node);
				501	if (other_node) {
				502	other = rb_entry(other_node, struct extent_state, rb_node);
				503	if (other->end == state->start - 1 &&
				504	other->state == state->state) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	505	if (tree->private_data &&
				506	is_data_inode(tree->private_data))
				507	btrfs_merge_delalloc_extent(tree->private_data,
				508	state, other);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	509	state->start = other->start;
				510	rb_erase(&other->rb_node, &tree->state);
				511	RB_CLEAR_NODE(&other->rb_node);
				512	free_extent_state(other);
				513	}
				514	}
				515	other_node = rb_next(&state->rb_node);
				516	if (other_node) {
				517	other = rb_entry(other_node, struct extent_state, rb_node);
				518	if (other->start == state->end + 1 &&
				519	other->state == state->state) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	520	if (tree->private_data &&
				521	is_data_inode(tree->private_data))
				522	btrfs_merge_delalloc_extent(tree->private_data,
				523	state, other);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	524	state->end = other->end;
				525	rb_erase(&other->rb_node, &tree->state);
				526	RB_CLEAR_NODE(&other->rb_node);
				527	free_extent_state(other);
				528	}
				529	}
				530	}
				531
/* Forward declaration; insert_state() calls this before linking the state. */
static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits,
			   struct extent_changeset *changeset);
				535
				536	/*
				537	* insert an extent_state struct into the tree. 'bits' are set on the
				538	* struct before it is inserted.
				539	*
				540	* This may return -EEXIST if the extent is already there, in which case the
				541	* state struct is freed.
				542	*
				543	* The tree lock is not taken internally. This is a utility function and
				544	* probably isn't what you want to call (see set/clear_extent_bit).
				545	*/
				546	static int insert_state(struct extent_io_tree *tree,
				547	struct extent_state *state, u64 start, u64 end,
				548	struct rb_node ***p,
				549	struct rb_node **parent,
				550	unsigned bits, struct extent_changeset changeset)
				551	{
				552	struct rb_node *node;
				553
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	554	if (end < start) {
				555	btrfs_err(tree->fs_info,
				556	"insert state: end < start %llu %llu", end, start);
				557	WARN_ON(1);
				558	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	559	state->start = start;
				560	state->end = end;
				561
				562	set_state_bits(tree, state, bits, changeset);
				563
				564	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
				565	if (node) {
				566	struct extent_state *found;
				567	found = rb_entry(node, struct extent_state, rb_node);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	568	btrfs_err(tree->fs_info,
				569	"found node %llu %llu on insert of %llu %llu",
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	570	found->start, found->end, start, end);
				571	return -EEXIST;
				572	}
				573	merge_state(tree, state);
				574	return 0;
				575	}
				576
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	577	/*
				578	* split a given extent state struct in two, inserting the preallocated
				579	* struct 'prealloc' as the newly created second half. 'split' indicates an
				580	* offset inside 'orig' where it should be split.
				581	*
				582	* Before calling,
				583	* the tree has 'orig' at [orig->start, orig->end]. After calling, there
				584	* are two extent state structs in the tree:
				585	* prealloc: [orig->start, split - 1]
				586	* orig: [ split, orig->end ]
				587	*
				588	* The tree locks are not taken by this function. They need to be held
				589	* by the caller.
				590	*/
				591	static int split_state(struct extent_io_tree tree, struct extent_state orig,
				592	struct extent_state *prealloc, u64 split)
				593	{
				594	struct rb_node *node;
				595
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	596	if (tree->private_data && is_data_inode(tree->private_data))
				597	btrfs_split_delalloc_extent(tree->private_data, orig, split);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	598
				599	prealloc->start = orig->start;
				600	prealloc->end = split - 1;
				601	prealloc->state = orig->state;
				602	orig->start = split;
				603
				604	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
				605	&prealloc->rb_node, NULL, NULL);
				606	if (node) {
				607	free_extent_state(prealloc);
				608	return -EEXIST;
				609	}
				610	return 0;
				611	}
				612
				613	static struct extent_state next_state(struct extent_state state)
				614	{
				615	struct rb_node *next = rb_next(&state->rb_node);
				616	if (next)
				617	return rb_entry(next, struct extent_state, rb_node);
				618	else
				619	return NULL;
				620	}
				621
				622	/*
				623	* utility function to clear some bits in an extent state struct.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	624	* it will optionally wake up anyone waiting on this state (wake == 1).
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	625	*
				626	* If no bits are set on the state struct after clearing things, the
				627	* struct is freed and removed from the tree
				628	*/
				629	static struct extent_state clear_state_bit(struct extent_io_tree tree,
				630	struct extent_state *state,
				631	unsigned *bits, int wake,
				632	struct extent_changeset *changeset)
				633	{
				634	struct extent_state *next;
				635	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
				636	int ret;
				637
				638	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
				639	u64 range = state->end - state->start + 1;
				640	WARN_ON(range > tree->dirty_bytes);
				641	tree->dirty_bytes -= range;
				642	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	643
				644	if (tree->private_data && is_data_inode(tree->private_data))
				645	btrfs_clear_delalloc_extent(tree->private_data, state, bits);
				646
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	647	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
				648	BUG_ON(ret < 0);
				649	state->state &= ~bits_to_clear;
				650	if (wake)
				651	wake_up(&state->wq);
				652	if (state->state == 0) {
				653	next = next_state(state);
				654	if (extent_state_in_tree(state)) {
				655	rb_erase(&state->rb_node, &tree->state);
				656	RB_CLEAR_NODE(&state->rb_node);
				657	free_extent_state(state);
				658	} else {
				659	WARN_ON(1);
				660	}
				661	} else {
				662	merge_state(tree, state);
				663	next = next_state(state);
				664	}
				665	return next;
				666	}
				667
				668	static struct extent_state *
				669	alloc_extent_state_atomic(struct extent_state *prealloc)
				670	{
				671	if (!prealloc)
				672	prealloc = alloc_extent_state(GFP_ATOMIC);
				673
				674	return prealloc;
				675	}
				676
				677	static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
				678	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	679	btrfs_panic(tree->fs_info, err,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	680	"locking error: extent tree was modified by another thread while locked");
				681	}
				682
				683	/*
				684	* clear some bits on a range in the tree. This may require splitting
				685	* or inserting elements in the tree, so the gfp mask is used to
				686	* indicate which allocations or sleeping are allowed.
				687	*
				688	* pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
				689	* the given range from the tree regardless of state (ie for truncate).
				690	*
				691	* the range [start, end] is inclusive.
				692	*
				693	* This takes the tree lock, and returns 0 on success and < 0 on error.
				694	*/
				695	int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				696	unsigned bits, int wake, int delete,
				697	struct extent_state **cached_state,
				698	gfp_t mask, struct extent_changeset *changeset)
				699	{
				700	struct extent_state *state;
				701	struct extent_state *cached;
				702	struct extent_state *prealloc = NULL;
				703	struct rb_node *node;
				704	u64 last_end;
				705	int err;
				706	int clear = 0;
				707
				708	btrfs_debug_check_extent_io_range(tree, start, end);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	709	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	710
				711	if (bits & EXTENT_DELALLOC)
				712	bits \|= EXTENT_NORESERVE;
				713
				714	if (delete)
				715	bits \|= ~EXTENT_CTLBITS;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	716
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	717	if (bits & (EXTENT_LOCKED \| EXTENT_BOUNDARY))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	718	clear = 1;
				719	again:
				720	if (!prealloc && gfpflags_allow_blocking(mask)) {
				721	/*
				722	* Don't care for allocation failure here because we might end
				723	* up not needing the pre-allocated extent state at all, which
				724	* is the case if we only have in the tree extent states that
				725	* cover our input range and don't cover too any other range.
				726	* If we end up needing a new extent state we allocate it later.
				727	*/
				728	prealloc = alloc_extent_state(mask);
				729	}
				730
				731	spin_lock(&tree->lock);
				732	if (cached_state) {
				733	cached = *cached_state;
				734
				735	if (clear) {
				736	*cached_state = NULL;
				737	cached_state = NULL;
				738	}
				739
				740	if (cached && extent_state_in_tree(cached) &&
				741	cached->start <= start && cached->end > start) {
				742	if (clear)
				743	refcount_dec(&cached->refs);
				744	state = cached;
				745	goto hit_next;
				746	}
				747	if (clear)
				748	free_extent_state(cached);
				749	}
				750	/*
				751	* this search will find the extents that end after
				752	* our range starts
				753	*/
				754	node = tree_search(tree, start);
				755	if (!node)
				756	goto out;
				757	state = rb_entry(node, struct extent_state, rb_node);
				758	hit_next:
				759	if (state->start > end)
				760	goto out;
				761	WARN_ON(state->end < start);
				762	last_end = state->end;
				763
				764	/* the state doesn't have the wanted bits, go ahead */
				765	if (!(state->state & bits)) {
				766	state = next_state(state);
				767	goto next;
				768	}
				769
				770	/*
				771	* \| ---- desired range ---- \|
				772	* \| state \| or
				773	* \| ------------- state -------------- \|
				774	*
				775	* We need to split the extent we found, and may flip
				776	* bits on second half.
				777	*
				778	* If the extent we found extends past our range, we
				779	* just split and search again. It'll get split again
				780	* the next time though.
				781	*
				782	* If the extent we found is inside our range, we clear
				783	* the desired bit on it.
				784	*/
				785
				786	if (state->start < start) {
				787	prealloc = alloc_extent_state_atomic(prealloc);
				788	BUG_ON(!prealloc);
				789	err = split_state(tree, state, prealloc, start);
				790	if (err)
				791	extent_io_tree_panic(tree, err);
				792
				793	prealloc = NULL;
				794	if (err)
				795	goto out;
				796	if (state->end <= end) {
				797	state = clear_state_bit(tree, state, &bits, wake,
				798	changeset);
				799	goto next;
				800	}
				801	goto search_again;
				802	}
				803	/*
				804	* \| ---- desired range ---- \|
				805	* \| state \|
				806	* We need to split the extent, and clear the bit
				807	* on the first half
				808	*/
				809	if (state->start <= end && state->end > end) {
				810	prealloc = alloc_extent_state_atomic(prealloc);
				811	BUG_ON(!prealloc);
				812	err = split_state(tree, state, prealloc, end + 1);
				813	if (err)
				814	extent_io_tree_panic(tree, err);
				815
				816	if (wake)
				817	wake_up(&state->wq);
				818
				819	clear_state_bit(tree, prealloc, &bits, wake, changeset);
				820
				821	prealloc = NULL;
				822	goto out;
				823	}
				824
				825	state = clear_state_bit(tree, state, &bits, wake, changeset);
				826	next:
				827	if (last_end == (u64)-1)
				828	goto out;
				829	start = last_end + 1;
				830	if (start <= end && state && !need_resched())
				831	goto hit_next;
				832
				833	search_again:
				834	if (start > end)
				835	goto out;
				836	spin_unlock(&tree->lock);
				837	if (gfpflags_allow_blocking(mask))
				838	cond_resched();
				839	goto again;
				840
				841	out:
				842	spin_unlock(&tree->lock);
				843	if (prealloc)
				844	free_extent_state(prealloc);
				845
				846	return 0;
				847
				848	}
				849
				850	static void wait_on_state(struct extent_io_tree *tree,
				851	struct extent_state *state)
				852	__releases(tree->lock)
				853	__acquires(tree->lock)
				854	{
				855	DEFINE_WAIT(wait);
				856	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
				857	spin_unlock(&tree->lock);
				858	schedule();
				859	spin_lock(&tree->lock);
				860	finish_wait(&state->wq, &wait);
				861	}
				862
				863	/*
				864	* waits for one or more bits to clear on a range in the state tree.
				865	* The range [start, end] is inclusive.
				866	* The tree lock is taken by this function
				867	*/
				868	static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				869	unsigned long bits)
				870	{
				871	struct extent_state *state;
				872	struct rb_node *node;
				873
				874	btrfs_debug_check_extent_io_range(tree, start, end);
				875
				876	spin_lock(&tree->lock);
				877	again:
				878	while (1) {
				879	/*
				880	* this search will find all the extents that end after
				881	* our range starts
				882	*/
				883	node = tree_search(tree, start);
				884	process_node:
				885	if (!node)
				886	break;
				887
				888	state = rb_entry(node, struct extent_state, rb_node);
				889
				890	if (state->start > end)
				891	goto out;
				892
				893	if (state->state & bits) {
				894	start = state->start;
				895	refcount_inc(&state->refs);
				896	wait_on_state(tree, state);
				897	free_extent_state(state);
				898	goto again;
				899	}
				900	start = state->end + 1;
				901
				902	if (start > end)
				903	break;
				904
				905	if (!cond_resched_lock(&tree->lock)) {
				906	node = rb_next(node);
				907	goto process_node;
				908	}
				909	}
				910	out:
				911	spin_unlock(&tree->lock);
				912	}
				913
				914	static void set_state_bits(struct extent_io_tree *tree,
				915	struct extent_state *state,
				916	unsigned bits, struct extent_changeset changeset)
				917	{
				918	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
				919	int ret;
				920
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	921	if (tree->private_data && is_data_inode(tree->private_data))
				922	btrfs_set_delalloc_extent(tree->private_data, state, bits);
				923
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	924	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
				925	u64 range = state->end - state->start + 1;
				926	tree->dirty_bytes += range;
				927	}
				928	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
				929	BUG_ON(ret < 0);
				930	state->state \|= bits_to_set;
				931	}
				932
				933	static void cache_state_if_flags(struct extent_state *state,
				934	struct extent_state **cached_ptr,
				935	unsigned flags)
				936	{
				937	if (cached_ptr && !(*cached_ptr)) {
				938	if (!flags \|\| (state->state & flags)) {
				939	*cached_ptr = state;
				940	refcount_inc(&state->refs);
				941	}
				942	}
				943	}
				944
				945	static void cache_state(struct extent_state *state,
				946	struct extent_state **cached_ptr)
				947	{
				948	return cache_state_if_flags(state, cached_ptr,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	949	EXTENT_LOCKED \| EXTENT_BOUNDARY);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	950	}
				951
				952	/*
				953	* set some bits on a range in the tree. This may require allocations or
				954	* sleeping, so the gfp mask is used to indicate what is allowed.
				955	*
				956	* If any of the exclusive bits are set, this will fail with -EEXIST if some
				957	* part of the range already has the desired bits set. The start of the
				958	* existing range is returned in failed_start in this case.
				959	*
				960	* [start, end] is inclusive This takes the tree lock.
				961	*/
				962
				963	static int __must_check
				964	__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				965	unsigned bits, unsigned exclusive_bits,
				966	u64 failed_start, struct extent_state *cached_state,
				967	gfp_t mask, struct extent_changeset *changeset)
				968	{
				969	struct extent_state *state;
				970	struct extent_state *prealloc = NULL;
				971	struct rb_node *node;
				972	struct rb_node **p;
				973	struct rb_node *parent;
				974	int err = 0;
				975	u64 last_start;
				976	u64 last_end;
				977
				978	btrfs_debug_check_extent_io_range(tree, start, end);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	979	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	980
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	981	again:
				982	if (!prealloc && gfpflags_allow_blocking(mask)) {
				983	/*
				984	* Don't care for allocation failure here because we might end
				985	* up not needing the pre-allocated extent state at all, which
				986	* is the case if we only have in the tree extent states that
				987	* cover our input range and don't cover too any other range.
				988	* If we end up needing a new extent state we allocate it later.
				989	*/
				990	prealloc = alloc_extent_state(mask);
				991	}
				992
				993	spin_lock(&tree->lock);
				994	if (cached_state && *cached_state) {
				995	state = *cached_state;
				996	if (state->start <= start && state->end > start &&
				997	extent_state_in_tree(state)) {
				998	node = &state->rb_node;
				999	goto hit_next;
				1000	}
				1001	}
				1002	/*
				1003	* this search will find all the extents that end after
				1004	* our range starts.
				1005	*/
				1006	node = tree_search_for_insert(tree, start, &p, &parent);
				1007	if (!node) {
				1008	prealloc = alloc_extent_state_atomic(prealloc);
				1009	BUG_ON(!prealloc);
				1010	err = insert_state(tree, prealloc, start, end,
				1011	&p, &parent, &bits, changeset);
				1012	if (err)
				1013	extent_io_tree_panic(tree, err);
				1014
				1015	cache_state(prealloc, cached_state);
				1016	prealloc = NULL;
				1017	goto out;
				1018	}
				1019	state = rb_entry(node, struct extent_state, rb_node);
				1020	hit_next:
				1021	last_start = state->start;
				1022	last_end = state->end;
				1023
				1024	/*
				1025	* \| ---- desired range ---- \|
				1026	* \| state \|
				1027	*
				1028	* Just lock what we found and keep going
				1029	*/
				1030	if (state->start == start && state->end <= end) {
				1031	if (state->state & exclusive_bits) {
				1032	*failed_start = state->start;
				1033	err = -EEXIST;
				1034	goto out;
				1035	}
				1036
				1037	set_state_bits(tree, state, &bits, changeset);
				1038	cache_state(state, cached_state);
				1039	merge_state(tree, state);
				1040	if (last_end == (u64)-1)
				1041	goto out;
				1042	start = last_end + 1;
				1043	state = next_state(state);
				1044	if (start < end && state && state->start == start &&
				1045	!need_resched())
				1046	goto hit_next;
				1047	goto search_again;
				1048	}
				1049
				1050	/*
				1051	* \| ---- desired range ---- \|
				1052	* \| state \|
				1053	* or
				1054	* \| ------------- state -------------- \|
				1055	*
				1056	* We need to split the extent we found, and may flip bits on
				1057	* second half.
				1058	*
				1059	* If the extent we found extends past our
				1060	* range, we just split and search again. It'll get split
				1061	* again the next time though.
				1062	*
				1063	* If the extent we found is inside our range, we set the
				1064	* desired bit on it.
				1065	*/
				1066	if (state->start < start) {
				1067	if (state->state & exclusive_bits) {
				1068	*failed_start = start;
				1069	err = -EEXIST;
				1070	goto out;
				1071	}
				1072
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1073	/*
				1074	* If this extent already has all the bits we want set, then
				1075	* skip it, not necessary to split it or do anything with it.
				1076	*/
				1077	if ((state->state & bits) == bits) {
				1078	start = state->end + 1;
				1079	cache_state(state, cached_state);
				1080	goto search_again;
				1081	}
				1082
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1083	prealloc = alloc_extent_state_atomic(prealloc);
				1084	BUG_ON(!prealloc);
				1085	err = split_state(tree, state, prealloc, start);
				1086	if (err)
				1087	extent_io_tree_panic(tree, err);
				1088
				1089	prealloc = NULL;
				1090	if (err)
				1091	goto out;
				1092	if (state->end <= end) {
				1093	set_state_bits(tree, state, &bits, changeset);
				1094	cache_state(state, cached_state);
				1095	merge_state(tree, state);
				1096	if (last_end == (u64)-1)
				1097	goto out;
				1098	start = last_end + 1;
				1099	state = next_state(state);
				1100	if (start < end && state && state->start == start &&
				1101	!need_resched())
				1102	goto hit_next;
				1103	}
				1104	goto search_again;
				1105	}
				1106	/*
				1107	* \| ---- desired range ---- \|
				1108	* \| state \| or \| state \|
				1109	*
				1110	* There's a hole, we need to insert something in it and
				1111	* ignore the extent we found.
				1112	*/
				1113	if (state->start > start) {
				1114	u64 this_end;
				1115	if (end < last_start)
				1116	this_end = end;
				1117	else
				1118	this_end = last_start - 1;
				1119
				1120	prealloc = alloc_extent_state_atomic(prealloc);
				1121	BUG_ON(!prealloc);
				1122
				1123	/*
				1124	* Avoid to free 'prealloc' if it can be merged with
				1125	* the later extent.
				1126	*/
				1127	err = insert_state(tree, prealloc, start, this_end,
				1128	NULL, NULL, &bits, changeset);
				1129	if (err)
				1130	extent_io_tree_panic(tree, err);
				1131
				1132	cache_state(prealloc, cached_state);
				1133	prealloc = NULL;
				1134	start = this_end + 1;
				1135	goto search_again;
				1136	}
				1137	/*
				1138	* \| ---- desired range ---- \|
				1139	* \| state \|
				1140	* We need to split the extent, and set the bit
				1141	* on the first half
				1142	*/
				1143	if (state->start <= end && state->end > end) {
				1144	if (state->state & exclusive_bits) {
				1145	*failed_start = start;
				1146	err = -EEXIST;
				1147	goto out;
				1148	}
				1149
				1150	prealloc = alloc_extent_state_atomic(prealloc);
				1151	BUG_ON(!prealloc);
				1152	err = split_state(tree, state, prealloc, end + 1);
				1153	if (err)
				1154	extent_io_tree_panic(tree, err);
				1155
				1156	set_state_bits(tree, prealloc, &bits, changeset);
				1157	cache_state(prealloc, cached_state);
				1158	merge_state(tree, prealloc);
				1159	prealloc = NULL;
				1160	goto out;
				1161	}
				1162
				1163	search_again:
				1164	if (start > end)
				1165	goto out;
				1166	spin_unlock(&tree->lock);
				1167	if (gfpflags_allow_blocking(mask))
				1168	cond_resched();
				1169	goto again;
				1170
				1171	out:
				1172	spin_unlock(&tree->lock);
				1173	if (prealloc)
				1174	free_extent_state(prealloc);
				1175
				1176	return err;
				1177
				1178	}
				1179
				1180	int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				1181	unsigned bits, u64 * failed_start,
				1182	struct extent_state **cached_state, gfp_t mask)
				1183	{
				1184	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				1185	cached_state, mask, NULL);
				1186	}
				1187
				1188
				1189	/**
				1190	* convert_extent_bit - convert all bits in a given range from one bit to
				1191	* another
				1192	* @tree: the io tree to search
				1193	* @start: the start offset in bytes
				1194	* @end: the end offset in bytes (inclusive)
				1195	* @bits: the bits to set in this range
				1196	* @clear_bits: the bits to clear in this range
				1197	* @cached_state: state that we're going to cache
				1198	*
				1199	* This will go through and set bits for the given range. If any states exist
				1200	* already in this range they are set with the given bit and cleared of the
				1201	* clear_bits. This is only meant to be used by things that are mergeable, ie
				1202	* converting from say DELALLOC to DIRTY. This is not meant to be used with
				1203	* boundary bits like LOCK.
				1204	*
				1205	* All allocations are done with GFP_NOFS.
				1206	*/
				1207	int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				1208	unsigned bits, unsigned clear_bits,
				1209	struct extent_state **cached_state)
				1210	{
				1211	struct extent_state *state;
				1212	struct extent_state *prealloc = NULL;
				1213	struct rb_node *node;
				1214	struct rb_node **p;
				1215	struct rb_node *parent;
				1216	int err = 0;
				1217	u64 last_start;
				1218	u64 last_end;
				1219	bool first_iteration = true;
				1220
				1221	btrfs_debug_check_extent_io_range(tree, start, end);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1222	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				1223	clear_bits);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1224
				1225	again:
				1226	if (!prealloc) {
				1227	/*
				1228	* Best effort, don't worry if extent state allocation fails
				1229	* here for the first iteration. We might have a cached state
				1230	* that matches exactly the target range, in which case no
				1231	* extent state allocations are needed. We'll only know this
				1232	* after locking the tree.
				1233	*/
				1234	prealloc = alloc_extent_state(GFP_NOFS);
				1235	if (!prealloc && !first_iteration)
				1236	return -ENOMEM;
				1237	}
				1238
				1239	spin_lock(&tree->lock);
				1240	if (cached_state && *cached_state) {
				1241	state = *cached_state;
				1242	if (state->start <= start && state->end > start &&
				1243	extent_state_in_tree(state)) {
				1244	node = &state->rb_node;
				1245	goto hit_next;
				1246	}
				1247	}
				1248
				1249	/*
				1250	* this search will find all the extents that end after
				1251	* our range starts.
				1252	*/
				1253	node = tree_search_for_insert(tree, start, &p, &parent);
				1254	if (!node) {
				1255	prealloc = alloc_extent_state_atomic(prealloc);
				1256	if (!prealloc) {
				1257	err = -ENOMEM;
				1258	goto out;
				1259	}
				1260	err = insert_state(tree, prealloc, start, end,
				1261	&p, &parent, &bits, NULL);
				1262	if (err)
				1263	extent_io_tree_panic(tree, err);
				1264	cache_state(prealloc, cached_state);
				1265	prealloc = NULL;
				1266	goto out;
				1267	}
				1268	state = rb_entry(node, struct extent_state, rb_node);
				1269	hit_next:
				1270	last_start = state->start;
				1271	last_end = state->end;
				1272
				1273	/*
				1274	* \| ---- desired range ---- \|
				1275	* \| state \|
				1276	*
				1277	* Just lock what we found and keep going
				1278	*/
				1279	if (state->start == start && state->end <= end) {
				1280	set_state_bits(tree, state, &bits, NULL);
				1281	cache_state(state, cached_state);
				1282	state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
				1283	if (last_end == (u64)-1)
				1284	goto out;
				1285	start = last_end + 1;
				1286	if (start < end && state && state->start == start &&
				1287	!need_resched())
				1288	goto hit_next;
				1289	goto search_again;
				1290	}
				1291
				1292	/*
				1293	* \| ---- desired range ---- \|
				1294	* \| state \|
				1295	* or
				1296	* \| ------------- state -------------- \|
				1297	*
				1298	* We need to split the extent we found, and may flip bits on
				1299	* second half.
				1300	*
				1301	* If the extent we found extends past our
				1302	* range, we just split and search again. It'll get split
				1303	* again the next time though.
				1304	*
				1305	* If the extent we found is inside our range, we set the
				1306	* desired bit on it.
				1307	*/
				1308	if (state->start < start) {
				1309	prealloc = alloc_extent_state_atomic(prealloc);
				1310	if (!prealloc) {
				1311	err = -ENOMEM;
				1312	goto out;
				1313	}
				1314	err = split_state(tree, state, prealloc, start);
				1315	if (err)
				1316	extent_io_tree_panic(tree, err);
				1317	prealloc = NULL;
				1318	if (err)
				1319	goto out;
				1320	if (state->end <= end) {
				1321	set_state_bits(tree, state, &bits, NULL);
				1322	cache_state(state, cached_state);
				1323	state = clear_state_bit(tree, state, &clear_bits, 0,
				1324	NULL);
				1325	if (last_end == (u64)-1)
				1326	goto out;
				1327	start = last_end + 1;
				1328	if (start < end && state && state->start == start &&
				1329	!need_resched())
				1330	goto hit_next;
				1331	}
				1332	goto search_again;
				1333	}
				1334	/*
				1335	* \| ---- desired range ---- \|
				1336	* \| state \| or \| state \|
				1337	*
				1338	* There's a hole, we need to insert something in it and
				1339	* ignore the extent we found.
				1340	*/
				1341	if (state->start > start) {
				1342	u64 this_end;
				1343	if (end < last_start)
				1344	this_end = end;
				1345	else
				1346	this_end = last_start - 1;
				1347
				1348	prealloc = alloc_extent_state_atomic(prealloc);
				1349	if (!prealloc) {
				1350	err = -ENOMEM;
				1351	goto out;
				1352	}
				1353
				1354	/*
				1355	* Avoid to free 'prealloc' if it can be merged with
				1356	* the later extent.
				1357	*/
				1358	err = insert_state(tree, prealloc, start, this_end,
				1359	NULL, NULL, &bits, NULL);
				1360	if (err)
				1361	extent_io_tree_panic(tree, err);
				1362	cache_state(prealloc, cached_state);
				1363	prealloc = NULL;
				1364	start = this_end + 1;
				1365	goto search_again;
				1366	}
				1367	/*
				1368	* \| ---- desired range ---- \|
				1369	* \| state \|
				1370	* We need to split the extent, and set the bit
				1371	* on the first half
				1372	*/
				1373	if (state->start <= end && state->end > end) {
				1374	prealloc = alloc_extent_state_atomic(prealloc);
				1375	if (!prealloc) {
				1376	err = -ENOMEM;
				1377	goto out;
				1378	}
				1379
				1380	err = split_state(tree, state, prealloc, end + 1);
				1381	if (err)
				1382	extent_io_tree_panic(tree, err);
				1383
				1384	set_state_bits(tree, prealloc, &bits, NULL);
				1385	cache_state(prealloc, cached_state);
				1386	clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
				1387	prealloc = NULL;
				1388	goto out;
				1389	}
				1390
				1391	search_again:
				1392	if (start > end)
				1393	goto out;
				1394	spin_unlock(&tree->lock);
				1395	cond_resched();
				1396	first_iteration = false;
				1397	goto again;
				1398
				1399	out:
				1400	spin_unlock(&tree->lock);
				1401	if (prealloc)
				1402	free_extent_state(prealloc);
				1403
				1404	return err;
				1405	}
				1406
				1407	/* wrappers around set/clear extent bit */
				1408	int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
				1409	unsigned bits, struct extent_changeset *changeset)
				1410	{
				1411	/*
				1412	* We don't support EXTENT_LOCKED yet, as current changeset will
				1413	* record any bits changed, so for EXTENT_LOCKED case, it will
				1414	* either fail with -EEXIST or changeset will record the whole
				1415	* range.
				1416	*/
				1417	BUG_ON(bits & EXTENT_LOCKED);
				1418
				1419	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
				1420	changeset);
				1421	}
				1422
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1423	int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
				1424	unsigned bits)
				1425	{
				1426	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
				1427	GFP_NOWAIT, NULL);
				1428	}
				1429
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1430	int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				1431	unsigned bits, int wake, int delete,
				1432	struct extent_state **cached)
				1433	{
				1434	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				1435	cached, GFP_NOFS, NULL);
				1436	}
				1437
				1438	int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
				1439	unsigned bits, struct extent_changeset *changeset)
				1440	{
				1441	/*
				1442	* Don't support EXTENT_LOCKED case, same reason as
				1443	* set_record_extent_bits().
				1444	*/
				1445	BUG_ON(bits & EXTENT_LOCKED);
				1446
				1447	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				1448	changeset);
				1449	}
				1450
				1451	/*
				1452	* either insert or lock state struct between start and end use mask to tell
				1453	* us if waiting is desired.
				1454	*/
				1455	int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
				1456	struct extent_state **cached_state)
				1457	{
				1458	int err;
				1459	u64 failed_start;
				1460
				1461	while (1) {
				1462	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				1463	EXTENT_LOCKED, &failed_start,
				1464	cached_state, GFP_NOFS, NULL);
				1465	if (err == -EEXIST) {
				1466	wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
				1467	start = failed_start;
				1468	} else
				1469	break;
				1470	WARN_ON(start > end);
				1471	}
				1472	return err;
				1473	}
				1474
				1475	int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
				1476	{
				1477	int err;
				1478	u64 failed_start;
				1479
				1480	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
				1481	&failed_start, NULL, GFP_NOFS, NULL);
				1482	if (err == -EEXIST) {
				1483	if (failed_start > start)
				1484	clear_extent_bit(tree, start, failed_start - 1,
				1485	EXTENT_LOCKED, 1, 0, NULL);
				1486	return 0;
				1487	}
				1488	return 1;
				1489	}
				1490
				1491	void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
				1492	{
				1493	unsigned long index = start >> PAGE_SHIFT;
				1494	unsigned long end_index = end >> PAGE_SHIFT;
				1495	struct page *page;
				1496
				1497	while (index <= end_index) {
				1498	page = find_get_page(inode->i_mapping, index);
				1499	BUG_ON(!page); /* Pages should be in the extent_io_tree */
				1500	clear_page_dirty_for_io(page);
				1501	put_page(page);
				1502	index++;
				1503	}
				1504	}
				1505
				1506	void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
				1507	{
				1508	unsigned long index = start >> PAGE_SHIFT;
				1509	unsigned long end_index = end >> PAGE_SHIFT;
				1510	struct page *page;
				1511
				1512	while (index <= end_index) {
				1513	page = find_get_page(inode->i_mapping, index);
				1514	BUG_ON(!page); /* Pages should be in the extent_io_tree */
				1515	__set_page_dirty_nobuffers(page);
				1516	account_page_redirty(page);
				1517	put_page(page);
				1518	index++;
				1519	}
				1520	}
				1521
				1522	/* find the first state struct with 'bits' set after 'start', and
				1523	* return it. tree->lock must be held. NULL will returned if
				1524	* nothing was found after 'start'
				1525	*/
				1526	static struct extent_state *
				1527	find_first_extent_bit_state(struct extent_io_tree *tree,
				1528	u64 start, unsigned bits)
				1529	{
				1530	struct rb_node *node;
				1531	struct extent_state *state;
				1532
				1533	/*
				1534	* this search will find all the extents that end after
				1535	* our range starts.
				1536	*/
				1537	node = tree_search(tree, start);
				1538	if (!node)
				1539	goto out;
				1540
				1541	while (1) {
				1542	state = rb_entry(node, struct extent_state, rb_node);
				1543	if (state->end >= start && (state->state & bits))
				1544	return state;
				1545
				1546	node = rb_next(node);
				1547	if (!node)
				1548	break;
				1549	}
				1550	out:
				1551	return NULL;
				1552	}
				1553
				1554	/*
				1555	* find the first offset in the io tree with 'bits' set. zero is
				1556	* returned if we find something, and start_ret and end_ret are
				1557	* set to reflect the state struct that was found.
				1558	*
				1559	* If nothing was found, 1 is returned. If found something, return 0.
				1560	*/
				1561	int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
				1562	u64 start_ret, u64 end_ret, unsigned bits,
				1563	struct extent_state **cached_state)
				1564	{
				1565	struct extent_state *state;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1566	int ret = 1;
				1567
				1568	spin_lock(&tree->lock);
				1569	if (cached_state && *cached_state) {
				1570	state = *cached_state;
				1571	if (state->end == start - 1 && extent_state_in_tree(state)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1572	while ((state = next_state(state)) != NULL) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1573	if (state->state & bits)
				1574	goto got_it;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1575	}
				1576	free_extent_state(*cached_state);
				1577	*cached_state = NULL;
				1578	goto out;
				1579	}
				1580	free_extent_state(*cached_state);
				1581	*cached_state = NULL;
				1582	}
				1583
				1584	state = find_first_extent_bit_state(tree, start, bits);
				1585	got_it:
				1586	if (state) {
				1587	cache_state_if_flags(state, cached_state, 0);
				1588	*start_ret = state->start;
				1589	*end_ret = state->end;
				1590	ret = 0;
				1591	}
				1592	out:
				1593	spin_unlock(&tree->lock);
				1594	return ret;
				1595	}
				1596
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1597	/**
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1598	* find_contiguous_extent_bit: find a contiguous area of bits
				1599	* @tree - io tree to check
				1600	* @start - offset to start the search from
				1601	* @start_ret - the first offset we found with the bits set
				1602	* @end_ret - the final contiguous range of the bits that were set
				1603	* @bits - bits to look for
				1604	*
				1605	* set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
				1606	* to set bits appropriately, and then merge them again. During this time it
				1607	* will drop the tree->lock, so use this helper if you want to find the actual
				1608	* contiguous area for given bits. We will search to the first bit we find, and
				1609	* then walk down the tree until we find a non-contiguous area. The area
				1610	* returned will be the full contiguous area with the bits set.
				1611	*/
				1612	int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
				1613	u64 start_ret, u64 end_ret, unsigned bits)
				1614	{
				1615	struct extent_state *state;
				1616	int ret = 1;
				1617
				1618	spin_lock(&tree->lock);
				1619	state = find_first_extent_bit_state(tree, start, bits);
				1620	if (state) {
				1621	*start_ret = state->start;
				1622	*end_ret = state->end;
				1623	while ((state = next_state(state)) != NULL) {
				1624	if (state->start > (*end_ret + 1))
				1625	break;
				1626	*end_ret = state->end;
				1627	}
				1628	ret = 0;
				1629	}
				1630	spin_unlock(&tree->lock);
				1631	return ret;
				1632	}
				1633
				1634	/**
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1635	* find_first_clear_extent_bit - find the first range that has @bits not set.
				1636	* This range could start before @start.
				1637	*
				1638	* @tree - the tree to search
				1639	* @start - the offset at/after which the found extent should start
				1640	* @start_ret - records the beginning of the range
				1641	* @end_ret - records the end of the range (inclusive)
				1642	* @bits - the set of bits which must be unset
				1643	*
				1644	* Since unallocated range is also considered one which doesn't have the bits
				1645	* set it's possible that @end_ret contains -1, this happens in case the range
				1646	* spans (last_range_end, end of device]. In this case it's up to the caller to
				1647	* trim @end_ret to the appropriate size.
				1648	*/
				1649	void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				1650	u64 start_ret, u64 end_ret, unsigned bits)
				1651	{
				1652	struct extent_state *state;
				1653	struct rb_node node, prev = NULL, *next;
				1654
				1655	spin_lock(&tree->lock);
				1656
				1657	/* Find first extent with bits cleared */
				1658	while (1) {
				1659	node = __etree_search(tree, start, &next, &prev, NULL, NULL);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	1660	if (!node && !next && !prev) {
				1661	/*
				1662	* Tree is completely empty, send full range and let
				1663	* caller deal with it
				1664	*/
				1665	*start_ret = 0;
				1666	*end_ret = -1;
				1667	goto out;
				1668	} else if (!node && !next) {
				1669	/*
				1670	* We are past the last allocated chunk, set start at
				1671	* the end of the last extent.
				1672	*/
				1673	state = rb_entry(prev, struct extent_state, rb_node);
				1674	*start_ret = state->end + 1;
				1675	*end_ret = -1;
				1676	goto out;
				1677	} else if (!node) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1678	node = next;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1679	}
				1680	/*
				1681	* At this point 'node' either contains 'start' or start is
				1682	* before 'node'
				1683	*/
				1684	state = rb_entry(node, struct extent_state, rb_node);
				1685
				1686	if (in_range(start, state->start, state->end - state->start + 1)) {
				1687	if (state->state & bits) {
				1688	/*
				1689	* \|--range with bits sets--\|
				1690	* \|
				1691	* start
				1692	*/
				1693	start = state->end + 1;
				1694	} else {
				1695	/*
				1696	* 'start' falls within a range that doesn't
				1697	* have the bits set, so take its start as
				1698	* the beginning of the desired range
				1699	*
				1700	* \|--range with bits cleared----\|
				1701	* \|
				1702	* start
				1703	*/
				1704	*start_ret = state->start;
				1705	break;
				1706	}
				1707	} else {
				1708	/*
				1709	* \|---prev range---\|---hole/unset---\|---node range---\|
				1710	* \|
				1711	* start
				1712	*
				1713	* or
				1714	*
				1715	* \|---hole/unset--\|\|--first node--\|
				1716	* 0 \|
				1717	* start
				1718	*/
				1719	if (prev) {
				1720	state = rb_entry(prev, struct extent_state,
				1721	rb_node);
				1722	*start_ret = state->end + 1;
				1723	} else {
				1724	*start_ret = 0;
				1725	}
				1726	break;
				1727	}
				1728	}
				1729
				1730	/*
				1731	* Find the longest stretch from start until an entry which has the
				1732	* bits set
				1733	*/
				1734	while (1) {
				1735	state = rb_entry(node, struct extent_state, rb_node);
				1736	if (state->end >= start && !(state->state & bits)) {
				1737	*end_ret = state->end;
				1738	} else {
				1739	*end_ret = state->start - 1;
				1740	break;
				1741	}
				1742
				1743	node = rb_next(node);
				1744	if (!node)
				1745	break;
				1746	}
				1747	out:
				1748	spin_unlock(&tree->lock);
				1749	}
				1750
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1751	/*
				1752	* find a contiguous range of bytes in the file marked as delalloc, not
				1753	* more than 'max_bytes'. start and end are used to return the range,
				1754	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1755	* true is returned if we find something, false if nothing was in the tree
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1756	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1757	bool btrfs_find_delalloc_range(struct extent_io_tree tree, u64 start,
				1758	u64 *end, u64 max_bytes,
				1759	struct extent_state **cached_state)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1760	{
				1761	struct rb_node *node;
				1762	struct extent_state *state;
				1763	u64 cur_start = *start;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1764	bool found = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1765	u64 total_bytes = 0;
				1766
				1767	spin_lock(&tree->lock);
				1768
				1769	/*
				1770	* this search will find all the extents that end after
				1771	* our range starts.
				1772	*/
				1773	node = tree_search(tree, cur_start);
				1774	if (!node) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1775	*end = (u64)-1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1776	goto out;
				1777	}
				1778
				1779	while (1) {
				1780	state = rb_entry(node, struct extent_state, rb_node);
				1781	if (found && (state->start != cur_start \|\|
				1782	(state->state & EXTENT_BOUNDARY))) {
				1783	goto out;
				1784	}
				1785	if (!(state->state & EXTENT_DELALLOC)) {
				1786	if (!found)
				1787	*end = state->end;
				1788	goto out;
				1789	}
				1790	if (!found) {
				1791	*start = state->start;
				1792	*cached_state = state;
				1793	refcount_inc(&state->refs);
				1794	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1795	found = true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1796	*end = state->end;
				1797	cur_start = state->end + 1;
				1798	node = rb_next(node);
				1799	total_bytes += state->end - state->start + 1;
				1800	if (total_bytes >= max_bytes)
				1801	break;
				1802	if (!node)
				1803	break;
				1804	}
				1805	out:
				1806	spin_unlock(&tree->lock);
				1807	return found;
				1808	}
				1809
				1810	static int __process_pages_contig(struct address_space *mapping,
				1811	struct page *locked_page,
				1812	pgoff_t start_index, pgoff_t end_index,
				1813	unsigned long page_ops, pgoff_t *index_ret);
				1814
				1815	static noinline void __unlock_for_delalloc(struct inode *inode,
				1816	struct page *locked_page,
				1817	u64 start, u64 end)
				1818	{
				1819	unsigned long index = start >> PAGE_SHIFT;
				1820	unsigned long end_index = end >> PAGE_SHIFT;
				1821
				1822	ASSERT(locked_page);
				1823	if (index == locked_page->index && end_index == index)
				1824	return;
				1825
				1826	__process_pages_contig(inode->i_mapping, locked_page, index, end_index,
				1827	PAGE_UNLOCK, NULL);
				1828	}
				1829
				1830	static noinline int lock_delalloc_pages(struct inode *inode,
				1831	struct page *locked_page,
				1832	u64 delalloc_start,
				1833	u64 delalloc_end)
				1834	{
				1835	unsigned long index = delalloc_start >> PAGE_SHIFT;
				1836	unsigned long index_ret = index;
				1837	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
				1838	int ret;
				1839
				1840	ASSERT(locked_page);
				1841	if (index == locked_page->index && index == end_index)
				1842	return 0;
				1843
				1844	ret = __process_pages_contig(inode->i_mapping, locked_page, index,
				1845	end_index, PAGE_LOCK, &index_ret);
				1846	if (ret == -EAGAIN)
				1847	__unlock_for_delalloc(inode, locked_page, delalloc_start,
				1848	(u64)index_ret << PAGE_SHIFT);
				1849	return ret;
				1850	}
				1851
				1852	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1853	* Find and lock a contiguous range of bytes in the file marked as delalloc, no
				1854	* more than @max_bytes. @Start and @end are used to return the range,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1855	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1856	* Return: true if we find something
				1857	* false if nothing was in the tree
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1858	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1859	EXPORT_FOR_TESTS
				1860	noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1861	struct page locked_page, u64 start,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1862	u64 *end)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1863	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1864	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
				1865	u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1866	u64 delalloc_start;
				1867	u64 delalloc_end;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1868	bool found;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1869	struct extent_state *cached_state = NULL;
				1870	int ret;
				1871	int loops = 0;
				1872
				1873	again:
				1874	/* step one, find a bunch of delalloc bytes starting at start */
				1875	delalloc_start = *start;
				1876	delalloc_end = 0;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1877	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				1878	max_bytes, &cached_state);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1879	if (!found \|\| delalloc_end <= *start) {
				1880	*start = delalloc_start;
				1881	*end = delalloc_end;
				1882	free_extent_state(cached_state);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1883	return false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1884	}
				1885
				1886	/*
				1887	* start comes from the offset of locked_page. We have to lock
				1888	* pages in order, so we can't process delalloc bytes before
				1889	* locked_page
				1890	*/
				1891	if (delalloc_start < *start)
				1892	delalloc_start = *start;
				1893
				1894	/*
				1895	* make sure to limit the number of pages we try to lock down
				1896	*/
				1897	if (delalloc_end + 1 - delalloc_start > max_bytes)
				1898	delalloc_end = delalloc_start + max_bytes - 1;
				1899
				1900	/* step two, lock all the pages after the page that has start */
				1901	ret = lock_delalloc_pages(inode, locked_page,
				1902	delalloc_start, delalloc_end);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1903	ASSERT(!ret \|\| ret == -EAGAIN);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1904	if (ret == -EAGAIN) {
				1905	/* some of the pages are gone, lets avoid looping by
				1906	* shortening the size of the delalloc range we're searching
				1907	*/
				1908	free_extent_state(cached_state);
				1909	cached_state = NULL;
				1910	if (!loops) {
				1911	max_bytes = PAGE_SIZE;
				1912	loops = 1;
				1913	goto again;
				1914	} else {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1915	found = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1916	goto out_failed;
				1917	}
				1918	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1919
				1920	/* step three, lock the state bits for the whole range */
				1921	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
				1922
				1923	/* then test to make sure it is all still delalloc */
				1924	ret = test_range_bit(tree, delalloc_start, delalloc_end,
				1925	EXTENT_DELALLOC, 1, cached_state);
				1926	if (!ret) {
				1927	unlock_extent_cached(tree, delalloc_start, delalloc_end,
				1928	&cached_state);
				1929	__unlock_for_delalloc(inode, locked_page,
				1930	delalloc_start, delalloc_end);
				1931	cond_resched();
				1932	goto again;
				1933	}
				1934	free_extent_state(cached_state);
				1935	*start = delalloc_start;
				1936	*end = delalloc_end;
				1937	out_failed:
				1938	return found;
				1939	}
				1940
				1941	static int __process_pages_contig(struct address_space *mapping,
				1942	struct page *locked_page,
				1943	pgoff_t start_index, pgoff_t end_index,
				1944	unsigned long page_ops, pgoff_t *index_ret)
				1945	{
				1946	unsigned long nr_pages = end_index - start_index + 1;
				1947	unsigned long pages_locked = 0;
				1948	pgoff_t index = start_index;
				1949	struct page *pages[16];
				1950	unsigned ret;
				1951	int err = 0;
				1952	int i;
				1953
				1954	if (page_ops & PAGE_LOCK) {
				1955	ASSERT(page_ops == PAGE_LOCK);
				1956	ASSERT(index_ret && *index_ret == start_index);
				1957	}
				1958
				1959	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
				1960	mapping_set_error(mapping, -EIO);
				1961
				1962	while (nr_pages > 0) {
				1963	ret = find_get_pages_contig(mapping, index,
				1964	min_t(unsigned long,
				1965	nr_pages, ARRAY_SIZE(pages)), pages);
				1966	if (ret == 0) {
				1967	/*
				1968	* Only if we're going to lock these pages,
				1969	* can we find nothing at @index.
				1970	*/
				1971	ASSERT(page_ops & PAGE_LOCK);
				1972	err = -EAGAIN;
				1973	goto out;
				1974	}
				1975
				1976	for (i = 0; i < ret; i++) {
				1977	if (page_ops & PAGE_SET_PRIVATE2)
				1978	SetPagePrivate2(pages[i]);
				1979
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	1980	if (locked_page && pages[i] == locked_page) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1981	put_page(pages[i]);
				1982	pages_locked++;
				1983	continue;
				1984	}
				1985	if (page_ops & PAGE_CLEAR_DIRTY)
				1986	clear_page_dirty_for_io(pages[i]);
				1987	if (page_ops & PAGE_SET_WRITEBACK)
				1988	set_page_writeback(pages[i]);
				1989	if (page_ops & PAGE_SET_ERROR)
				1990	SetPageError(pages[i]);
				1991	if (page_ops & PAGE_END_WRITEBACK)
				1992	end_page_writeback(pages[i]);
				1993	if (page_ops & PAGE_UNLOCK)
				1994	unlock_page(pages[i]);
				1995	if (page_ops & PAGE_LOCK) {
				1996	lock_page(pages[i]);
				1997	if (!PageDirty(pages[i]) \|\|
				1998	pages[i]->mapping != mapping) {
				1999	unlock_page(pages[i]);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	2000	for (; i < ret; i++)
				2001	put_page(pages[i]);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2002	err = -EAGAIN;
				2003	goto out;
				2004	}
				2005	}
				2006	put_page(pages[i]);
				2007	pages_locked++;
				2008	}
				2009	nr_pages -= ret;
				2010	index += ret;
				2011	cond_resched();
				2012	}
				2013	out:
				2014	if (err && index_ret)
				2015	*index_ret = start_index + pages_locked - 1;
				2016	return err;
				2017	}
				2018
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2019	void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2020	struct page *locked_page,
				2021	unsigned clear_bits,
				2022	unsigned long page_ops)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2023	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2024	clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2025
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2026	__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2027	start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				2028	page_ops, NULL);
				2029	}
				2030
				2031	/*
				2032	* count the number of bytes in the tree that have a given bit(s)
				2033	* set. This can be fairly slow, except for EXTENT_DIRTY which is
				2034	* cached. The total number found is returned.
				2035	*/
				2036	u64 count_range_bits(struct extent_io_tree *tree,
				2037	u64 *start, u64 search_end, u64 max_bytes,
				2038	unsigned bits, int contig)
				2039	{
				2040	struct rb_node *node;
				2041	struct extent_state *state;
				2042	u64 cur_start = *start;
				2043	u64 total_bytes = 0;
				2044	u64 last = 0;
				2045	int found = 0;
				2046
				2047	if (WARN_ON(search_end <= cur_start))
				2048	return 0;
				2049
				2050	spin_lock(&tree->lock);
				2051	if (cur_start == 0 && bits == EXTENT_DIRTY) {
				2052	total_bytes = tree->dirty_bytes;
				2053	goto out;
				2054	}
				2055	/*
				2056	* this search will find all the extents that end after
				2057	* our range starts.
				2058	*/
				2059	node = tree_search(tree, cur_start);
				2060	if (!node)
				2061	goto out;
				2062
				2063	while (1) {
				2064	state = rb_entry(node, struct extent_state, rb_node);
				2065	if (state->start > search_end)
				2066	break;
				2067	if (contig && found && state->start > last + 1)
				2068	break;
				2069	if (state->end >= cur_start && (state->state & bits) == bits) {
				2070	total_bytes += min(search_end, state->end) + 1 -
				2071	max(cur_start, state->start);
				2072	if (total_bytes >= max_bytes)
				2073	break;
				2074	if (!found) {
				2075	*start = max(cur_start, state->start);
				2076	found = 1;
				2077	}
				2078	last = state->end;
				2079	} else if (contig && found) {
				2080	break;
				2081	}
				2082	node = rb_next(node);
				2083	if (!node)
				2084	break;
				2085	}
				2086	out:
				2087	spin_unlock(&tree->lock);
				2088	return total_bytes;
				2089	}
				2090
				2091	/*
				2092	* set the private field for a given byte offset in the tree. If there isn't
				2093	* an extent_state there already, this does nothing.
				2094	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2095	int set_state_failrec(struct extent_io_tree *tree, u64 start,
				2096	struct io_failure_record *failrec)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2097	{
				2098	struct rb_node *node;
				2099	struct extent_state *state;
				2100	int ret = 0;
				2101
				2102	spin_lock(&tree->lock);
				2103	/*
				2104	* this search will find all the extents that end after
				2105	* our range starts.
				2106	*/
				2107	node = tree_search(tree, start);
				2108	if (!node) {
				2109	ret = -ENOENT;
				2110	goto out;
				2111	}
				2112	state = rb_entry(node, struct extent_state, rb_node);
				2113	if (state->start != start) {
				2114	ret = -ENOENT;
				2115	goto out;
				2116	}
				2117	state->failrec = failrec;
				2118	out:
				2119	spin_unlock(&tree->lock);
				2120	return ret;
				2121	}
				2122
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2123	struct io_failure_record get_state_failrec(struct extent_io_tree tree, u64 start)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2124	{
				2125	struct rb_node *node;
				2126	struct extent_state *state;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2127	struct io_failure_record *failrec;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2128
				2129	spin_lock(&tree->lock);
				2130	/*
				2131	* this search will find all the extents that end after
				2132	* our range starts.
				2133	*/
				2134	node = tree_search(tree, start);
				2135	if (!node) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2136	failrec = ERR_PTR(-ENOENT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2137	goto out;
				2138	}
				2139	state = rb_entry(node, struct extent_state, rb_node);
				2140	if (state->start != start) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2141	failrec = ERR_PTR(-ENOENT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2142	goto out;
				2143	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2144
				2145	failrec = state->failrec;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2146	out:
				2147	spin_unlock(&tree->lock);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2148	return failrec;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2149	}
				2150
				2151	/*
				2152	* searches a range in the state tree for a given mask.
				2153	* If 'filled' == 1, this returns 1 only if every extent in the tree
				2154	* has the bits set. Otherwise, 1 is returned if any bit in the
				2155	* range is found set.
				2156	*/
				2157	int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
				2158	unsigned bits, int filled, struct extent_state *cached)
				2159	{
				2160	struct extent_state *state = NULL;
				2161	struct rb_node *node;
				2162	int bitset = 0;
				2163
				2164	spin_lock(&tree->lock);
				2165	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
				2166	cached->end > start)
				2167	node = &cached->rb_node;
				2168	else
				2169	node = tree_search(tree, start);
				2170	while (node && start <= end) {
				2171	state = rb_entry(node, struct extent_state, rb_node);
				2172
				2173	if (filled && state->start > start) {
				2174	bitset = 0;
				2175	break;
				2176	}
				2177
				2178	if (state->start > end)
				2179	break;
				2180
				2181	if (state->state & bits) {
				2182	bitset = 1;
				2183	if (!filled)
				2184	break;
				2185	} else if (filled) {
				2186	bitset = 0;
				2187	break;
				2188	}
				2189
				2190	if (state->end == (u64)-1)
				2191	break;
				2192
				2193	start = state->end + 1;
				2194	if (start > end)
				2195	break;
				2196	node = rb_next(node);
				2197	if (!node) {
				2198	if (filled)
				2199	bitset = 0;
				2200	break;
				2201	}
				2202	}
				2203	spin_unlock(&tree->lock);
				2204	return bitset;
				2205	}
				2206
				2207	/*
				2208	* helper function to set a given page up to date if all the
				2209	* extents in the tree for that page are up to date
				2210	*/
				2211	static void check_page_uptodate(struct extent_io_tree tree, struct page page)
				2212	{
				2213	u64 start = page_offset(page);
				2214	u64 end = start + PAGE_SIZE - 1;
				2215	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
				2216	SetPageUptodate(page);
				2217	}
				2218
				2219	int free_io_failure(struct extent_io_tree *failure_tree,
				2220	struct extent_io_tree *io_tree,
				2221	struct io_failure_record *rec)
				2222	{
				2223	int ret;
				2224	int err = 0;
				2225
				2226	set_state_failrec(failure_tree, rec->start, NULL);
				2227	ret = clear_extent_bits(failure_tree, rec->start,
				2228	rec->start + rec->len - 1,
				2229	EXTENT_LOCKED \| EXTENT_DIRTY);
				2230	if (ret)
				2231	err = ret;
				2232
				2233	ret = clear_extent_bits(io_tree, rec->start,
				2234	rec->start + rec->len - 1,
				2235	EXTENT_DAMAGED);
				2236	if (ret && !err)
				2237	err = ret;
				2238
				2239	kfree(rec);
				2240	return err;
				2241	}
				2242
				2243	/*
				2244	* this bypasses the standard btrfs submit functions deliberately, as
				2245	* the standard behavior is to write all copies in a raid setup. here we only
				2246	* want to write the one bad copy. so we do the mapping for ourselves and issue
				2247	* submit_bio directly.
				2248	* to avoid any synchronization issues, wait for the data after writing, which
				2249	* actually prevents the read that triggered the error from finishing.
				2250	* currently, there can be no more than two copies of every data bit. thus,
				2251	* exactly one rewrite is required.
				2252	*/
				2253	int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
				2254	u64 length, u64 logical, struct page *page,
				2255	unsigned int pg_offset, int mirror_num)
				2256	{
				2257	struct bio *bio;
				2258	struct btrfs_device *dev;
				2259	u64 map_length = 0;
				2260	u64 sector;
				2261	struct btrfs_bio *bbio = NULL;
				2262	int ret;
				2263
				2264	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
				2265	BUG_ON(!mirror_num);
				2266
				2267	bio = btrfs_io_bio_alloc(1);
				2268	bio->bi_iter.bi_size = 0;
				2269	map_length = length;
				2270
				2271	/*
				2272	* Avoid races with device replace and make sure our bbio has devices
				2273	* associated to its stripes that don't go away while we are doing the
				2274	* read repair operation.
				2275	*/
				2276	btrfs_bio_counter_inc_blocked(fs_info);
				2277	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
				2278	/*
				2279	* Note that we don't use BTRFS_MAP_WRITE because it's supposed
				2280	* to update all raid stripes, but here we just want to correct
				2281	* bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
				2282	* stripe's dev and sector.
				2283	*/
				2284	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
				2285	&map_length, &bbio, 0);
				2286	if (ret) {
				2287	btrfs_bio_counter_dec(fs_info);
				2288	bio_put(bio);
				2289	return -EIO;
				2290	}
				2291	ASSERT(bbio->mirror_num == 1);
				2292	} else {
				2293	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
				2294	&map_length, &bbio, mirror_num);
				2295	if (ret) {
				2296	btrfs_bio_counter_dec(fs_info);
				2297	bio_put(bio);
				2298	return -EIO;
				2299	}
				2300	BUG_ON(mirror_num != bbio->mirror_num);
				2301	}
				2302
				2303	sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
				2304	bio->bi_iter.bi_sector = sector;
				2305	dev = bbio->stripes[bbio->mirror_num - 1].dev;
				2306	btrfs_put_bbio(bbio);
				2307	if (!dev \|\| !dev->bdev \|\|
				2308	!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
				2309	btrfs_bio_counter_dec(fs_info);
				2310	bio_put(bio);
				2311	return -EIO;
				2312	}
				2313	bio_set_dev(bio, dev->bdev);
				2314	bio->bi_opf = REQ_OP_WRITE \| REQ_SYNC;
				2315	bio_add_page(bio, page, length, pg_offset);
				2316
				2317	if (btrfsic_submit_bio_wait(bio)) {
				2318	/* try to remap that extent elsewhere? */
				2319	btrfs_bio_counter_dec(fs_info);
				2320	bio_put(bio);
				2321	btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
				2322	return -EIO;
				2323	}
				2324
				2325	btrfs_info_rl_in_rcu(fs_info,
				2326	"read error corrected: ino %llu off %llu (dev %s sector %llu)",
				2327	ino, start,
				2328	rcu_str_deref(dev->name), sector);
				2329	btrfs_bio_counter_dec(fs_info);
				2330	bio_put(bio);
				2331	return 0;
				2332	}
				2333
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2334	int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2335	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2336	struct btrfs_fs_info *fs_info = eb->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2337	u64 start = eb->start;
				2338	int i, num_pages = num_extent_pages(eb);
				2339	int ret = 0;
				2340
				2341	if (sb_rdonly(fs_info->sb))
				2342	return -EROFS;
				2343
				2344	for (i = 0; i < num_pages; i++) {
				2345	struct page *p = eb->pages[i];
				2346
				2347	ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
				2348	start - page_offset(p), mirror_num);
				2349	if (ret)
				2350	break;
				2351	start += PAGE_SIZE;
				2352	}
				2353
				2354	return ret;
				2355	}
				2356
				2357	/*
				2358	* each time an IO finishes, we do a fast check in the IO failure tree
				2359	* to see if we need to process or clean up an io_failure_record
				2360	*/
				2361	int clean_io_failure(struct btrfs_fs_info *fs_info,
				2362	struct extent_io_tree *failure_tree,
				2363	struct extent_io_tree *io_tree, u64 start,
				2364	struct page *page, u64 ino, unsigned int pg_offset)
				2365	{
				2366	u64 private;
				2367	struct io_failure_record *failrec;
				2368	struct extent_state *state;
				2369	int num_copies;
				2370	int ret;
				2371
				2372	private = 0;
				2373	ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
				2374	EXTENT_DIRTY, 0);
				2375	if (!ret)
				2376	return 0;
				2377
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2378	failrec = get_state_failrec(failure_tree, start);
				2379	if (IS_ERR(failrec))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2380	return 0;
				2381
				2382	BUG_ON(!failrec->this_mirror);
				2383
				2384	if (failrec->in_validation) {
				2385	/* there was no real error, just free the record */
				2386	btrfs_debug(fs_info,
				2387	"clean_io_failure: freeing dummy error at %llu",
				2388	failrec->start);
				2389	goto out;
				2390	}
				2391	if (sb_rdonly(fs_info->sb))
				2392	goto out;
				2393
				2394	spin_lock(&io_tree->lock);
				2395	state = find_first_extent_bit_state(io_tree,
				2396	failrec->start,
				2397	EXTENT_LOCKED);
				2398	spin_unlock(&io_tree->lock);
				2399
				2400	if (state && state->start <= failrec->start &&
				2401	state->end >= failrec->start + failrec->len - 1) {
				2402	num_copies = btrfs_num_copies(fs_info, failrec->logical,
				2403	failrec->len);
				2404	if (num_copies > 1) {
				2405	repair_io_failure(fs_info, ino, start, failrec->len,
				2406	failrec->logical, page, pg_offset,
				2407	failrec->failed_mirror);
				2408	}
				2409	}
				2410
				2411	out:
				2412	free_io_failure(failure_tree, io_tree, failrec);
				2413
				2414	return 0;
				2415	}
				2416
				2417	/*
				2418	* Can be called when
				2419	* - hold extent lock
				2420	* - under ordered extent
				2421	* - the inode is freeing
				2422	*/
				2423	void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
				2424	{
				2425	struct extent_io_tree *failure_tree = &inode->io_failure_tree;
				2426	struct io_failure_record *failrec;
				2427	struct extent_state state, next;
				2428
				2429	if (RB_EMPTY_ROOT(&failure_tree->state))
				2430	return;
				2431
				2432	spin_lock(&failure_tree->lock);
				2433	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
				2434	while (state) {
				2435	if (state->start > end)
				2436	break;
				2437
				2438	ASSERT(state->end <= end);
				2439
				2440	next = next_state(state);
				2441
				2442	failrec = state->failrec;
				2443	free_extent_state(state);
				2444	kfree(failrec);
				2445
				2446	state = next;
				2447	}
				2448	spin_unlock(&failure_tree->lock);
				2449	}
				2450
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2451	static struct io_failure_record btrfs_get_io_failure_record(struct inode inode,
				2452	u64 start, u64 end)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2453	{
				2454	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2455	struct io_failure_record *failrec;
				2456	struct extent_map *em;
				2457	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
				2458	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
				2459	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
				2460	int ret;
				2461	u64 logical;
				2462
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2463	failrec = get_state_failrec(failure_tree, start);
				2464	if (!IS_ERR(failrec)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2465	btrfs_debug(fs_info,
				2466	"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
				2467	failrec->logical, failrec->start, failrec->len,
				2468	failrec->in_validation);
				2469	/*
				2470	* when data can be on disk more than twice, add to failrec here
				2471	* (e.g. with a list for failed_mirror) to make
				2472	* clean_io_failure() clean all those errors at once.
				2473	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2474
				2475	return failrec;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2476	}
				2477
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2478	failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
				2479	if (!failrec)
				2480	return ERR_PTR(-ENOMEM);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2481
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2482	failrec->start = start;
				2483	failrec->len = end - start + 1;
				2484	failrec->this_mirror = 0;
				2485	failrec->bio_flags = 0;
				2486	failrec->in_validation = 0;
				2487
				2488	read_lock(&em_tree->lock);
				2489	em = lookup_extent_mapping(em_tree, start, failrec->len);
				2490	if (!em) {
				2491	read_unlock(&em_tree->lock);
				2492	kfree(failrec);
				2493	return ERR_PTR(-EIO);
				2494	}
				2495
				2496	if (em->start > start \|\| em->start + em->len <= start) {
				2497	free_extent_map(em);
				2498	em = NULL;
				2499	}
				2500	read_unlock(&em_tree->lock);
				2501	if (!em) {
				2502	kfree(failrec);
				2503	return ERR_PTR(-EIO);
				2504	}
				2505
				2506	logical = start - em->start;
				2507	logical = em->block_start + logical;
				2508	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
				2509	logical = em->block_start;
				2510	failrec->bio_flags = EXTENT_BIO_COMPRESSED;
				2511	extent_set_compress_type(&failrec->bio_flags, em->compress_type);
				2512	}
				2513
				2514	btrfs_debug(fs_info,
				2515	"Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
				2516	logical, start, failrec->len);
				2517
				2518	failrec->logical = logical;
				2519	free_extent_map(em);
				2520
				2521	/* Set the bits in the private failure tree */
				2522	ret = set_extent_bits(failure_tree, start, end,
				2523	EXTENT_LOCKED \| EXTENT_DIRTY);
				2524	if (ret >= 0) {
				2525	ret = set_state_failrec(failure_tree, start, failrec);
				2526	/* Set the bits in the inode's tree */
				2527	ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
				2528	} else if (ret < 0) {
				2529	kfree(failrec);
				2530	return ERR_PTR(ret);
				2531	}
				2532
				2533	return failrec;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2534	}
				2535
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2536	static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
				2537	struct io_failure_record *failrec,
				2538	int failed_mirror)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2539	{
				2540	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2541	int num_copies;
				2542
				2543	num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
				2544	if (num_copies == 1) {
				2545	/*
				2546	* we only have a single copy of the data, so don't bother with
				2547	* all the retry and error correction code that follows. no
				2548	* matter what the error is, it is very likely to persist.
				2549	*/
				2550	btrfs_debug(fs_info,
				2551	"Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
				2552	num_copies, failrec->this_mirror, failed_mirror);
				2553	return false;
				2554	}
				2555
				2556	/*
				2557	* there are two premises:
				2558	* a) deliver good data to the caller
				2559	* b) correct the bad sectors on disk
				2560	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2561	if (needs_validation) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2562	/*
				2563	* to fulfill b), we need to know the exact failing sectors, as
				2564	* we don't want to rewrite any more than the failed ones. thus,
				2565	* we need separate read requests for the failed bio
				2566	*
				2567	* if the following BUG_ON triggers, our validation request got
				2568	* merged. we need separate requests for our algorithm to work.
				2569	*/
				2570	BUG_ON(failrec->in_validation);
				2571	failrec->in_validation = 1;
				2572	failrec->this_mirror = failed_mirror;
				2573	} else {
				2574	/*
				2575	* we're ready to fulfill a) and b) alongside. get a good copy
				2576	* of the failed sector and if we succeed, we have setup
				2577	* everything for repair_io_failure to do the rest for us.
				2578	*/
				2579	if (failrec->in_validation) {
				2580	BUG_ON(failrec->this_mirror != failed_mirror);
				2581	failrec->in_validation = 0;
				2582	failrec->this_mirror = 0;
				2583	}
				2584	failrec->failed_mirror = failed_mirror;
				2585	failrec->this_mirror++;
				2586	if (failrec->this_mirror == failed_mirror)
				2587	failrec->this_mirror++;
				2588	}
				2589
				2590	if (failrec->this_mirror > num_copies) {
				2591	btrfs_debug(fs_info,
				2592	"Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
				2593	num_copies, failrec->this_mirror, failed_mirror);
				2594	return false;
				2595	}
				2596
				2597	return true;
				2598	}
				2599
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2600	static bool btrfs_io_needs_validation(struct inode inode, struct bio bio)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2601	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2602	u64 len = 0;
				2603	const u32 blocksize = inode->i_sb->s_blocksize;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2604
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2605	/*
				2606	* If bi_status is BLK_STS_OK, then this was a checksum error, not an
				2607	* I/O error. In this case, we already know exactly which sector was
				2608	* bad, so we don't need to validate.
				2609	*/
				2610	if (bio->bi_status == BLK_STS_OK)
				2611	return false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2612
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2613	/*
				2614	* We need to validate each sector individually if the failed I/O was
				2615	* for multiple sectors.
				2616	*
				2617	* There are a few possible bios that can end up here:
				2618	* 1. A buffered read bio, which is not cloned.
				2619	* 2. A direct I/O read bio, which is cloned.
				2620	* 3. A (buffered or direct) repair bio, which is not cloned.
				2621	*
				2622	* For cloned bios (case 2), we can get the size from
				2623	* btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
				2624	* it from the bvecs.
				2625	*/
				2626	if (bio_flagged(bio, BIO_CLONED)) {
				2627	if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
				2628	return true;
				2629	} else {
				2630	struct bio_vec *bvec;
				2631	int i;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2632
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2633	bio_for_each_bvec_all(bvec, bio, i) {
				2634	len += bvec->bv_len;
				2635	if (len > blocksize)
				2636	return true;
				2637	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2638	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2639	return false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2640	}
				2641
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2642	blk_status_t btrfs_submit_read_repair(struct inode *inode,
				2643	struct bio *failed_bio, u64 phy_offset,
				2644	struct page *page, unsigned int pgoff,
				2645	u64 start, u64 end, int failed_mirror,
				2646	submit_bio_hook_t *submit_bio_hook)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2647	{
				2648	struct io_failure_record *failrec;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2649	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2650	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
				2651	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2652	struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
				2653	const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits;
				2654	bool need_validation;
				2655	struct bio *repair_bio;
				2656	struct btrfs_io_bio *repair_io_bio;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2657	blk_status_t status;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2658
				2659	btrfs_debug(fs_info,
				2660	"repair read error: read error at %llu", start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2661
				2662	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
				2663
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2664	failrec = btrfs_get_io_failure_record(inode, start, end);
				2665	if (IS_ERR(failrec))
				2666	return errno_to_blk_status(PTR_ERR(failrec));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2667
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2668	need_validation = btrfs_io_needs_validation(inode, failed_bio);
				2669
				2670	if (!btrfs_check_repairable(inode, need_validation, failrec,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2671	failed_mirror)) {
				2672	free_io_failure(failure_tree, tree, failrec);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2673	return BLK_STS_IOERR;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2674	}
				2675
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2676	repair_bio = btrfs_io_bio_alloc(1);
				2677	repair_io_bio = btrfs_io_bio(repair_bio);
				2678	repair_bio->bi_opf = REQ_OP_READ;
				2679	if (need_validation)
				2680	repair_bio->bi_opf \|= REQ_FAILFAST_DEV;
				2681	repair_bio->bi_end_io = failed_bio->bi_end_io;
				2682	repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
				2683	repair_bio->bi_private = failed_bio->bi_private;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2684
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2685	if (failed_io_bio->csum) {
				2686	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
				2687
				2688	repair_io_bio->csum = repair_io_bio->csum_inline;
				2689	memcpy(repair_io_bio->csum,
				2690	failed_io_bio->csum + csum_size * icsum, csum_size);
				2691	}
				2692
				2693	bio_add_page(repair_bio, page, failrec->len, pgoff);
				2694	repair_io_bio->logical = failrec->start;
				2695	repair_io_bio->iter = repair_bio->bi_iter;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2696
				2697	btrfs_debug(btrfs_sb(inode->i_sb),
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2698	"repair read error: submitting new read to mirror %d, in_validation=%d",
				2699	failrec->this_mirror, failrec->in_validation);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2700
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2701	status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
				2702	failrec->bio_flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2703	if (status) {
				2704	free_io_failure(failure_tree, tree, failrec);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2705	bio_put(repair_bio);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2706	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2707	return status;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2708	}
				2709
				2710	/* lots and lots of room for performance fixes in the end_bio funcs */
				2711
				2712	void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
				2713	{
				2714	int uptodate = (err == 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2715	int ret = 0;
				2716
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2717	btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2718
				2719	if (!uptodate) {
				2720	ClearPageUptodate(page);
				2721	SetPageError(page);
				2722	ret = err < 0 ? err : -EIO;
				2723	mapping_set_error(page->mapping, ret);
				2724	}
				2725	}
				2726
				2727	/*
				2728	* after a writepage IO is done, we need to:
				2729	* clear the uptodate bits on error
				2730	* clear the writeback bits in the extent tree for this IO
				2731	* end_page_writeback if the page has no more pending IO
				2732	*
				2733	* Scheduling is not allowed, so the extent state tree is expected
				2734	* to have one and only one object corresponding to this IO.
				2735	*/
				2736	static void end_bio_extent_writepage(struct bio *bio)
				2737	{
				2738	int error = blk_status_to_errno(bio->bi_status);
				2739	struct bio_vec *bvec;
				2740	u64 start;
				2741	u64 end;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2742	struct bvec_iter_all iter_all;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2743
				2744	ASSERT(!bio_flagged(bio, BIO_CLONED));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2745	bio_for_each_segment_all(bvec, bio, iter_all) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2746	struct page *page = bvec->bv_page;
				2747	struct inode *inode = page->mapping->host;
				2748	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2749
				2750	/* We always issue full-page reads, but if some block
				2751	* in a page fails to read, blk_update_request() will
				2752	* advance bv_offset and adjust bv_len to compensate.
				2753	* Print a warning for nonzero offsets, and an error
				2754	* if they don't add up to a full page. */
				2755	if (bvec->bv_offset \|\| bvec->bv_len != PAGE_SIZE) {
				2756	if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
				2757	btrfs_err(fs_info,
				2758	"partial page write in btrfs with offset %u and length %u",
				2759	bvec->bv_offset, bvec->bv_len);
				2760	else
				2761	btrfs_info(fs_info,
				2762	"incomplete page write in btrfs with offset %u and length %u",
				2763	bvec->bv_offset, bvec->bv_len);
				2764	}
				2765
				2766	start = page_offset(page);
				2767	end = start + bvec->bv_offset + bvec->bv_len - 1;
				2768
				2769	end_extent_writepage(page, error, start, end);
				2770	end_page_writeback(page);
				2771	}
				2772
				2773	bio_put(bio);
				2774	}
				2775
				2776	static void
				2777	endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
				2778	int uptodate)
				2779	{
				2780	struct extent_state *cached = NULL;
				2781	u64 end = start + len - 1;
				2782
				2783	if (uptodate && tree->track_uptodate)
				2784	set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
				2785	unlock_extent_cached_atomic(tree, start, end, &cached);
				2786	}
				2787
				2788	/*
				2789	* after a readpage IO is done, we need to:
				2790	* clear the uptodate bits on error
				2791	* set the uptodate bits if things worked
				2792	* set the page up to date if all extents in the tree are uptodate
				2793	* clear the lock bit in the extent tree
				2794	* unlock the page if there are no other extents locked for it
				2795	*
				2796	* Scheduling is not allowed, so the extent state tree is expected
				2797	* to have one and only one object corresponding to this IO.
				2798	*/
				2799	static void end_bio_extent_readpage(struct bio *bio)
				2800	{
				2801	struct bio_vec *bvec;
				2802	int uptodate = !bio->bi_status;
				2803	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
				2804	struct extent_io_tree tree, failure_tree;
				2805	u64 offset = 0;
				2806	u64 start;
				2807	u64 end;
				2808	u64 len;
				2809	u64 extent_start = 0;
				2810	u64 extent_len = 0;
				2811	int mirror;
				2812	int ret;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2813	struct bvec_iter_all iter_all;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2814
				2815	ASSERT(!bio_flagged(bio, BIO_CLONED));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2816	bio_for_each_segment_all(bvec, bio, iter_all) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2817	struct page *page = bvec->bv_page;
				2818	struct inode *inode = page->mapping->host;
				2819	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2820
				2821	btrfs_debug(fs_info,
				2822	"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
				2823	(u64)bio->bi_iter.bi_sector, bio->bi_status,
				2824	io_bio->mirror_num);
				2825	tree = &BTRFS_I(inode)->io_tree;
				2826	failure_tree = &BTRFS_I(inode)->io_failure_tree;
				2827
				2828	/* We always issue full-page reads, but if some block
				2829	* in a page fails to read, blk_update_request() will
				2830	* advance bv_offset and adjust bv_len to compensate.
				2831	* Print a warning for nonzero offsets, and an error
				2832	* if they don't add up to a full page. */
				2833	if (bvec->bv_offset \|\| bvec->bv_len != PAGE_SIZE) {
				2834	if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
				2835	btrfs_err(fs_info,
				2836	"partial page read in btrfs with offset %u and length %u",
				2837	bvec->bv_offset, bvec->bv_len);
				2838	else
				2839	btrfs_info(fs_info,
				2840	"incomplete page read in btrfs with offset %u and length %u",
				2841	bvec->bv_offset, bvec->bv_len);
				2842	}
				2843
				2844	start = page_offset(page);
				2845	end = start + bvec->bv_offset + bvec->bv_len - 1;
				2846	len = bvec->bv_len;
				2847
				2848	mirror = io_bio->mirror_num;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2849	if (likely(uptodate)) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2850	if (is_data_inode(inode))
				2851	ret = btrfs_verify_data_csum(io_bio, offset, page,
				2852	start, end, mirror);
				2853	else
				2854	ret = btrfs_validate_metadata_buffer(io_bio,
				2855	offset, page, start, end, mirror);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2856	if (ret)
				2857	uptodate = 0;
				2858	else
				2859	clean_io_failure(BTRFS_I(inode)->root->fs_info,
				2860	failure_tree, tree, start,
				2861	page,
				2862	btrfs_ino(BTRFS_I(inode)), 0);
				2863	}
				2864
				2865	if (likely(uptodate))
				2866	goto readpage_ok;
				2867
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2868	if (is_data_inode(inode)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2869
				2870	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2871	* The generic bio_readpage_error handles errors the
				2872	* following way: If possible, new read requests are
				2873	* created and submitted and will end up in
				2874	* end_bio_extent_readpage as well (if we're lucky,
				2875	* not in the !uptodate case). In that case it returns
				2876	* 0 and we just go on with the next page in our bio.
				2877	* If it can't handle the error it will return -EIO and
				2878	* we remain responsible for that page.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2879	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	2880	if (!btrfs_submit_read_repair(inode, bio, offset, page,
				2881	start - page_offset(page),
				2882	start, end, mirror,
				2883	btrfs_submit_data_bio)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2884	uptodate = !bio->bi_status;
				2885	offset += len;
				2886	continue;
				2887	}
				2888	} else {
				2889	struct extent_buffer *eb;
				2890
				2891	eb = (struct extent_buffer *)page->private;
				2892	set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
				2893	eb->read_mirror = mirror;
				2894	atomic_dec(&eb->io_pages);
				2895	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
				2896	&eb->bflags))
				2897	btree_readahead_hook(eb, -EIO);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2898	}
				2899	readpage_ok:
				2900	if (likely(uptodate)) {
				2901	loff_t i_size = i_size_read(inode);
				2902	pgoff_t end_index = i_size >> PAGE_SHIFT;
				2903	unsigned off;
				2904
				2905	/* Zero out the end if this page straddles i_size */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2906	off = offset_in_page(i_size);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2907	if (page->index == end_index && off)
				2908	zero_user_segment(page, off, PAGE_SIZE);
				2909	SetPageUptodate(page);
				2910	} else {
				2911	ClearPageUptodate(page);
				2912	SetPageError(page);
				2913	}
				2914	unlock_page(page);
				2915	offset += len;
				2916
				2917	if (unlikely(!uptodate)) {
				2918	if (extent_len) {
				2919	endio_readpage_release_extent(tree,
				2920	extent_start,
				2921	extent_len, 1);
				2922	extent_start = 0;
				2923	extent_len = 0;
				2924	}
				2925	endio_readpage_release_extent(tree, start,
				2926	end - start + 1, 0);
				2927	} else if (!extent_len) {
				2928	extent_start = start;
				2929	extent_len = end + 1 - start;
				2930	} else if (extent_start + extent_len == start) {
				2931	extent_len += end + 1 - start;
				2932	} else {
				2933	endio_readpage_release_extent(tree, extent_start,
				2934	extent_len, uptodate);
				2935	extent_start = start;
				2936	extent_len = end + 1 - start;
				2937	}
				2938	}
				2939
				2940	if (extent_len)
				2941	endio_readpage_release_extent(tree, extent_start, extent_len,
				2942	uptodate);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2943	btrfs_io_bio_free_csum(io_bio);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2944	bio_put(bio);
				2945	}
				2946
				2947	/*
				2948	* Initialize the members up to but not including 'bio'. Use after allocating a
				2949	* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
				2950	* 'bio' because use of __GFP_ZERO is not supported.
				2951	*/
				2952	static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
				2953	{
				2954	memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
				2955	}
				2956
				2957	/*
				2958	* The following helpers allocate a bio. As it's backed by a bioset, it'll
				2959	* never fail. We're returning a bio right now but you can call btrfs_io_bio
				2960	* for the appropriate container_of magic
				2961	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2962	struct bio *btrfs_bio_alloc(u64 first_byte)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2963	{
				2964	struct bio *bio;
				2965
				2966	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2967	bio->bi_iter.bi_sector = first_byte >> 9;
				2968	btrfs_io_bio_init(btrfs_io_bio(bio));
				2969	return bio;
				2970	}
				2971
				2972	struct bio btrfs_bio_clone(struct bio bio)
				2973	{
				2974	struct btrfs_io_bio *btrfs_bio;
				2975	struct bio *new;
				2976
				2977	/* Bio allocation backed by a bioset does not fail */
				2978	new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
				2979	btrfs_bio = btrfs_io_bio(new);
				2980	btrfs_io_bio_init(btrfs_bio);
				2981	btrfs_bio->iter = bio->bi_iter;
				2982	return new;
				2983	}
				2984
				2985	struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
				2986	{
				2987	struct bio *bio;
				2988
				2989	/* Bio allocation backed by a bioset does not fail */
				2990	bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
				2991	btrfs_io_bio_init(btrfs_io_bio(bio));
				2992	return bio;
				2993	}
				2994
				2995	struct bio btrfs_bio_clone_partial(struct bio orig, int offset, int size)
				2996	{
				2997	struct bio *bio;
				2998	struct btrfs_io_bio *btrfs_bio;
				2999
				3000	/* this will never fail when it's backed by a bioset */
				3001	bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
				3002	ASSERT(bio);
				3003
				3004	btrfs_bio = btrfs_io_bio(bio);
				3005	btrfs_io_bio_init(btrfs_bio);
				3006
				3007	bio_trim(bio, offset >> 9, size >> 9);
				3008	btrfs_bio->iter = bio->bi_iter;
				3009	return bio;
				3010	}
				3011
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3012	/*
				3013	* @opf: bio REQ_OP_* and REQ_* flags as one value
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3014	* @wbc: optional writeback control for io accounting
				3015	* @page: page to add to the bio
				3016	* @pg_offset: offset of the new bio or to check whether we are adding
				3017	* a contiguous page to the previous one
				3018	* @size: portion of page that we want to write
				3019	* @offset: starting offset in the page
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3020	* @bio_ret: must be valid pointer, newly allocated bio will be stored there
				3021	* @end_io_func: end_io callback for new bio
				3022	* @mirror_num: desired mirror to read/write
				3023	* @prev_bio_flags: flags of previous bio to see if we can merge the current one
				3024	* @bio_flags: flags of the current bio to see if we can merge them
				3025	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3026	static int submit_extent_page(unsigned int opf,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3027	struct writeback_control *wbc,
				3028	struct page *page, u64 offset,
				3029	size_t size, unsigned long pg_offset,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3030	struct bio **bio_ret,
				3031	bio_end_io_t end_io_func,
				3032	int mirror_num,
				3033	unsigned long prev_bio_flags,
				3034	unsigned long bio_flags,
				3035	bool force_bio_submit)
				3036	{
				3037	int ret = 0;
				3038	struct bio *bio;
				3039	size_t page_size = min_t(size_t, size, PAGE_SIZE);
				3040	sector_t sector = offset >> 9;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3041	struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3042
				3043	ASSERT(bio_ret);
				3044
				3045	if (*bio_ret) {
				3046	bool contig;
				3047	bool can_merge = true;
				3048
				3049	bio = *bio_ret;
				3050	if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
				3051	contig = bio->bi_iter.bi_sector == sector;
				3052	else
				3053	contig = bio_end_sector(bio) == sector;
				3054
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3055	if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3056	can_merge = false;
				3057
				3058	if (prev_bio_flags != bio_flags \|\| !contig \|\| !can_merge \|\|
				3059	force_bio_submit \|\|
				3060	bio_add_page(bio, page, page_size, pg_offset) < page_size) {
				3061	ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
				3062	if (ret < 0) {
				3063	*bio_ret = NULL;
				3064	return ret;
				3065	}
				3066	bio = NULL;
				3067	} else {
				3068	if (wbc)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3069	wbc_account_cgroup_owner(wbc, page, page_size);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3070	return 0;
				3071	}
				3072	}
				3073
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3074	bio = btrfs_bio_alloc(offset);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3075	bio_add_page(bio, page, page_size, pg_offset);
				3076	bio->bi_end_io = end_io_func;
				3077	bio->bi_private = tree;
				3078	bio->bi_write_hint = page->mapping->host->i_write_hint;
				3079	bio->bi_opf = opf;
				3080	if (wbc) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3081	struct block_device *bdev;
				3082
				3083	bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
				3084	bio_set_dev(bio, bdev);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3085	wbc_init_bio(wbc, bio);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3086	wbc_account_cgroup_owner(wbc, page, page_size);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3087	}
				3088
				3089	*bio_ret = bio;
				3090
				3091	return ret;
				3092	}
				3093
				3094	static void attach_extent_buffer_page(struct extent_buffer *eb,
				3095	struct page *page)
				3096	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3097	if (!PagePrivate(page))
				3098	attach_page_private(page, eb);
				3099	else
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3100	WARN_ON(page->private != (unsigned long)eb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3101	}
				3102
				3103	void set_page_extent_mapped(struct page *page)
				3104	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3105	if (!PagePrivate(page))
				3106	attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3107	}
				3108
				3109	static struct extent_map *
				3110	__get_extent_map(struct inode inode, struct page page, size_t pg_offset,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3111	u64 start, u64 len, struct extent_map **em_cached)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3112	{
				3113	struct extent_map *em;
				3114
				3115	if (em_cached && *em_cached) {
				3116	em = *em_cached;
				3117	if (extent_map_in_tree(em) && start >= em->start &&
				3118	start < extent_map_end(em)) {
				3119	refcount_inc(&em->refs);
				3120	return em;
				3121	}
				3122
				3123	free_extent_map(em);
				3124	*em_cached = NULL;
				3125	}
				3126
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3127	em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3128	if (em_cached && !IS_ERR_OR_NULL(em)) {
				3129	BUG_ON(*em_cached);
				3130	refcount_inc(&em->refs);
				3131	*em_cached = em;
				3132	}
				3133	return em;
				3134	}
				3135	/*
				3136	* basic readpage implementation. Locked extent state structs are inserted
				3137	* into the tree that are removed when the IO is done (by the end_io
				3138	* handlers)
				3139	* XXX JDM: This needs looking at to ensure proper page locking
				3140	* return 0 on success, otherwise return error
				3141	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3142	int btrfs_do_readpage(struct page page, struct extent_map *em_cached,
				3143	struct bio *bio, unsigned long bio_flags,
				3144	unsigned int read_flags, u64 *prev_em_start)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3145	{
				3146	struct inode *inode = page->mapping->host;
				3147	u64 start = page_offset(page);
				3148	const u64 end = start + PAGE_SIZE - 1;
				3149	u64 cur = start;
				3150	u64 extent_offset;
				3151	u64 last_byte = i_size_read(inode);
				3152	u64 block_start;
				3153	u64 cur_end;
				3154	struct extent_map *em;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3155	int ret = 0;
				3156	int nr = 0;
				3157	size_t pg_offset = 0;
				3158	size_t iosize;
				3159	size_t disk_io_size;
				3160	size_t blocksize = inode->i_sb->s_blocksize;
				3161	unsigned long this_bio_flag = 0;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3162	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3163
				3164	set_page_extent_mapped(page);
				3165
				3166	if (!PageUptodate(page)) {
				3167	if (cleancache_get_page(page) == 0) {
				3168	BUG_ON(blocksize != PAGE_SIZE);
				3169	unlock_extent(tree, start, end);
				3170	goto out;
				3171	}
				3172	}
				3173
				3174	if (page->index == last_byte >> PAGE_SHIFT) {
				3175	char *userpage;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3176	size_t zero_offset = offset_in_page(last_byte);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3177
				3178	if (zero_offset) {
				3179	iosize = PAGE_SIZE - zero_offset;
				3180	userpage = kmap_atomic(page);
				3181	memset(userpage + zero_offset, 0, iosize);
				3182	flush_dcache_page(page);
				3183	kunmap_atomic(userpage);
				3184	}
				3185	}
				3186	while (cur <= end) {
				3187	bool force_bio_submit = false;
				3188	u64 offset;
				3189
				3190	if (cur >= last_byte) {
				3191	char *userpage;
				3192	struct extent_state *cached = NULL;
				3193
				3194	iosize = PAGE_SIZE - pg_offset;
				3195	userpage = kmap_atomic(page);
				3196	memset(userpage + pg_offset, 0, iosize);
				3197	flush_dcache_page(page);
				3198	kunmap_atomic(userpage);
				3199	set_extent_uptodate(tree, cur, cur + iosize - 1,
				3200	&cached, GFP_NOFS);
				3201	unlock_extent_cached(tree, cur,
				3202	cur + iosize - 1, &cached);
				3203	break;
				3204	}
				3205	em = __get_extent_map(inode, page, pg_offset, cur,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3206	end - cur + 1, em_cached);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3207	if (IS_ERR_OR_NULL(em)) {
				3208	SetPageError(page);
				3209	unlock_extent(tree, cur, end);
				3210	break;
				3211	}
				3212	extent_offset = cur - em->start;
				3213	BUG_ON(extent_map_end(em) <= cur);
				3214	BUG_ON(end < cur);
				3215
				3216	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
				3217	this_bio_flag \|= EXTENT_BIO_COMPRESSED;
				3218	extent_set_compress_type(&this_bio_flag,
				3219	em->compress_type);
				3220	}
				3221
				3222	iosize = min(extent_map_end(em) - cur, end - cur + 1);
				3223	cur_end = min(extent_map_end(em) - 1, end);
				3224	iosize = ALIGN(iosize, blocksize);
				3225	if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
				3226	disk_io_size = em->block_len;
				3227	offset = em->block_start;
				3228	} else {
				3229	offset = em->block_start + extent_offset;
				3230	disk_io_size = iosize;
				3231	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3232	block_start = em->block_start;
				3233	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
				3234	block_start = EXTENT_MAP_HOLE;
				3235
				3236	/*
				3237	* If we have a file range that points to a compressed extent
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	3238	* and it's followed by a consecutive file range that points
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3239	* to the same compressed extent (possibly with a different
				3240	* offset and/or length, so it either points to the whole extent
				3241	* or only part of it), we must make sure we do not submit a
				3242	* single bio to populate the pages for the 2 ranges because
				3243	* this makes the compressed extent read zero out the pages
				3244	* belonging to the 2nd range. Imagine the following scenario:
				3245	*
				3246	* File layout
				3247	* [0 - 8K] [8K - 24K]
				3248	* \| \|
				3249	* \| \|
				3250	* points to extent X, points to extent X,
				3251	* offset 4K, length of 8K offset 0, length 16K
				3252	*
				3253	* [extent X, compressed length = 4K uncompressed length = 16K]
				3254	*
				3255	* If the bio to read the compressed extent covers both ranges,
				3256	* it will decompress extent X into the pages belonging to the
				3257	* first range and then it will stop, zeroing out the remaining
				3258	* pages that belong to the other range that points to extent X.
				3259	* So here we make sure we submit 2 bios, one for the first
				3260	* range and another one for the third range. Both will target
				3261	* the same physical extent from disk, but we can't currently
				3262	* make the compressed bio endio callback populate the pages
				3263	* for both ranges because each compressed bio is tightly
				3264	* coupled with a single extent map, and each range can have
				3265	* an extent map with a different offset value relative to the
				3266	* uncompressed data of our extent and different lengths. This
				3267	* is a corner case so we prioritize correctness over
				3268	* non-optimal behavior (submitting 2 bios for the same extent).
				3269	*/
				3270	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
				3271	prev_em_start && *prev_em_start != (u64)-1 &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3272	*prev_em_start != em->start)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3273	force_bio_submit = true;
				3274
				3275	if (prev_em_start)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3276	*prev_em_start = em->start;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3277
				3278	free_extent_map(em);
				3279	em = NULL;
				3280
				3281	/* we've found a hole, just zero and go on */
				3282	if (block_start == EXTENT_MAP_HOLE) {
				3283	char *userpage;
				3284	struct extent_state *cached = NULL;
				3285
				3286	userpage = kmap_atomic(page);
				3287	memset(userpage + pg_offset, 0, iosize);
				3288	flush_dcache_page(page);
				3289	kunmap_atomic(userpage);
				3290
				3291	set_extent_uptodate(tree, cur, cur + iosize - 1,
				3292	&cached, GFP_NOFS);
				3293	unlock_extent_cached(tree, cur,
				3294	cur + iosize - 1, &cached);
				3295	cur = cur + iosize;
				3296	pg_offset += iosize;
				3297	continue;
				3298	}
				3299	/* the get_extent function already copied into the page */
				3300	if (test_range_bit(tree, cur, cur_end,
				3301	EXTENT_UPTODATE, 1, NULL)) {
				3302	check_page_uptodate(tree, page);
				3303	unlock_extent(tree, cur, cur + iosize - 1);
				3304	cur = cur + iosize;
				3305	pg_offset += iosize;
				3306	continue;
				3307	}
				3308	/* we have an inline extent but it didn't get marked up
				3309	* to date. Error out
				3310	*/
				3311	if (block_start == EXTENT_MAP_INLINE) {
				3312	SetPageError(page);
				3313	unlock_extent(tree, cur, cur + iosize - 1);
				3314	cur = cur + iosize;
				3315	pg_offset += iosize;
				3316	continue;
				3317	}
				3318
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3319	ret = submit_extent_page(REQ_OP_READ \| read_flags, NULL,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3320	page, offset, disk_io_size,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3321	pg_offset, bio,
				3322	end_bio_extent_readpage, 0,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3323	*bio_flags,
				3324	this_bio_flag,
				3325	force_bio_submit);
				3326	if (!ret) {
				3327	nr++;
				3328	*bio_flags = this_bio_flag;
				3329	} else {
				3330	SetPageError(page);
				3331	unlock_extent(tree, cur, cur + iosize - 1);
				3332	goto out;
				3333	}
				3334	cur = cur + iosize;
				3335	pg_offset += iosize;
				3336	}
				3337	out:
				3338	if (!nr) {
				3339	if (!PageError(page))
				3340	SetPageUptodate(page);
				3341	unlock_page(page);
				3342	}
				3343	return ret;
				3344	}
				3345
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3346	static inline void contiguous_readpages(struct page *pages[], int nr_pages,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3347	u64 start, u64 end,
				3348	struct extent_map **em_cached,
				3349	struct bio **bio,
				3350	unsigned long *bio_flags,
				3351	u64 *prev_em_start)
				3352	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3353	struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3354	int index;
				3355
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3356	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3357
				3358	for (index = 0; index < nr_pages; index++) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3359	btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
				3360	REQ_RAHEAD, prev_em_start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3361	put_page(pages[index]);
				3362	}
				3363	}
				3364
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3365	static void update_nr_written(struct writeback_control *wbc,
				3366	unsigned long nr_written)
				3367	{
				3368	wbc->nr_to_write -= nr_written;
				3369	}
				3370
				3371	/*
				3372	* helper for __extent_writepage, doing all of the delayed allocation setup.
				3373	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3374	* This returns 1 if btrfs_run_delalloc_range function did all the work required
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3375	* to write the page (copy into inline extent). In this case the IO has
				3376	* been started and the page is already unlocked.
				3377	*
				3378	* This returns 0 if all went well (page still locked)
				3379	* This returns < 0 if there were errors (page still locked)
				3380	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3381	static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3382	struct page page, struct writeback_control wbc,
				3383	u64 delalloc_start, unsigned long *nr_written)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3384	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3385	u64 page_end = delalloc_start + PAGE_SIZE - 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3386	bool found;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3387	u64 delalloc_to_write = 0;
				3388	u64 delalloc_end = 0;
				3389	int ret;
				3390	int page_started = 0;
				3391
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3392
				3393	while (delalloc_end < page_end) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3394	found = find_lock_delalloc_range(&inode->vfs_inode, page,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3395	&delalloc_start,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3396	&delalloc_end);
				3397	if (!found) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3398	delalloc_start = delalloc_end + 1;
				3399	continue;
				3400	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3401	ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
				3402	delalloc_end, &page_started, nr_written, wbc);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3403	if (ret) {
				3404	SetPageError(page);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3405	/*
				3406	* btrfs_run_delalloc_range should return < 0 for error
				3407	* but just in case, we use > 0 here meaning the IO is
				3408	* started, so we don't want to return > 0 unless
				3409	* things are going well.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3410	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3411	return ret < 0 ? ret : -EIO;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3412	}
				3413	/*
				3414	* delalloc_end is already one less than the total length, so
				3415	* we don't subtract one from PAGE_SIZE
				3416	*/
				3417	delalloc_to_write += (delalloc_end - delalloc_start +
				3418	PAGE_SIZE) >> PAGE_SHIFT;
				3419	delalloc_start = delalloc_end + 1;
				3420	}
				3421	if (wbc->nr_to_write < delalloc_to_write) {
				3422	int thresh = 8192;
				3423
				3424	if (delalloc_to_write < thresh * 2)
				3425	thresh = delalloc_to_write;
				3426	wbc->nr_to_write = min_t(u64, delalloc_to_write,
				3427	thresh);
				3428	}
				3429
				3430	/* did the fill delalloc function already unlock and start
				3431	* the IO?
				3432	*/
				3433	if (page_started) {
				3434	/*
				3435	* we've unlocked the page, so we can't update
				3436	* the mapping's writeback index, just update
				3437	* nr_to_write.
				3438	*/
				3439	wbc->nr_to_write -= *nr_written;
				3440	return 1;
				3441	}
				3442
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3443	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3444	}
				3445
				3446	/*
				3447	* helper for __extent_writepage. This calls the writepage start hooks,
				3448	* and does the loop to map the page into extents and bios.
				3449	*
				3450	* We return 1 if the IO is started and the page is unlocked,
				3451	* 0 if all went well (page still locked)
				3452	* < 0 if there were errors (page still locked)
				3453	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3454	static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3455	struct page *page,
				3456	struct writeback_control *wbc,
				3457	struct extent_page_data *epd,
				3458	loff_t i_size,
				3459	unsigned long nr_written,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3460	int *nr_ret)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3461	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3462	struct extent_io_tree *tree = &inode->io_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3463	u64 start = page_offset(page);
				3464	u64 page_end = start + PAGE_SIZE - 1;
				3465	u64 end;
				3466	u64 cur = start;
				3467	u64 extent_offset;
				3468	u64 block_start;
				3469	u64 iosize;
				3470	struct extent_map *em;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3471	size_t pg_offset = 0;
				3472	size_t blocksize;
				3473	int ret = 0;
				3474	int nr = 0;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3475	const unsigned int write_flags = wbc_to_write_flags(wbc);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3476	bool compressed;
				3477
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3478	ret = btrfs_writepage_cow_fixup(page, start, page_end);
				3479	if (ret) {
				3480	/* Fixup worker will requeue */
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3481	redirty_page_for_writepage(wbc, page);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3482	update_nr_written(wbc, nr_written);
				3483	unlock_page(page);
				3484	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3485	}
				3486
				3487	/*
				3488	* we don't want to touch the inode after unlocking the page,
				3489	* so we update the mapping writeback index now
				3490	*/
				3491	update_nr_written(wbc, nr_written + 1);
				3492
				3493	end = page_end;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3494	blocksize = inode->vfs_inode.i_sb->s_blocksize;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3495
				3496	while (cur <= end) {
				3497	u64 em_end;
				3498	u64 offset;
				3499
				3500	if (cur >= i_size) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3501	btrfs_writepage_endio_finish_ordered(page, cur,
				3502	page_end, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3503	break;
				3504	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3505	em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3506	if (IS_ERR_OR_NULL(em)) {
				3507	SetPageError(page);
				3508	ret = PTR_ERR_OR_ZERO(em);
				3509	break;
				3510	}
				3511
				3512	extent_offset = cur - em->start;
				3513	em_end = extent_map_end(em);
				3514	BUG_ON(em_end <= cur);
				3515	BUG_ON(end < cur);
				3516	iosize = min(em_end - cur, end - cur + 1);
				3517	iosize = ALIGN(iosize, blocksize);
				3518	offset = em->block_start + extent_offset;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3519	block_start = em->block_start;
				3520	compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
				3521	free_extent_map(em);
				3522	em = NULL;
				3523
				3524	/*
				3525	* compressed and inline extents are written through other
				3526	* paths in the FS
				3527	*/
				3528	if (compressed \|\| block_start == EXTENT_MAP_HOLE \|\|
				3529	block_start == EXTENT_MAP_INLINE) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3530	if (compressed)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3531	nr++;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3532	else
				3533	btrfs_writepage_endio_finish_ordered(page, cur,
				3534	cur + iosize - 1, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3535	cur += iosize;
				3536	pg_offset += iosize;
				3537	continue;
				3538	}
				3539
				3540	btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
				3541	if (!PageWriteback(page)) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3542	btrfs_err(inode->root->fs_info,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3543	"page %lu not writeback, cur %llu end %llu",
				3544	page->index, cur, end);
				3545	}
				3546
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3547	ret = submit_extent_page(REQ_OP_WRITE \| write_flags, wbc,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3548	page, offset, iosize, pg_offset,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3549	&epd->bio,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3550	end_bio_extent_writepage,
				3551	0, 0, 0, false);
				3552	if (ret) {
				3553	SetPageError(page);
				3554	if (PageWriteback(page))
				3555	end_page_writeback(page);
				3556	}
				3557
				3558	cur = cur + iosize;
				3559	pg_offset += iosize;
				3560	nr++;
				3561	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3562	*nr_ret = nr;
				3563	return ret;
				3564	}
				3565
				3566	/*
				3567	* the writepage semantics are similar to regular writepage. extent
				3568	* records are inserted to lock ranges in the tree, and as dirty areas
				3569	* are found, they are marked writeback. Then the lock bits are removed
				3570	* and the end_io handler clears the writeback ranges
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3571	*
				3572	* Return 0 if everything goes well.
				3573	* Return <0 for error.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3574	*/
				3575	static int __extent_writepage(struct page page, struct writeback_control wbc,
				3576	struct extent_page_data *epd)
				3577	{
				3578	struct inode *inode = page->mapping->host;
				3579	u64 start = page_offset(page);
				3580	u64 page_end = start + PAGE_SIZE - 1;
				3581	int ret;
				3582	int nr = 0;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3583	size_t pg_offset;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3584	loff_t i_size = i_size_read(inode);
				3585	unsigned long end_index = i_size >> PAGE_SHIFT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3586	unsigned long nr_written = 0;
				3587
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3588	trace___extent_writepage(page, inode, wbc);
				3589
				3590	WARN_ON(!PageLocked(page));
				3591
				3592	ClearPageError(page);
				3593
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3594	pg_offset = offset_in_page(i_size);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3595	if (page->index > end_index \|\|
				3596	(page->index == end_index && !pg_offset)) {
				3597	page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
				3598	unlock_page(page);
				3599	return 0;
				3600	}
				3601
				3602	if (page->index == end_index) {
				3603	char *userpage;
				3604
				3605	userpage = kmap_atomic(page);
				3606	memset(userpage + pg_offset, 0,
				3607	PAGE_SIZE - pg_offset);
				3608	kunmap_atomic(userpage);
				3609	flush_dcache_page(page);
				3610	}
				3611
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3612	set_page_extent_mapped(page);
				3613
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3614	if (!epd->extent_locked) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3615	ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
				3616	&nr_written);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3617	if (ret == 1)
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3618	return 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3619	if (ret)
				3620	goto done;
				3621	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3622
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3623	ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
				3624	nr_written, &nr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3625	if (ret == 1)
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3626	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3627
				3628	done:
				3629	if (nr == 0) {
				3630	/* make sure the mapping tag for page dirty gets cleared */
				3631	set_page_writeback(page);
				3632	end_page_writeback(page);
				3633	}
				3634	if (PageError(page)) {
				3635	ret = ret < 0 ? ret : -EIO;
				3636	end_extent_writepage(page, ret, start, page_end);
				3637	}
				3638	unlock_page(page);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3639	ASSERT(ret <= 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3640	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3641	}
				3642
				3643	void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
				3644	{
				3645	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
				3646	TASK_UNINTERRUPTIBLE);
				3647	}
				3648
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3649	static void end_extent_buffer_writeback(struct extent_buffer *eb)
				3650	{
				3651	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
				3652	smp_mb__after_atomic();
				3653	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
				3654	}
				3655
				3656	/*
				3657	* Lock eb pages and flush the bio if we can't the locks
				3658	*
				3659	* Return 0 if nothing went wrong
				3660	* Return >0 is same as 0, except bio is not submitted
				3661	* Return <0 if something went wrong, no page is locked
				3662	*/
				3663	static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3664	struct extent_page_data *epd)
				3665	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3666	struct btrfs_fs_info *fs_info = eb->fs_info;
				3667	int i, num_pages, failed_page_nr;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3668	int flush = 0;
				3669	int ret = 0;
				3670
				3671	if (!btrfs_try_tree_write_lock(eb)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3672	ret = flush_write_bio(epd);
				3673	if (ret < 0)
				3674	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3675	flush = 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3676	btrfs_tree_lock(eb);
				3677	}
				3678
				3679	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
				3680	btrfs_tree_unlock(eb);
				3681	if (!epd->sync_io)
				3682	return 0;
				3683	if (!flush) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3684	ret = flush_write_bio(epd);
				3685	if (ret < 0)
				3686	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3687	flush = 1;
				3688	}
				3689	while (1) {
				3690	wait_on_extent_buffer_writeback(eb);
				3691	btrfs_tree_lock(eb);
				3692	if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
				3693	break;
				3694	btrfs_tree_unlock(eb);
				3695	}
				3696	}
				3697
				3698	/*
				3699	* We need to do this to prevent races in people who check if the eb is
				3700	* under IO since we can end up having no IO bits set for a short period
				3701	* of time.
				3702	*/
				3703	spin_lock(&eb->refs_lock);
				3704	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
				3705	set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
				3706	spin_unlock(&eb->refs_lock);
				3707	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
				3708	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
				3709	-eb->len,
				3710	fs_info->dirty_metadata_batch);
				3711	ret = 1;
				3712	} else {
				3713	spin_unlock(&eb->refs_lock);
				3714	}
				3715
				3716	btrfs_tree_unlock(eb);
				3717
				3718	if (!ret)
				3719	return ret;
				3720
				3721	num_pages = num_extent_pages(eb);
				3722	for (i = 0; i < num_pages; i++) {
				3723	struct page *p = eb->pages[i];
				3724
				3725	if (!trylock_page(p)) {
				3726	if (!flush) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3727	int err;
				3728
				3729	err = flush_write_bio(epd);
				3730	if (err < 0) {
				3731	ret = err;
				3732	failed_page_nr = i;
				3733	goto err_unlock;
				3734	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3735	flush = 1;
				3736	}
				3737	lock_page(p);
				3738	}
				3739	}
				3740
				3741	return ret;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3742	err_unlock:
				3743	/* Unlock already locked pages */
				3744	for (i = 0; i < failed_page_nr; i++)
				3745	unlock_page(eb->pages[i]);
				3746	/*
				3747	* Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
				3748	* Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
				3749	* be made and undo everything done before.
				3750	*/
				3751	btrfs_tree_lock(eb);
				3752	spin_lock(&eb->refs_lock);
				3753	set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
				3754	end_extent_buffer_writeback(eb);
				3755	spin_unlock(&eb->refs_lock);
				3756	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
				3757	fs_info->dirty_metadata_batch);
				3758	btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
				3759	btrfs_tree_unlock(eb);
				3760	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3761	}
				3762
				3763	static void set_btree_ioerr(struct page *page)
				3764	{
				3765	struct extent_buffer eb = (struct extent_buffer )page->private;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3766	struct btrfs_fs_info *fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3767
				3768	SetPageError(page);
				3769	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
				3770	return;
				3771
				3772	/*
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3773	* A read may stumble upon this buffer later, make sure that it gets an
				3774	* error and knows there was an error.
				3775	*/
				3776	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				3777
				3778	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3779	* If we error out, we should add back the dirty_metadata_bytes
				3780	* to make it consistent.
				3781	*/
				3782	fs_info = eb->fs_info;
				3783	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
				3784	eb->len, fs_info->dirty_metadata_batch);
				3785
				3786	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3787	* If writeback for a btree extent that doesn't belong to a log tree
				3788	* failed, increment the counter transaction->eb_write_errors.
				3789	* We do this because while the transaction is running and before it's
				3790	* committing (when we call filemap_fdata[write\|wait]_range against
				3791	* the btree inode), we might have
				3792	* btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
				3793	* returns an error or an error happens during writeback, when we're
				3794	* committing the transaction we wouldn't know about it, since the pages
				3795	* can be no longer dirty nor marked anymore for writeback (if a
				3796	* subsequent modification to the extent buffer didn't happen before the
				3797	* transaction commit), which makes filemap_fdata[write\|wait]_range not
				3798	* able to find the pages tagged with SetPageError at transaction
				3799	* commit time. So if this happens we must abort the transaction,
				3800	* otherwise we commit a super block with btree roots that point to
				3801	* btree nodes/leafs whose content on disk is invalid - either garbage
				3802	* or the content of some node/leaf from a past generation that got
				3803	* cowed or deleted and is no longer valid.
				3804	*
				3805	* Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
				3806	* not be enough - we need to distinguish between log tree extents vs
				3807	* non-log tree extents, and the next filemap_fdatawait_range() call
				3808	* will catch and clear such errors in the mapping - and that call might
				3809	* be from a log sync and not from a transaction commit. Also, checking
				3810	* for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
				3811	* not done and would not be reliable - the eb might have been released
				3812	* from memory and reading it back again means that flag would not be
				3813	* set (since it's a runtime flag, not persisted on disk).
				3814	*
				3815	* Using the flags below in the btree inode also makes us achieve the
				3816	* goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
				3817	* writeback for all dirty pages and before filemap_fdatawait_range()
				3818	* is called, the writeback for all dirty pages had already finished
				3819	* with errors - because we were not using AS_EIO/AS_ENOSPC,
				3820	* filemap_fdatawait_range() would return success, as it could not know
				3821	* that writeback errors happened (the pages were no longer tagged for
				3822	* writeback).
				3823	*/
				3824	switch (eb->log_index) {
				3825	case -1:
				3826	set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
				3827	break;
				3828	case 0:
				3829	set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
				3830	break;
				3831	case 1:
				3832	set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
				3833	break;
				3834	default:
				3835	BUG(); /* unexpected, logic error */
				3836	}
				3837	}
				3838
				3839	static void end_bio_extent_buffer_writepage(struct bio *bio)
				3840	{
				3841	struct bio_vec *bvec;
				3842	struct extent_buffer *eb;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3843	int done;
				3844	struct bvec_iter_all iter_all;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3845
				3846	ASSERT(!bio_flagged(bio, BIO_CLONED));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3847	bio_for_each_segment_all(bvec, bio, iter_all) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3848	struct page *page = bvec->bv_page;
				3849
				3850	eb = (struct extent_buffer *)page->private;
				3851	BUG_ON(!eb);
				3852	done = atomic_dec_and_test(&eb->io_pages);
				3853
				3854	if (bio->bi_status \|\|
				3855	test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
				3856	ClearPageUptodate(page);
				3857	set_btree_ioerr(page);
				3858	}
				3859
				3860	end_page_writeback(page);
				3861
				3862	if (!done)
				3863	continue;
				3864
				3865	end_extent_buffer_writeback(eb);
				3866	}
				3867
				3868	bio_put(bio);
				3869	}
				3870
				3871	static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3872	struct writeback_control *wbc,
				3873	struct extent_page_data *epd)
				3874	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3875	u64 offset = eb->start;
				3876	u32 nritems;
				3877	int i, num_pages;
				3878	unsigned long start, end;
				3879	unsigned int write_flags = wbc_to_write_flags(wbc) \| REQ_META;
				3880	int ret = 0;
				3881
				3882	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
				3883	num_pages = num_extent_pages(eb);
				3884	atomic_set(&eb->io_pages, num_pages);
				3885
				3886	/* set btree blocks beyond nritems with 0 to avoid stale content. */
				3887	nritems = btrfs_header_nritems(eb);
				3888	if (btrfs_header_level(eb) > 0) {
				3889	end = btrfs_node_key_ptr_offset(nritems);
				3890
				3891	memzero_extent_buffer(eb, end, eb->len - end);
				3892	} else {
				3893	/*
				3894	* leaf:
				3895	* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
				3896	*/
				3897	start = btrfs_item_nr_offset(nritems);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3898	end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3899	memzero_extent_buffer(eb, start, end - start);
				3900	}
				3901
				3902	for (i = 0; i < num_pages; i++) {
				3903	struct page *p = eb->pages[i];
				3904
				3905	clear_page_dirty_for_io(p);
				3906	set_page_writeback(p);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	3907	ret = submit_extent_page(REQ_OP_WRITE \| write_flags, wbc,
				3908	p, offset, PAGE_SIZE, 0,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3909	&epd->bio,
				3910	end_bio_extent_buffer_writepage,
				3911	0, 0, 0, false);
				3912	if (ret) {
				3913	set_btree_ioerr(p);
				3914	if (PageWriteback(p))
				3915	end_page_writeback(p);
				3916	if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
				3917	end_extent_buffer_writeback(eb);
				3918	ret = -EIO;
				3919	break;
				3920	}
				3921	offset += PAGE_SIZE;
				3922	update_nr_written(wbc, 1);
				3923	unlock_page(p);
				3924	}
				3925
				3926	if (unlikely(ret)) {
				3927	for (; i < num_pages; i++) {
				3928	struct page *p = eb->pages[i];
				3929	clear_page_dirty_for_io(p);
				3930	unlock_page(p);
				3931	}
				3932	}
				3933
				3934	return ret;
				3935	}
				3936
				3937	int btree_write_cache_pages(struct address_space *mapping,
				3938	struct writeback_control *wbc)
				3939	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3940	struct extent_buffer eb, prev_eb = NULL;
				3941	struct extent_page_data epd = {
				3942	.bio = NULL,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3943	.extent_locked = 0,
				3944	.sync_io = wbc->sync_mode == WB_SYNC_ALL,
				3945	};
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	3946	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3947	int ret = 0;
				3948	int done = 0;
				3949	int nr_to_write_done = 0;
				3950	struct pagevec pvec;
				3951	int nr_pages;
				3952	pgoff_t index;
				3953	pgoff_t end; /* Inclusive */
				3954	int scanned = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3955	xa_mark_t tag;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3956
				3957	pagevec_init(&pvec);
				3958	if (wbc->range_cyclic) {
				3959	index = mapping->writeback_index; /* Start from prev offset */
				3960	end = -1;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	3961	/*
				3962	* Start from the beginning does not need to cycle over the
				3963	* range, mark it as scanned.
				3964	*/
				3965	scanned = (index == 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3966	} else {
				3967	index = wbc->range_start >> PAGE_SHIFT;
				3968	end = wbc->range_end >> PAGE_SHIFT;
				3969	scanned = 1;
				3970	}
				3971	if (wbc->sync_mode == WB_SYNC_ALL)
				3972	tag = PAGECACHE_TAG_TOWRITE;
				3973	else
				3974	tag = PAGECACHE_TAG_DIRTY;
				3975	retry:
				3976	if (wbc->sync_mode == WB_SYNC_ALL)
				3977	tag_pages_for_writeback(mapping, index, end);
				3978	while (!done && !nr_to_write_done && (index <= end) &&
				3979	(nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
				3980	tag))) {
				3981	unsigned i;
				3982
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3983	for (i = 0; i < nr_pages; i++) {
				3984	struct page *page = pvec.pages[i];
				3985
				3986	if (!PagePrivate(page))
				3987	continue;
				3988
				3989	spin_lock(&mapping->private_lock);
				3990	if (!PagePrivate(page)) {
				3991	spin_unlock(&mapping->private_lock);
				3992	continue;
				3993	}
				3994
				3995	eb = (struct extent_buffer *)page->private;
				3996
				3997	/*
				3998	* Shouldn't happen and normally this would be a BUG_ON
				3999	* but no sense in crashing the users box for something
				4000	* we can survive anyway.
				4001	*/
				4002	if (WARN_ON(!eb)) {
				4003	spin_unlock(&mapping->private_lock);
				4004	continue;
				4005	}
				4006
				4007	if (eb == prev_eb) {
				4008	spin_unlock(&mapping->private_lock);
				4009	continue;
				4010	}
				4011
				4012	ret = atomic_inc_not_zero(&eb->refs);
				4013	spin_unlock(&mapping->private_lock);
				4014	if (!ret)
				4015	continue;
				4016
				4017	prev_eb = eb;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4018	ret = lock_extent_buffer_for_io(eb, &epd);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4019	if (!ret) {
				4020	free_extent_buffer(eb);
				4021	continue;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4022	} else if (ret < 0) {
				4023	done = 1;
				4024	free_extent_buffer(eb);
				4025	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4026	}
				4027
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4028	ret = write_one_eb(eb, wbc, &epd);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4029	if (ret) {
				4030	done = 1;
				4031	free_extent_buffer(eb);
				4032	break;
				4033	}
				4034	free_extent_buffer(eb);
				4035
				4036	/*
				4037	* the filesystem may choose to bump up nr_to_write.
				4038	* We have to make sure to honor the new nr_to_write
				4039	* at any time
				4040	*/
				4041	nr_to_write_done = wbc->nr_to_write <= 0;
				4042	}
				4043	pagevec_release(&pvec);
				4044	cond_resched();
				4045	}
				4046	if (!scanned && !done) {
				4047	/*
				4048	* We hit the last page and there is more work to be done: wrap
				4049	* back to the start of the file
				4050	*/
				4051	scanned = 1;
				4052	index = 0;
				4053	goto retry;
				4054	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4055	ASSERT(ret <= 0);
				4056	if (ret < 0) {
				4057	end_write_bio(&epd, ret);
				4058	return ret;
				4059	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	4060	/*
				4061	* If something went wrong, don't allow any metadata write bio to be
				4062	* submitted.
				4063	*
				4064	* This would prevent use-after-free if we had dirty pages not
				4065	* cleaned up, which can still happen by fuzzed images.
				4066	*
				4067	* - Bad extent tree
				4068	* Allowing existing tree block to be allocated for other trees.
				4069	*
				4070	* - Log tree operations
				4071	* Exiting tree blocks get allocated to log tree, bumps its
				4072	* generation, then get cleaned in tree re-balance.
				4073	* Such tree block will not be written back, since it's clean,
				4074	* thus no WRITTEN flag set.
				4075	* And after log writes back, this tree block is not traced by
				4076	* any dirty extent_io_tree.
				4077	*
				4078	* - Offending tree block gets re-dirtied from its original owner
				4079	* Since it has bumped generation, no WRITTEN flag, it can be
				4080	* reused without COWing. This tree block will not be traced
				4081	* by btrfs_transaction::dirty_pages.
				4082	*
				4083	* Now such dirty tree block will not be cleaned by any dirty
				4084	* extent io tree. Thus we don't want to submit such wild eb
				4085	* if the fs already has error.
				4086	*/
				4087	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
				4088	ret = flush_write_bio(&epd);
				4089	} else {
				4090	ret = -EROFS;
				4091	end_write_bio(&epd, ret);
				4092	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4093	return ret;
				4094	}
				4095
				4096	/**
				4097	* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
				4098	* @mapping: address space structure to write
				4099	* @wbc: subtract the number of written pages from *@wbc->nr_to_write
				4100	* @data: data passed to __extent_writepage function
				4101	*
				4102	* If a page is already under I/O, write_cache_pages() skips it, even
				4103	* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
				4104	* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
				4105	* and msync() need to guarantee that all the data which was dirty at the time
				4106	* the call was made get new I/O started against them. If wbc->sync_mode is
				4107	* WB_SYNC_ALL then we were called for data integrity and we must wait for
				4108	* existing IO to complete.
				4109	*/
				4110	static int extent_write_cache_pages(struct address_space *mapping,
				4111	struct writeback_control *wbc,
				4112	struct extent_page_data *epd)
				4113	{
				4114	struct inode *inode = mapping->host;
				4115	int ret = 0;
				4116	int done = 0;
				4117	int nr_to_write_done = 0;
				4118	struct pagevec pvec;
				4119	int nr_pages;
				4120	pgoff_t index;
				4121	pgoff_t end; /* Inclusive */
				4122	pgoff_t done_index;
				4123	int range_whole = 0;
				4124	int scanned = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4125	xa_mark_t tag;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4126
				4127	/*
				4128	* We have to hold onto the inode so that ordered extents can do their
				4129	* work when the IO finishes. The alternative to this is failing to add
				4130	* an ordered extent if the igrab() fails there and that is a huge pain
				4131	* to deal with, so instead just hold onto the inode throughout the
				4132	* writepages operation. If it fails here we are freeing up the inode
				4133	* anyway and we'd rather not waste our time writing out stuff that is
				4134	* going to be truncated anyway.
				4135	*/
				4136	if (!igrab(inode))
				4137	return 0;
				4138
				4139	pagevec_init(&pvec);
				4140	if (wbc->range_cyclic) {
				4141	index = mapping->writeback_index; /* Start from prev offset */
				4142	end = -1;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	4143	/*
				4144	* Start from the beginning does not need to cycle over the
				4145	* range, mark it as scanned.
				4146	*/
				4147	scanned = (index == 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4148	} else {
				4149	index = wbc->range_start >> PAGE_SHIFT;
				4150	end = wbc->range_end >> PAGE_SHIFT;
				4151	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
				4152	range_whole = 1;
				4153	scanned = 1;
				4154	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4155
				4156	/*
				4157	* We do the tagged writepage as long as the snapshot flush bit is set
				4158	* and we are the first one who do the filemap_flush() on this inode.
				4159	*
				4160	* The nr_to_write == LONG_MAX is needed to make sure other flushers do
				4161	* not race in and drop the bit.
				4162	*/
				4163	if (range_whole && wbc->nr_to_write == LONG_MAX &&
				4164	test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
				4165	&BTRFS_I(inode)->runtime_flags))
				4166	wbc->tagged_writepages = 1;
				4167
				4168	if (wbc->sync_mode == WB_SYNC_ALL \|\| wbc->tagged_writepages)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4169	tag = PAGECACHE_TAG_TOWRITE;
				4170	else
				4171	tag = PAGECACHE_TAG_DIRTY;
				4172	retry:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4173	if (wbc->sync_mode == WB_SYNC_ALL \|\| wbc->tagged_writepages)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4174	tag_pages_for_writeback(mapping, index, end);
				4175	done_index = index;
				4176	while (!done && !nr_to_write_done && (index <= end) &&
				4177	(nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
				4178	&index, end, tag))) {
				4179	unsigned i;
				4180
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4181	for (i = 0; i < nr_pages; i++) {
				4182	struct page *page = pvec.pages[i];
				4183
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	4184	done_index = page->index + 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4185	/*
				4186	* At this point we hold neither the i_pages lock nor
				4187	* the page lock: the page may be truncated or
				4188	* invalidated (changing page->mapping to NULL),
				4189	* or even swizzled back from swapper_space to
				4190	* tmpfs file mapping
				4191	*/
				4192	if (!trylock_page(page)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4193	ret = flush_write_bio(epd);
				4194	BUG_ON(ret < 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4195	lock_page(page);
				4196	}
				4197
				4198	if (unlikely(page->mapping != mapping)) {
				4199	unlock_page(page);
				4200	continue;
				4201	}
				4202
				4203	if (wbc->sync_mode != WB_SYNC_NONE) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4204	if (PageWriteback(page)) {
				4205	ret = flush_write_bio(epd);
				4206	BUG_ON(ret < 0);
				4207	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4208	wait_on_page_writeback(page);
				4209	}
				4210
				4211	if (PageWriteback(page) \|\|
				4212	!clear_page_dirty_for_io(page)) {
				4213	unlock_page(page);
				4214	continue;
				4215	}
				4216
				4217	ret = __extent_writepage(page, wbc, epd);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4218	if (ret < 0) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4219	done = 1;
				4220	break;
				4221	}
				4222
				4223	/*
				4224	* the filesystem may choose to bump up nr_to_write.
				4225	* We have to make sure to honor the new nr_to_write
				4226	* at any time
				4227	*/
				4228	nr_to_write_done = wbc->nr_to_write <= 0;
				4229	}
				4230	pagevec_release(&pvec);
				4231	cond_resched();
				4232	}
				4233	if (!scanned && !done) {
				4234	/*
				4235	* We hit the last page and there is more work to be done: wrap
				4236	* back to the start of the file
				4237	*/
				4238	scanned = 1;
				4239	index = 0;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	4240
				4241	/*
				4242	* If we're looping we could run into a page that is locked by a
				4243	* writer and that writer could be waiting on writeback for a
				4244	* page in our current bio, and thus deadlock, so flush the
				4245	* write bio here.
				4246	*/
				4247	ret = flush_write_bio(epd);
				4248	if (!ret)
				4249	goto retry;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4250	}
				4251
				4252	if (wbc->range_cyclic \|\| (wbc->nr_to_write > 0 && range_whole))
				4253	mapping->writeback_index = done_index;
				4254
				4255	btrfs_add_delayed_iput(inode);
				4256	return ret;
				4257	}
				4258
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4259	int extent_write_full_page(struct page page, struct writeback_control wbc)
				4260	{
				4261	int ret;
				4262	struct extent_page_data epd = {
				4263	.bio = NULL,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4264	.extent_locked = 0,
				4265	.sync_io = wbc->sync_mode == WB_SYNC_ALL,
				4266	};
				4267
				4268	ret = __extent_writepage(page, wbc, &epd);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4269	ASSERT(ret <= 0);
				4270	if (ret < 0) {
				4271	end_write_bio(&epd, ret);
				4272	return ret;
				4273	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4274
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4275	ret = flush_write_bio(&epd);
				4276	ASSERT(ret <= 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4277	return ret;
				4278	}
				4279
				4280	int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
				4281	int mode)
				4282	{
				4283	int ret = 0;
				4284	struct address_space *mapping = inode->i_mapping;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4285	struct page *page;
				4286	unsigned long nr_pages = (end - start + PAGE_SIZE) >>
				4287	PAGE_SHIFT;
				4288
				4289	struct extent_page_data epd = {
				4290	.bio = NULL,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4291	.extent_locked = 1,
				4292	.sync_io = mode == WB_SYNC_ALL,
				4293	};
				4294	struct writeback_control wbc_writepages = {
				4295	.sync_mode = mode,
				4296	.nr_to_write = nr_pages * 2,
				4297	.range_start = start,
				4298	.range_end = end + 1,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4299	/* We're called from an async helper function */
				4300	.punt_to_cgroup = 1,
				4301	.no_cgroup_owner = 1,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4302	};
				4303
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4304	wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4305	while (start <= end) {
				4306	page = find_get_page(mapping, start >> PAGE_SHIFT);
				4307	if (clear_page_dirty_for_io(page))
				4308	ret = __extent_writepage(page, &wbc_writepages, &epd);
				4309	else {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4310	btrfs_writepage_endio_finish_ordered(page, start,
				4311	start + PAGE_SIZE - 1, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4312	unlock_page(page);
				4313	}
				4314	put_page(page);
				4315	start += PAGE_SIZE;
				4316	}
				4317
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4318	ASSERT(ret <= 0);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4319	if (ret == 0)
				4320	ret = flush_write_bio(&epd);
				4321	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4322	end_write_bio(&epd, ret);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4323
				4324	wbc_detach_inode(&wbc_writepages);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4325	return ret;
				4326	}
				4327
				4328	int extent_writepages(struct address_space *mapping,
				4329	struct writeback_control *wbc)
				4330	{
				4331	int ret = 0;
				4332	struct extent_page_data epd = {
				4333	.bio = NULL,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4334	.extent_locked = 0,
				4335	.sync_io = wbc->sync_mode == WB_SYNC_ALL,
				4336	};
				4337
				4338	ret = extent_write_cache_pages(mapping, wbc, &epd);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4339	ASSERT(ret <= 0);
				4340	if (ret < 0) {
				4341	end_write_bio(&epd, ret);
				4342	return ret;
				4343	}
				4344	ret = flush_write_bio(&epd);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4345	return ret;
				4346	}
				4347
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4348	void extent_readahead(struct readahead_control *rac)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4349	{
				4350	struct bio *bio = NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4351	unsigned long bio_flags = 0;
				4352	struct page *pagepool[16];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4353	struct extent_map *em_cached = NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4354	u64 prev_em_start = (u64)-1;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4355	int nr;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4356
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4357	while ((nr = readahead_page_batch(rac, pagepool))) {
				4358	u64 contig_start = page_offset(pagepool[0]);
				4359	u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4360
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4361	ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4362
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4363	contiguous_readpages(pagepool, nr, contig_start, contig_end,
				4364	&em_cached, &bio, &bio_flags, &prev_em_start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4365	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4366
				4367	if (em_cached)
				4368	free_extent_map(em_cached);
				4369
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4370	if (bio) {
				4371	if (submit_one_bio(bio, 0, bio_flags))
				4372	return;
				4373	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4374	}
				4375
				4376	/*
				4377	* basic invalidatepage code, this waits on any locked or writeback
				4378	* ranges corresponding to the page, and then deletes any extent state
				4379	* records from the tree
				4380	*/
				4381	int extent_invalidatepage(struct extent_io_tree *tree,
				4382	struct page *page, unsigned long offset)
				4383	{
				4384	struct extent_state *cached_state = NULL;
				4385	u64 start = page_offset(page);
				4386	u64 end = start + PAGE_SIZE - 1;
				4387	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
				4388
				4389	start += ALIGN(offset, blocksize);
				4390	if (start > end)
				4391	return 0;
				4392
				4393	lock_extent_bits(tree, start, end, &cached_state);
				4394	wait_on_page_writeback(page);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4395	clear_extent_bit(tree, start, end, EXTENT_LOCKED \| EXTENT_DELALLOC \|
				4396	EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4397	return 0;
				4398	}
				4399
				4400	/*
				4401	* a helper for releasepage, this tests for areas of the page that
				4402	* are locked or under IO and drops the related state bits if it is safe
				4403	* to drop the page.
				4404	*/
				4405	static int try_release_extent_state(struct extent_io_tree *tree,
				4406	struct page *page, gfp_t mask)
				4407	{
				4408	u64 start = page_offset(page);
				4409	u64 end = start + PAGE_SIZE - 1;
				4410	int ret = 1;
				4411
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4412	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4413	ret = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4414	} else {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4415	/*
				4416	* at this point we can safely clear everything except the
				4417	* locked bit and the nodatasum bit
				4418	*/
				4419	ret = __clear_extent_bit(tree, start, end,
				4420	~(EXTENT_LOCKED \| EXTENT_NODATASUM),
				4421	0, 0, NULL, mask, NULL);
				4422
				4423	/* if clear_extent_bit failed for enomem reasons,
				4424	* we can't allow the release to continue.
				4425	*/
				4426	if (ret < 0)
				4427	ret = 0;
				4428	else
				4429	ret = 1;
				4430	}
				4431	return ret;
				4432	}
				4433
				4434	/*
				4435	* a helper for releasepage. As long as there are no locked extents
				4436	* in the range corresponding to the page, both state records and extent
				4437	* map records are removed
				4438	*/
				4439	int try_release_extent_mapping(struct page *page, gfp_t mask)
				4440	{
				4441	struct extent_map *em;
				4442	u64 start = page_offset(page);
				4443	u64 end = start + PAGE_SIZE - 1;
				4444	struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
				4445	struct extent_io_tree *tree = &btrfs_inode->io_tree;
				4446	struct extent_map_tree *map = &btrfs_inode->extent_tree;
				4447
				4448	if (gfpflags_allow_blocking(mask) &&
				4449	page->mapping->host->i_size > SZ_16M) {
				4450	u64 len;
				4451	while (start <= end) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4452	struct btrfs_fs_info *fs_info;
				4453	u64 cur_gen;
				4454
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4455	len = end - start + 1;
				4456	write_lock(&map->lock);
				4457	em = lookup_extent_mapping(map, start, len);
				4458	if (!em) {
				4459	write_unlock(&map->lock);
				4460	break;
				4461	}
				4462	if (test_bit(EXTENT_FLAG_PINNED, &em->flags) \|\|
				4463	em->start != start) {
				4464	write_unlock(&map->lock);
				4465	free_extent_map(em);
				4466	break;
				4467	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	4468	if (test_range_bit(tree, em->start,
				4469	extent_map_end(em) - 1,
				4470	EXTENT_LOCKED, 0, NULL))
				4471	goto next;
				4472	/*
				4473	* If it's not in the list of modified extents, used
				4474	* by a fast fsync, we can remove it. If it's being
				4475	* logged we can safely remove it since fsync took an
				4476	* extra reference on the em.
				4477	*/
				4478	if (list_empty(&em->list) \|\|
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4479	test_bit(EXTENT_FLAG_LOGGING, &em->flags))
				4480	goto remove_em;
				4481	/*
				4482	* If it's in the list of modified extents, remove it
				4483	* only if its generation is older then the current one,
				4484	* in which case we don't need it for a fast fsync.
				4485	* Otherwise don't remove it, we could be racing with an
				4486	* ongoing fast fsync that could miss the new extent.
				4487	*/
				4488	fs_info = btrfs_inode->root->fs_info;
				4489	spin_lock(&fs_info->trans_lock);
				4490	cur_gen = fs_info->generation;
				4491	spin_unlock(&fs_info->trans_lock);
				4492	if (em->generation >= cur_gen)
				4493	goto next;
				4494	remove_em:
				4495	/*
				4496	* We only remove extent maps that are not in the list of
				4497	* modified extents or that are in the list but with a
				4498	* generation lower then the current generation, so there
				4499	* is no need to set the full fsync flag on the inode (it
				4500	* hurts the fsync performance for workloads with a data
				4501	* size that exceeds or is close to the system's memory).
				4502	*/
				4503	remove_extent_mapping(map, em);
				4504	/* once for the rb tree */
				4505	free_extent_map(em);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	4506	next:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4507	start = extent_map_end(em);
				4508	write_unlock(&map->lock);
				4509
				4510	/* once for us */
				4511	free_extent_map(em);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	4512
				4513	cond_resched(); /* Allow large-extent preemption. */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4514	}
				4515	}
				4516	return try_release_extent_state(tree, page, mask);
				4517	}
				4518
				4519	/*
				4520	* helper function for fiemap, which doesn't want to see any holes.
				4521	* This maps until we find something past 'last'
				4522	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4523	static struct extent_map get_extent_skip_holes(struct btrfs_inode inode,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4524	u64 offset, u64 last)
				4525	{
				4526	u64 sectorsize = btrfs_inode_sectorsize(inode);
				4527	struct extent_map *em;
				4528	u64 len;
				4529
				4530	if (offset >= last)
				4531	return NULL;
				4532
				4533	while (1) {
				4534	len = last - offset;
				4535	if (len == 0)
				4536	break;
				4537	len = ALIGN(len, sectorsize);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4538	em = btrfs_get_extent_fiemap(inode, offset, len);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4539	if (IS_ERR_OR_NULL(em))
				4540	return em;
				4541
				4542	/* if this isn't a hole return it */
				4543	if (em->block_start != EXTENT_MAP_HOLE)
				4544	return em;
				4545
				4546	/* this is a hole, advance to the next extent */
				4547	offset = extent_map_end(em);
				4548	free_extent_map(em);
				4549	if (offset >= last)
				4550	break;
				4551	}
				4552	return NULL;
				4553	}
				4554
				4555	/*
				4556	* To cache previous fiemap extent
				4557	*
				4558	* Will be used for merging fiemap extent
				4559	*/
				4560	struct fiemap_cache {
				4561	u64 offset;
				4562	u64 phys;
				4563	u64 len;
				4564	u32 flags;
				4565	bool cached;
				4566	};
				4567
				4568	/*
				4569	* Helper to submit fiemap extent.
				4570	*
				4571	* Will try to merge current fiemap extent specified by @offset, @phys,
				4572	* @len and @flags with cached one.
				4573	* And only when we fails to merge, cached one will be submitted as
				4574	* fiemap extent.
				4575	*
				4576	* Return value is the same as fiemap_fill_next_extent().
				4577	*/
				4578	static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
				4579	struct fiemap_cache *cache,
				4580	u64 offset, u64 phys, u64 len, u32 flags)
				4581	{
				4582	int ret = 0;
				4583
				4584	if (!cache->cached)
				4585	goto assign;
				4586
				4587	/*
				4588	* Sanity check, extent_fiemap() should have ensured that new
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4589	* fiemap extent won't overlap with cached one.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4590	* Not recoverable.
				4591	*
				4592	* NOTE: Physical address can overlap, due to compression
				4593	*/
				4594	if (cache->offset + cache->len > offset) {
				4595	WARN_ON(1);
				4596	return -EINVAL;
				4597	}
				4598
				4599	/*
				4600	* Only merges fiemap extents if
				4601	* 1) Their logical addresses are continuous
				4602	*
				4603	* 2) Their physical addresses are continuous
				4604	* So truly compressed (physical size smaller than logical size)
				4605	* extents won't get merged with each other
				4606	*
				4607	* 3) Share same flags except FIEMAP_EXTENT_LAST
				4608	* So regular extent won't get merged with prealloc extent
				4609	*/
				4610	if (cache->offset + cache->len == offset &&
				4611	cache->phys + cache->len == phys &&
				4612	(cache->flags & ~FIEMAP_EXTENT_LAST) ==
				4613	(flags & ~FIEMAP_EXTENT_LAST)) {
				4614	cache->len += len;
				4615	cache->flags \|= flags;
				4616	goto try_submit_last;
				4617	}
				4618
				4619	/* Not mergeable, need to submit cached one */
				4620	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				4621	cache->len, cache->flags);
				4622	cache->cached = false;
				4623	if (ret)
				4624	return ret;
				4625	assign:
				4626	cache->cached = true;
				4627	cache->offset = offset;
				4628	cache->phys = phys;
				4629	cache->len = len;
				4630	cache->flags = flags;
				4631	try_submit_last:
				4632	if (cache->flags & FIEMAP_EXTENT_LAST) {
				4633	ret = fiemap_fill_next_extent(fieinfo, cache->offset,
				4634	cache->phys, cache->len, cache->flags);
				4635	cache->cached = false;
				4636	}
				4637	return ret;
				4638	}
				4639
				4640	/*
				4641	* Emit last fiemap cache
				4642	*
				4643	* The last fiemap cache may still be cached in the following case:
				4644	* 0 4k 8k
				4645	* \|<- Fiemap range ->\|
				4646	* \|<------------ First extent ----------->\|
				4647	*
				4648	* In this case, the first extent range will be cached but not emitted.
				4649	* So we must emit it before ending extent_fiemap().
				4650	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4651	static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4652	struct fiemap_cache *cache)
				4653	{
				4654	int ret;
				4655
				4656	if (!cache->cached)
				4657	return 0;
				4658
				4659	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				4660	cache->len, cache->flags);
				4661	cache->cached = false;
				4662	if (ret > 0)
				4663	ret = 0;
				4664	return ret;
				4665	}
				4666
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4667	int extent_fiemap(struct btrfs_inode inode, struct fiemap_extent_info fieinfo,
				4668	u64 start, u64 len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4669	{
				4670	int ret = 0;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	4671	u64 off;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4672	u64 max = start + len;
				4673	u32 flags = 0;
				4674	u32 found_type;
				4675	u64 last;
				4676	u64 last_for_get_extent = 0;
				4677	u64 disko = 0;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4678	u64 isize = i_size_read(&inode->vfs_inode);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4679	struct btrfs_key found_key;
				4680	struct extent_map *em = NULL;
				4681	struct extent_state *cached_state = NULL;
				4682	struct btrfs_path *path;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4683	struct btrfs_root *root = inode->root;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4684	struct fiemap_cache cache = { 0 };
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4685	struct ulist *roots;
				4686	struct ulist *tmp_ulist;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4687	int end = 0;
				4688	u64 em_start = 0;
				4689	u64 em_len = 0;
				4690	u64 em_end = 0;
				4691
				4692	if (len == 0)
				4693	return -EINVAL;
				4694
				4695	path = btrfs_alloc_path();
				4696	if (!path)
				4697	return -ENOMEM;
				4698	path->leave_spinning = 1;
				4699
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4700	roots = ulist_alloc(GFP_KERNEL);
				4701	tmp_ulist = ulist_alloc(GFP_KERNEL);
				4702	if (!roots \|\| !tmp_ulist) {
				4703	ret = -ENOMEM;
				4704	goto out_free_ulist;
				4705	}
				4706
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	4707	/*
				4708	* We can't initialize that to 'start' as this could miss extents due
				4709	* to extent item merging
				4710	*/
				4711	off = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4712	start = round_down(start, btrfs_inode_sectorsize(inode));
				4713	len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
				4714
				4715	/*
				4716	* lookup the last file extent. We're not using i_size here
				4717	* because there might be preallocation past i_size
				4718	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4719	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
				4720	0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4721	if (ret < 0) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4722	goto out_free_ulist;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4723	} else {
				4724	WARN_ON(!ret);
				4725	if (ret == 1)
				4726	ret = 0;
				4727	}
				4728
				4729	path->slots[0]--;
				4730	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
				4731	found_type = found_key.type;
				4732
				4733	/* No extents, but there might be delalloc bits */
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4734	if (found_key.objectid != btrfs_ino(inode) \|\|
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4735	found_type != BTRFS_EXTENT_DATA_KEY) {
				4736	/* have to trust i_size as the end */
				4737	last = (u64)-1;
				4738	last_for_get_extent = isize;
				4739	} else {
				4740	/*
				4741	* remember the start of the last extent. There are a
				4742	* bunch of different factors that go into the length of the
				4743	* extent, so its much less complex to remember where it started
				4744	*/
				4745	last = found_key.offset;
				4746	last_for_get_extent = last + 1;
				4747	}
				4748	btrfs_release_path(path);
				4749
				4750	/*
				4751	* we might have some extents allocated but more delalloc past those
				4752	* extents. so, we trust isize unless the start of the last extent is
				4753	* beyond isize
				4754	*/
				4755	if (last < isize) {
				4756	last = (u64)-1;
				4757	last_for_get_extent = isize;
				4758	}
				4759
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4760	lock_extent_bits(&inode->io_tree, start, start + len - 1,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4761	&cached_state);
				4762
				4763	em = get_extent_skip_holes(inode, start, last_for_get_extent);
				4764	if (!em)
				4765	goto out;
				4766	if (IS_ERR(em)) {
				4767	ret = PTR_ERR(em);
				4768	goto out;
				4769	}
				4770
				4771	while (!end) {
				4772	u64 offset_in_extent = 0;
				4773
				4774	/* break if the extent we found is outside the range */
				4775	if (em->start >= max \|\| extent_map_end(em) < off)
				4776	break;
				4777
				4778	/*
				4779	* get_extent may return an extent that starts before our
				4780	* requested range. We have to make sure the ranges
				4781	* we return to fiemap always move forward and don't
				4782	* overlap, so adjust the offsets here
				4783	*/
				4784	em_start = max(em->start, off);
				4785
				4786	/*
				4787	* record the offset from the start of the extent
				4788	* for adjusting the disk offset below. Only do this if the
				4789	* extent isn't compressed since our in ram offset may be past
				4790	* what we have actually allocated on disk.
				4791	*/
				4792	if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
				4793	offset_in_extent = em_start - em->start;
				4794	em_end = extent_map_end(em);
				4795	em_len = em_end - em_start;
				4796	flags = 0;
				4797	if (em->block_start < EXTENT_MAP_LAST_BYTE)
				4798	disko = em->block_start + offset_in_extent;
				4799	else
				4800	disko = 0;
				4801
				4802	/*
				4803	* bump off for our next call to get_extent
				4804	*/
				4805	off = extent_map_end(em);
				4806	if (off >= max)
				4807	end = 1;
				4808
				4809	if (em->block_start == EXTENT_MAP_LAST_BYTE) {
				4810	end = 1;
				4811	flags \|= FIEMAP_EXTENT_LAST;
				4812	} else if (em->block_start == EXTENT_MAP_INLINE) {
				4813	flags \|= (FIEMAP_EXTENT_DATA_INLINE \|
				4814	FIEMAP_EXTENT_NOT_ALIGNED);
				4815	} else if (em->block_start == EXTENT_MAP_DELALLOC) {
				4816	flags \|= (FIEMAP_EXTENT_DELALLOC \|
				4817	FIEMAP_EXTENT_UNKNOWN);
				4818	} else if (fieinfo->fi_extents_max) {
				4819	u64 bytenr = em->block_start -
				4820	(em->start - em->orig_start);
				4821
				4822	/*
				4823	* As btrfs supports shared space, this information
				4824	* can be exported to userspace tools via
				4825	* flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
				4826	* then we're just getting a count and we can skip the
				4827	* lookup stuff.
				4828	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4829	ret = btrfs_check_shared(root, btrfs_ino(inode),
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4830	bytenr, roots, tmp_ulist);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4831	if (ret < 0)
				4832	goto out_free;
				4833	if (ret)
				4834	flags \|= FIEMAP_EXTENT_SHARED;
				4835	ret = 0;
				4836	}
				4837	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
				4838	flags \|= FIEMAP_EXTENT_ENCODED;
				4839	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
				4840	flags \|= FIEMAP_EXTENT_UNWRITTEN;
				4841
				4842	free_extent_map(em);
				4843	em = NULL;
				4844	if ((em_start >= last) \|\| em_len == (u64)-1 \|\|
				4845	(last == (u64)-1 && isize <= em_end)) {
				4846	flags \|= FIEMAP_EXTENT_LAST;
				4847	end = 1;
				4848	}
				4849
				4850	/* now scan forward to see if this is really the last extent. */
				4851	em = get_extent_skip_holes(inode, off, last_for_get_extent);
				4852	if (IS_ERR(em)) {
				4853	ret = PTR_ERR(em);
				4854	goto out;
				4855	}
				4856	if (!em) {
				4857	flags \|= FIEMAP_EXTENT_LAST;
				4858	end = 1;
				4859	}
				4860	ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
				4861	em_len, flags);
				4862	if (ret) {
				4863	if (ret == 1)
				4864	ret = 0;
				4865	goto out_free;
				4866	}
				4867	}
				4868	out_free:
				4869	if (!ret)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4870	ret = emit_last_fiemap_cache(fieinfo, &cache);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4871	free_extent_map(em);
				4872	out:
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4873	unlock_extent_cached(&inode->io_tree, start, start + len - 1,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4874	&cached_state);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4875
				4876	out_free_ulist:
				4877	btrfs_free_path(path);
				4878	ulist_free(roots);
				4879	ulist_free(tmp_ulist);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4880	return ret;
				4881	}
				4882
				4883	static void __free_extent_buffer(struct extent_buffer *eb)
				4884	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4885	kmem_cache_free(extent_buffer_cache, eb);
				4886	}
				4887
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4888	int extent_buffer_under_io(const struct extent_buffer *eb)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4889	{
				4890	return (atomic_read(&eb->io_pages) \|\|
				4891	test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) \|\|
				4892	test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				4893	}
				4894
				4895	/*
				4896	* Release all pages attached to the extent buffer.
				4897	*/
				4898	static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
				4899	{
				4900	int i;
				4901	int num_pages;
				4902	int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
				4903
				4904	BUG_ON(extent_buffer_under_io(eb));
				4905
				4906	num_pages = num_extent_pages(eb);
				4907	for (i = 0; i < num_pages; i++) {
				4908	struct page *page = eb->pages[i];
				4909
				4910	if (!page)
				4911	continue;
				4912	if (mapped)
				4913	spin_lock(&page->mapping->private_lock);
				4914	/*
				4915	* We do this since we'll remove the pages after we've
				4916	* removed the eb from the radix tree, so we could race
				4917	* and have this page now attached to the new eb. So
				4918	* only clear page_private if it's still connected to
				4919	* this eb.
				4920	*/
				4921	if (PagePrivate(page) &&
				4922	page->private == (unsigned long)eb) {
				4923	BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				4924	BUG_ON(PageDirty(page));
				4925	BUG_ON(PageWriteback(page));
				4926	/*
				4927	* We need to make sure we haven't be attached
				4928	* to a new eb.
				4929	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4930	detach_page_private(page);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4931	}
				4932
				4933	if (mapped)
				4934	spin_unlock(&page->mapping->private_lock);
				4935
				4936	/* One for when we allocated the page */
				4937	put_page(page);
				4938	}
				4939	}
				4940
				4941	/*
				4942	* Helper for releasing the extent buffer.
				4943	*/
				4944	static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
				4945	{
				4946	btrfs_release_extent_buffer_pages(eb);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4947	btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4948	__free_extent_buffer(eb);
				4949	}
				4950
				4951	static struct extent_buffer *
				4952	__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
				4953	unsigned long len)
				4954	{
				4955	struct extent_buffer *eb = NULL;
				4956
				4957	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS\|__GFP_NOFAIL);
				4958	eb->start = start;
				4959	eb->len = len;
				4960	eb->fs_info = fs_info;
				4961	eb->bflags = 0;
				4962	rwlock_init(&eb->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4963	atomic_set(&eb->blocking_readers, 0);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4964	eb->blocking_writers = 0;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4965	eb->lock_recursed = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4966	init_waitqueue_head(&eb->write_lock_wq);
				4967	init_waitqueue_head(&eb->read_lock_wq);
				4968
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4969	btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
				4970	&fs_info->allocated_ebs);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4971
				4972	spin_lock_init(&eb->refs_lock);
				4973	atomic_set(&eb->refs, 1);
				4974	atomic_set(&eb->io_pages, 0);
				4975
				4976	/*
				4977	* Sanity checks, currently the maximum is 64k covered by 16x 4k pages
				4978	*/
				4979	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
				4980	> MAX_INLINE_EXTENT_BUFFER_SIZE);
				4981	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
				4982
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4983	#ifdef CONFIG_BTRFS_DEBUG
				4984	eb->spinning_writers = 0;
				4985	atomic_set(&eb->spinning_readers, 0);
				4986	atomic_set(&eb->read_locks, 0);
				4987	eb->write_locks = 0;
				4988	#endif
				4989
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4990	return eb;
				4991	}
				4992
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	4993	struct extent_buffer btrfs_clone_extent_buffer(const struct extent_buffer src)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4994	{
				4995	int i;
				4996	struct page *p;
				4997	struct extent_buffer *new;
				4998	int num_pages = num_extent_pages(src);
				4999
				5000	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
				5001	if (new == NULL)
				5002	return NULL;
				5003
				5004	for (i = 0; i < num_pages; i++) {
				5005	p = alloc_page(GFP_NOFS);
				5006	if (!p) {
				5007	btrfs_release_extent_buffer(new);
				5008	return NULL;
				5009	}
				5010	attach_extent_buffer_page(new, p);
				5011	WARN_ON(PageDirty(p));
				5012	SetPageUptodate(p);
				5013	new->pages[i] = p;
				5014	copy_page(page_address(p), page_address(src->pages[i]));
				5015	}
				5016
				5017	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
				5018	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
				5019
				5020	return new;
				5021	}
				5022
				5023	struct extent_buffer __alloc_dummy_extent_buffer(struct btrfs_fs_info fs_info,
				5024	u64 start, unsigned long len)
				5025	{
				5026	struct extent_buffer *eb;
				5027	int num_pages;
				5028	int i;
				5029
				5030	eb = __alloc_extent_buffer(fs_info, start, len);
				5031	if (!eb)
				5032	return NULL;
				5033
				5034	num_pages = num_extent_pages(eb);
				5035	for (i = 0; i < num_pages; i++) {
				5036	eb->pages[i] = alloc_page(GFP_NOFS);
				5037	if (!eb->pages[i])
				5038	goto err;
				5039	}
				5040	set_extent_buffer_uptodate(eb);
				5041	btrfs_set_header_nritems(eb, 0);
				5042	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
				5043
				5044	return eb;
				5045	err:
				5046	for (; i > 0; i--)
				5047	__free_page(eb->pages[i - 1]);
				5048	__free_extent_buffer(eb);
				5049	return NULL;
				5050	}
				5051
				5052	struct extent_buffer alloc_dummy_extent_buffer(struct btrfs_fs_info fs_info,
				5053	u64 start)
				5054	{
				5055	return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
				5056	}
				5057
				5058	static void check_buffer_tree_ref(struct extent_buffer *eb)
				5059	{
				5060	int refs;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	5061	/*
				5062	* The TREE_REF bit is first set when the extent_buffer is added
				5063	* to the radix tree. It is also reset, if unset, when a new reference
				5064	* is created by find_extent_buffer.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5065	*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	5066	* It is only cleared in two cases: freeing the last non-tree
				5067	* reference to the extent_buffer when its STALE bit is set or
				5068	* calling releasepage when the tree reference is the only reference.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5069	*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	5070	* In both cases, care is taken to ensure that the extent_buffer's
				5071	* pages are not under io. However, releasepage can be concurrently
				5072	* called with creating new references, which is prone to race
				5073	* conditions between the calls to check_buffer_tree_ref in those
				5074	* codepaths and clearing TREE_REF in try_release_extent_buffer.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5075	*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	5076	* The actual lifetime of the extent_buffer in the radix tree is
				5077	* adequately protected by the refcount, but the TREE_REF bit and
				5078	* its corresponding reference are not. To protect against this
				5079	* class of races, we call check_buffer_tree_ref from the codepaths
				5080	* which trigger io after they set eb->io_pages. Note that once io is
				5081	* initiated, TREE_REF can no longer be cleared, so that is the
				5082	* moment at which any such race is best fixed.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5083	*/
				5084	refs = atomic_read(&eb->refs);
				5085	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				5086	return;
				5087
				5088	spin_lock(&eb->refs_lock);
				5089	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				5090	atomic_inc(&eb->refs);
				5091	spin_unlock(&eb->refs_lock);
				5092	}
				5093
				5094	static void mark_extent_buffer_accessed(struct extent_buffer *eb,
				5095	struct page *accessed)
				5096	{
				5097	int num_pages, i;
				5098
				5099	check_buffer_tree_ref(eb);
				5100
				5101	num_pages = num_extent_pages(eb);
				5102	for (i = 0; i < num_pages; i++) {
				5103	struct page *p = eb->pages[i];
				5104
				5105	if (p != accessed)
				5106	mark_page_accessed(p);
				5107	}
				5108	}
				5109
				5110	struct extent_buffer find_extent_buffer(struct btrfs_fs_info fs_info,
				5111	u64 start)
				5112	{
				5113	struct extent_buffer *eb;
				5114
				5115	rcu_read_lock();
				5116	eb = radix_tree_lookup(&fs_info->buffer_radix,
				5117	start >> PAGE_SHIFT);
				5118	if (eb && atomic_inc_not_zero(&eb->refs)) {
				5119	rcu_read_unlock();
				5120	/*
				5121	* Lock our eb's refs_lock to avoid races with
				5122	* free_extent_buffer. When we get our eb it might be flagged
				5123	* with EXTENT_BUFFER_STALE and another task running
				5124	* free_extent_buffer might have seen that flag set,
				5125	* eb->refs == 2, that the buffer isn't under IO (dirty and
				5126	* writeback flags not set) and it's still in the tree (flag
				5127	* EXTENT_BUFFER_TREE_REF set), therefore being in the process
				5128	* of decrementing the extent buffer's reference count twice.
				5129	* So here we could race and increment the eb's reference count,
				5130	* clear its stale flag, mark it as dirty and drop our reference
				5131	* before the other task finishes executing free_extent_buffer,
				5132	* which would later result in an attempt to free an extent
				5133	* buffer that is dirty.
				5134	*/
				5135	if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
				5136	spin_lock(&eb->refs_lock);
				5137	spin_unlock(&eb->refs_lock);
				5138	}
				5139	mark_extent_buffer_accessed(eb, NULL);
				5140	return eb;
				5141	}
				5142	rcu_read_unlock();
				5143
				5144	return NULL;
				5145	}
				5146
				5147	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				5148	struct extent_buffer alloc_test_extent_buffer(struct btrfs_fs_info fs_info,
				5149	u64 start)
				5150	{
				5151	struct extent_buffer eb, exists = NULL;
				5152	int ret;
				5153
				5154	eb = find_extent_buffer(fs_info, start);
				5155	if (eb)
				5156	return eb;
				5157	eb = alloc_dummy_extent_buffer(fs_info, start);
				5158	if (!eb)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	5159	return ERR_PTR(-ENOMEM);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5160	eb->fs_info = fs_info;
				5161	again:
				5162	ret = radix_tree_preload(GFP_NOFS);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	5163	if (ret) {
				5164	exists = ERR_PTR(ret);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5165	goto free_eb;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	5166	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5167	spin_lock(&fs_info->buffer_lock);
				5168	ret = radix_tree_insert(&fs_info->buffer_radix,
				5169	start >> PAGE_SHIFT, eb);
				5170	spin_unlock(&fs_info->buffer_lock);
				5171	radix_tree_preload_end();
				5172	if (ret == -EEXIST) {
				5173	exists = find_extent_buffer(fs_info, start);
				5174	if (exists)
				5175	goto free_eb;
				5176	else
				5177	goto again;
				5178	}
				5179	check_buffer_tree_ref(eb);
				5180	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
				5181
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5182	return eb;
				5183	free_eb:
				5184	btrfs_release_extent_buffer(eb);
				5185	return exists;
				5186	}
				5187	#endif
				5188
				5189	struct extent_buffer alloc_extent_buffer(struct btrfs_fs_info fs_info,
				5190	u64 start)
				5191	{
				5192	unsigned long len = fs_info->nodesize;
				5193	int num_pages;
				5194	int i;
				5195	unsigned long index = start >> PAGE_SHIFT;
				5196	struct extent_buffer *eb;
				5197	struct extent_buffer *exists = NULL;
				5198	struct page *p;
				5199	struct address_space *mapping = fs_info->btree_inode->i_mapping;
				5200	int uptodate = 1;
				5201	int ret;
				5202
				5203	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
				5204	btrfs_err(fs_info, "bad tree block start %llu", start);
				5205	return ERR_PTR(-EINVAL);
				5206	}
				5207
				5208	eb = find_extent_buffer(fs_info, start);
				5209	if (eb)
				5210	return eb;
				5211
				5212	eb = __alloc_extent_buffer(fs_info, start, len);
				5213	if (!eb)
				5214	return ERR_PTR(-ENOMEM);
				5215
				5216	num_pages = num_extent_pages(eb);
				5217	for (i = 0; i < num_pages; i++, index++) {
				5218	p = find_or_create_page(mapping, index, GFP_NOFS\|__GFP_NOFAIL);
				5219	if (!p) {
				5220	exists = ERR_PTR(-ENOMEM);
				5221	goto free_eb;
				5222	}
				5223
				5224	spin_lock(&mapping->private_lock);
				5225	if (PagePrivate(p)) {
				5226	/*
				5227	* We could have already allocated an eb for this page
				5228	* and attached one so lets see if we can get a ref on
				5229	* the existing eb, and if we can we know it's good and
				5230	* we can just return that one, else we know we can just
				5231	* overwrite page->private.
				5232	*/
				5233	exists = (struct extent_buffer *)p->private;
				5234	if (atomic_inc_not_zero(&exists->refs)) {
				5235	spin_unlock(&mapping->private_lock);
				5236	unlock_page(p);
				5237	put_page(p);
				5238	mark_extent_buffer_accessed(exists, p);
				5239	goto free_eb;
				5240	}
				5241	exists = NULL;
				5242
				5243	/*
				5244	* Do this so attach doesn't complain and we need to
				5245	* drop the ref the old guy had.
				5246	*/
				5247	ClearPagePrivate(p);
				5248	WARN_ON(PageDirty(p));
				5249	put_page(p);
				5250	}
				5251	attach_extent_buffer_page(eb, p);
				5252	spin_unlock(&mapping->private_lock);
				5253	WARN_ON(PageDirty(p));
				5254	eb->pages[i] = p;
				5255	if (!PageUptodate(p))
				5256	uptodate = 0;
				5257
				5258	/*
				5259	* We can't unlock the pages just yet since the extent buffer
				5260	* hasn't been properly inserted in the radix tree, this
				5261	* opens a race with btree_releasepage which can free a page
				5262	* while we are still filling in all pages for the buffer and
				5263	* we could crash.
				5264	*/
				5265	}
				5266	if (uptodate)
				5267	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				5268	again:
				5269	ret = radix_tree_preload(GFP_NOFS);
				5270	if (ret) {
				5271	exists = ERR_PTR(ret);
				5272	goto free_eb;
				5273	}
				5274
				5275	spin_lock(&fs_info->buffer_lock);
				5276	ret = radix_tree_insert(&fs_info->buffer_radix,
				5277	start >> PAGE_SHIFT, eb);
				5278	spin_unlock(&fs_info->buffer_lock);
				5279	radix_tree_preload_end();
				5280	if (ret == -EEXIST) {
				5281	exists = find_extent_buffer(fs_info, start);
				5282	if (exists)
				5283	goto free_eb;
				5284	else
				5285	goto again;
				5286	}
				5287	/* add one reference for the tree */
				5288	check_buffer_tree_ref(eb);
				5289	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
				5290
				5291	/*
				5292	* Now it's safe to unlock the pages because any calls to
				5293	* btree_releasepage will correctly detect that a page belongs to a
				5294	* live buffer and won't free them prematurely.
				5295	*/
				5296	for (i = 0; i < num_pages; i++)
				5297	unlock_page(eb->pages[i]);
				5298	return eb;
				5299
				5300	free_eb:
				5301	WARN_ON(!atomic_dec_and_test(&eb->refs));
				5302	for (i = 0; i < num_pages; i++) {
				5303	if (eb->pages[i])
				5304	unlock_page(eb->pages[i]);
				5305	}
				5306
				5307	btrfs_release_extent_buffer(eb);
				5308	return exists;
				5309	}
				5310
				5311	static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
				5312	{
				5313	struct extent_buffer *eb =
				5314	container_of(head, struct extent_buffer, rcu_head);
				5315
				5316	__free_extent_buffer(eb);
				5317	}
				5318
				5319	static int release_extent_buffer(struct extent_buffer *eb)
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5320	__releases(&eb->refs_lock)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5321	{
				5322	lockdep_assert_held(&eb->refs_lock);
				5323
				5324	WARN_ON(atomic_read(&eb->refs) == 0);
				5325	if (atomic_dec_and_test(&eb->refs)) {
				5326	if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
				5327	struct btrfs_fs_info *fs_info = eb->fs_info;
				5328
				5329	spin_unlock(&eb->refs_lock);
				5330
				5331	spin_lock(&fs_info->buffer_lock);
				5332	radix_tree_delete(&fs_info->buffer_radix,
				5333	eb->start >> PAGE_SHIFT);
				5334	spin_unlock(&fs_info->buffer_lock);
				5335	} else {
				5336	spin_unlock(&eb->refs_lock);
				5337	}
				5338
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5339	btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5340	/* Should be safe to release our pages at this point */
				5341	btrfs_release_extent_buffer_pages(eb);
				5342	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				5343	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
				5344	__free_extent_buffer(eb);
				5345	return 1;
				5346	}
				5347	#endif
				5348	call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
				5349	return 1;
				5350	}
				5351	spin_unlock(&eb->refs_lock);
				5352
				5353	return 0;
				5354	}
				5355
				5356	void free_extent_buffer(struct extent_buffer *eb)
				5357	{
				5358	int refs;
				5359	int old;
				5360	if (!eb)
				5361	return;
				5362
				5363	while (1) {
				5364	refs = atomic_read(&eb->refs);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5365	if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
				5366	\|\| (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
				5367	refs == 1))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5368	break;
				5369	old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
				5370	if (old == refs)
				5371	return;
				5372	}
				5373
				5374	spin_lock(&eb->refs_lock);
				5375	if (atomic_read(&eb->refs) == 2 &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5376	test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
				5377	!extent_buffer_under_io(eb) &&
				5378	test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				5379	atomic_dec(&eb->refs);
				5380
				5381	/*
				5382	* I know this is terrible, but it's temporary until we stop tracking
				5383	* the uptodate bits and such for the extent buffers.
				5384	*/
				5385	release_extent_buffer(eb);
				5386	}
				5387
				5388	void free_extent_buffer_stale(struct extent_buffer *eb)
				5389	{
				5390	if (!eb)
				5391	return;
				5392
				5393	spin_lock(&eb->refs_lock);
				5394	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
				5395
				5396	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
				5397	test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				5398	atomic_dec(&eb->refs);
				5399	release_extent_buffer(eb);
				5400	}
				5401
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5402	void clear_extent_buffer_dirty(const struct extent_buffer *eb)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5403	{
				5404	int i;
				5405	int num_pages;
				5406	struct page *page;
				5407
				5408	num_pages = num_extent_pages(eb);
				5409
				5410	for (i = 0; i < num_pages; i++) {
				5411	page = eb->pages[i];
				5412	if (!PageDirty(page))
				5413	continue;
				5414
				5415	lock_page(page);
				5416	WARN_ON(!PagePrivate(page));
				5417
				5418	clear_page_dirty_for_io(page);
				5419	xa_lock_irq(&page->mapping->i_pages);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5420	if (!PageDirty(page))
				5421	__xa_clear_mark(&page->mapping->i_pages,
				5422	page_index(page), PAGECACHE_TAG_DIRTY);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5423	xa_unlock_irq(&page->mapping->i_pages);
				5424	ClearPageError(page);
				5425	unlock_page(page);
				5426	}
				5427	WARN_ON(atomic_read(&eb->refs) == 0);
				5428	}
				5429
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5430	bool set_extent_buffer_dirty(struct extent_buffer *eb)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5431	{
				5432	int i;
				5433	int num_pages;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5434	bool was_dirty;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5435
				5436	check_buffer_tree_ref(eb);
				5437
				5438	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
				5439
				5440	num_pages = num_extent_pages(eb);
				5441	WARN_ON(atomic_read(&eb->refs) == 0);
				5442	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
				5443
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5444	if (!was_dirty)
				5445	for (i = 0; i < num_pages; i++)
				5446	set_page_dirty(eb->pages[i]);
				5447
				5448	#ifdef CONFIG_BTRFS_DEBUG
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5449	for (i = 0; i < num_pages; i++)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5450	ASSERT(PageDirty(eb->pages[i]));
				5451	#endif
				5452
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5453	return was_dirty;
				5454	}
				5455
				5456	void clear_extent_buffer_uptodate(struct extent_buffer *eb)
				5457	{
				5458	int i;
				5459	struct page *page;
				5460	int num_pages;
				5461
				5462	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				5463	num_pages = num_extent_pages(eb);
				5464	for (i = 0; i < num_pages; i++) {
				5465	page = eb->pages[i];
				5466	if (page)
				5467	ClearPageUptodate(page);
				5468	}
				5469	}
				5470
				5471	void set_extent_buffer_uptodate(struct extent_buffer *eb)
				5472	{
				5473	int i;
				5474	struct page *page;
				5475	int num_pages;
				5476
				5477	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				5478	num_pages = num_extent_pages(eb);
				5479	for (i = 0; i < num_pages; i++) {
				5480	page = eb->pages[i];
				5481	SetPageUptodate(page);
				5482	}
				5483	}
				5484
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5485	int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5486	{
				5487	int i;
				5488	struct page *page;
				5489	int err;
				5490	int ret = 0;
				5491	int locked_pages = 0;
				5492	int all_uptodate = 1;
				5493	int num_pages;
				5494	unsigned long num_reads = 0;
				5495	struct bio *bio = NULL;
				5496	unsigned long bio_flags = 0;
				5497
				5498	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
				5499	return 0;
				5500
				5501	num_pages = num_extent_pages(eb);
				5502	for (i = 0; i < num_pages; i++) {
				5503	page = eb->pages[i];
				5504	if (wait == WAIT_NONE) {
				5505	if (!trylock_page(page))
				5506	goto unlock_exit;
				5507	} else {
				5508	lock_page(page);
				5509	}
				5510	locked_pages++;
				5511	}
				5512	/*
				5513	* We need to firstly lock all pages to make sure that
				5514	* the uptodate bit of our pages won't be affected by
				5515	* clear_extent_buffer_uptodate().
				5516	*/
				5517	for (i = 0; i < num_pages; i++) {
				5518	page = eb->pages[i];
				5519	if (!PageUptodate(page)) {
				5520	num_reads++;
				5521	all_uptodate = 0;
				5522	}
				5523	}
				5524
				5525	if (all_uptodate) {
				5526	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				5527	goto unlock_exit;
				5528	}
				5529
				5530	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
				5531	eb->read_mirror = 0;
				5532	atomic_set(&eb->io_pages, num_reads);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	5533	/*
				5534	* It is possible for releasepage to clear the TREE_REF bit before we
				5535	* set io_pages. See check_buffer_tree_ref for a more detailed comment.
				5536	*/
				5537	check_buffer_tree_ref(eb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5538	for (i = 0; i < num_pages; i++) {
				5539	page = eb->pages[i];
				5540
				5541	if (!PageUptodate(page)) {
				5542	if (ret) {
				5543	atomic_dec(&eb->io_pages);
				5544	unlock_page(page);
				5545	continue;
				5546	}
				5547
				5548	ClearPageError(page);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5549	err = submit_extent_page(REQ_OP_READ \| REQ_META, NULL,
				5550	page, page_offset(page), PAGE_SIZE, 0,
				5551	&bio, end_bio_extent_readpage,
				5552	mirror_num, 0, 0, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5553	if (err) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5554	/*
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5555	* We failed to submit the bio so it's the
				5556	* caller's responsibility to perform cleanup
				5557	* i.e unlock page/set error bit.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5558	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5559	ret = err;
				5560	SetPageError(page);
				5561	unlock_page(page);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5562	atomic_dec(&eb->io_pages);
				5563	}
				5564	} else {
				5565	unlock_page(page);
				5566	}
				5567	}
				5568
				5569	if (bio) {
				5570	err = submit_one_bio(bio, mirror_num, bio_flags);
				5571	if (err)
				5572	return err;
				5573	}
				5574
				5575	if (ret \|\| wait != WAIT_COMPLETE)
				5576	return ret;
				5577
				5578	for (i = 0; i < num_pages; i++) {
				5579	page = eb->pages[i];
				5580	wait_on_page_locked(page);
				5581	if (!PageUptodate(page))
				5582	ret = -EIO;
				5583	}
				5584
				5585	return ret;
				5586
				5587	unlock_exit:
				5588	while (locked_pages > 0) {
				5589	locked_pages--;
				5590	page = eb->pages[locked_pages];
				5591	unlock_page(page);
				5592	}
				5593	return ret;
				5594	}
				5595
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5596	static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
				5597	unsigned long len)
				5598	{
				5599	btrfs_warn(eb->fs_info,
				5600	"access to eb bytenr %llu len %lu out of range start %lu len %lu",
				5601	eb->start, eb->len, start, len);
				5602	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
				5603
				5604	return true;
				5605	}
				5606
				5607	/*
				5608	* Check if the [start, start + len) range is valid before reading/writing
				5609	* the eb.
				5610	* NOTE: @start and @len are offset inside the eb, not logical address.
				5611	*
				5612	* Caller should not touch the dst/src memory if this function returns error.
				5613	*/
				5614	static inline int check_eb_range(const struct extent_buffer *eb,
				5615	unsigned long start, unsigned long len)
				5616	{
				5617	unsigned long offset;
				5618
				5619	/* start, start + len should not go beyond eb->len nor overflow */
				5620	if (unlikely(check_add_overflow(start, len, &offset) \|\| offset > eb->len))
				5621	return report_eb_range(eb, start, len);
				5622
				5623	return false;
				5624	}
				5625
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5626	void read_extent_buffer(const struct extent_buffer eb, void dstv,
				5627	unsigned long start, unsigned long len)
				5628	{
				5629	size_t cur;
				5630	size_t offset;
				5631	struct page *page;
				5632	char *kaddr;
				5633	char dst = (char )dstv;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5634	unsigned long i = start >> PAGE_SHIFT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5635
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5636	if (check_eb_range(eb, start, len))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5637	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5638
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5639	offset = offset_in_page(start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5640
				5641	while (len > 0) {
				5642	page = eb->pages[i];
				5643
				5644	cur = min(len, (PAGE_SIZE - offset));
				5645	kaddr = page_address(page);
				5646	memcpy(dst, kaddr + offset, cur);
				5647
				5648	dst += cur;
				5649	len -= cur;
				5650	offset = 0;
				5651	i++;
				5652	}
				5653	}
				5654
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	5655	int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
				5656	void __user *dstv,
				5657	unsigned long start, unsigned long len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5658	{
				5659	size_t cur;
				5660	size_t offset;
				5661	struct page *page;
				5662	char *kaddr;
				5663	char __user dst = (char __user )dstv;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5664	unsigned long i = start >> PAGE_SHIFT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5665	int ret = 0;
				5666
				5667	WARN_ON(start > eb->len);
				5668	WARN_ON(start + len > eb->start + eb->len);
				5669
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5670	offset = offset_in_page(start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5671
				5672	while (len > 0) {
				5673	page = eb->pages[i];
				5674
				5675	cur = min(len, (PAGE_SIZE - offset));
				5676	kaddr = page_address(page);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5677	if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5678	ret = -EFAULT;
				5679	break;
				5680	}
				5681
				5682	dst += cur;
				5683	len -= cur;
				5684	offset = 0;
				5685	i++;
				5686	}
				5687
				5688	return ret;
				5689	}
				5690
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5691	int memcmp_extent_buffer(const struct extent_buffer eb, const void ptrv,
				5692	unsigned long start, unsigned long len)
				5693	{
				5694	size_t cur;
				5695	size_t offset;
				5696	struct page *page;
				5697	char *kaddr;
				5698	char ptr = (char )ptrv;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5699	unsigned long i = start >> PAGE_SHIFT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5700	int ret = 0;
				5701
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5702	if (check_eb_range(eb, start, len))
				5703	return -EINVAL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5704
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5705	offset = offset_in_page(start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5706
				5707	while (len > 0) {
				5708	page = eb->pages[i];
				5709
				5710	cur = min(len, (PAGE_SIZE - offset));
				5711
				5712	kaddr = page_address(page);
				5713	ret = memcmp(ptr, kaddr + offset, cur);
				5714	if (ret)
				5715	break;
				5716
				5717	ptr += cur;
				5718	len -= cur;
				5719	offset = 0;
				5720	i++;
				5721	}
				5722	return ret;
				5723	}
				5724
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5725	void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5726	const void *srcv)
				5727	{
				5728	char *kaddr;
				5729
				5730	WARN_ON(!PageUptodate(eb->pages[0]));
				5731	kaddr = page_address(eb->pages[0]);
				5732	memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
				5733	BTRFS_FSID_SIZE);
				5734	}
				5735
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5736	void write_extent_buffer_fsid(const struct extent_buffer eb, const void srcv)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5737	{
				5738	char *kaddr;
				5739
				5740	WARN_ON(!PageUptodate(eb->pages[0]));
				5741	kaddr = page_address(eb->pages[0]);
				5742	memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
				5743	BTRFS_FSID_SIZE);
				5744	}
				5745
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5746	void write_extent_buffer(const struct extent_buffer eb, const void srcv,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5747	unsigned long start, unsigned long len)
				5748	{
				5749	size_t cur;
				5750	size_t offset;
				5751	struct page *page;
				5752	char *kaddr;
				5753	char src = (char )srcv;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5754	unsigned long i = start >> PAGE_SHIFT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5755
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5756	if (check_eb_range(eb, start, len))
				5757	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5758
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5759	offset = offset_in_page(start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5760
				5761	while (len > 0) {
				5762	page = eb->pages[i];
				5763	WARN_ON(!PageUptodate(page));
				5764
				5765	cur = min(len, PAGE_SIZE - offset);
				5766	kaddr = page_address(page);
				5767	memcpy(kaddr + offset, src, cur);
				5768
				5769	src += cur;
				5770	len -= cur;
				5771	offset = 0;
				5772	i++;
				5773	}
				5774	}
				5775
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5776	void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5777	unsigned long len)
				5778	{
				5779	size_t cur;
				5780	size_t offset;
				5781	struct page *page;
				5782	char *kaddr;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5783	unsigned long i = start >> PAGE_SHIFT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5784
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5785	if (check_eb_range(eb, start, len))
				5786	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5787
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5788	offset = offset_in_page(start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5789
				5790	while (len > 0) {
				5791	page = eb->pages[i];
				5792	WARN_ON(!PageUptodate(page));
				5793
				5794	cur = min(len, PAGE_SIZE - offset);
				5795	kaddr = page_address(page);
				5796	memset(kaddr + offset, 0, cur);
				5797
				5798	len -= cur;
				5799	offset = 0;
				5800	i++;
				5801	}
				5802	}
				5803
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5804	void copy_extent_buffer_full(const struct extent_buffer *dst,
				5805	const struct extent_buffer *src)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5806	{
				5807	int i;
				5808	int num_pages;
				5809
				5810	ASSERT(dst->len == src->len);
				5811
				5812	num_pages = num_extent_pages(dst);
				5813	for (i = 0; i < num_pages; i++)
				5814	copy_page(page_address(dst->pages[i]),
				5815	page_address(src->pages[i]));
				5816	}
				5817
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5818	void copy_extent_buffer(const struct extent_buffer *dst,
				5819	const struct extent_buffer *src,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5820	unsigned long dst_offset, unsigned long src_offset,
				5821	unsigned long len)
				5822	{
				5823	u64 dst_len = dst->len;
				5824	size_t cur;
				5825	size_t offset;
				5826	struct page *page;
				5827	char *kaddr;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5828	unsigned long i = dst_offset >> PAGE_SHIFT;
				5829
				5830	if (check_eb_range(dst, dst_offset, len) \|\|
				5831	check_eb_range(src, src_offset, len))
				5832	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5833
				5834	WARN_ON(src->len != dst_len);
				5835
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5836	offset = offset_in_page(dst_offset);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5837
				5838	while (len > 0) {
				5839	page = dst->pages[i];
				5840	WARN_ON(!PageUptodate(page));
				5841
				5842	cur = min(len, (unsigned long)(PAGE_SIZE - offset));
				5843
				5844	kaddr = page_address(page);
				5845	read_extent_buffer(src, kaddr + offset, src_offset, cur);
				5846
				5847	src_offset += cur;
				5848	len -= cur;
				5849	offset = 0;
				5850	i++;
				5851	}
				5852	}
				5853
				5854	/*
				5855	* eb_bitmap_offset() - calculate the page and offset of the byte containing the
				5856	* given bit number
				5857	* @eb: the extent buffer
				5858	* @start: offset of the bitmap item in the extent buffer
				5859	* @nr: bit number
				5860	* @page_index: return index of the page in the extent buffer that contains the
				5861	* given bit number
				5862	* @page_offset: return offset into the page given by page_index
				5863	*
				5864	* This helper hides the ugliness of finding the byte in an extent buffer which
				5865	* contains a given bit.
				5866	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5867	static inline void eb_bitmap_offset(const struct extent_buffer *eb,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5868	unsigned long start, unsigned long nr,
				5869	unsigned long *page_index,
				5870	size_t *page_offset)
				5871	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5872	size_t byte_offset = BIT_BYTE(nr);
				5873	size_t offset;
				5874
				5875	/*
				5876	* The byte we want is the offset of the extent buffer + the offset of
				5877	* the bitmap item in the extent buffer + the offset of the byte in the
				5878	* bitmap item.
				5879	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5880	offset = start + byte_offset;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5881
				5882	*page_index = offset >> PAGE_SHIFT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5883	*page_offset = offset_in_page(offset);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5884	}
				5885
				5886	/**
				5887	* extent_buffer_test_bit - determine whether a bit in a bitmap item is set
				5888	* @eb: the extent buffer
				5889	* @start: offset of the bitmap item in the extent buffer
				5890	* @nr: bit number to test
				5891	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5892	int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5893	unsigned long nr)
				5894	{
				5895	u8 *kaddr;
				5896	struct page *page;
				5897	unsigned long i;
				5898	size_t offset;
				5899
				5900	eb_bitmap_offset(eb, start, nr, &i, &offset);
				5901	page = eb->pages[i];
				5902	WARN_ON(!PageUptodate(page));
				5903	kaddr = page_address(page);
				5904	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
				5905	}
				5906
				5907	/**
				5908	* extent_buffer_bitmap_set - set an area of a bitmap
				5909	* @eb: the extent buffer
				5910	* @start: offset of the bitmap item in the extent buffer
				5911	* @pos: bit number of the first bit
				5912	* @len: number of bits to set
				5913	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5914	void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5915	unsigned long pos, unsigned long len)
				5916	{
				5917	u8 *kaddr;
				5918	struct page *page;
				5919	unsigned long i;
				5920	size_t offset;
				5921	const unsigned int size = pos + len;
				5922	int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
				5923	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
				5924
				5925	eb_bitmap_offset(eb, start, pos, &i, &offset);
				5926	page = eb->pages[i];
				5927	WARN_ON(!PageUptodate(page));
				5928	kaddr = page_address(page);
				5929
				5930	while (len >= bits_to_set) {
				5931	kaddr[offset] \|= mask_to_set;
				5932	len -= bits_to_set;
				5933	bits_to_set = BITS_PER_BYTE;
				5934	mask_to_set = ~0;
				5935	if (++offset >= PAGE_SIZE && len > 0) {
				5936	offset = 0;
				5937	page = eb->pages[++i];
				5938	WARN_ON(!PageUptodate(page));
				5939	kaddr = page_address(page);
				5940	}
				5941	}
				5942	if (len) {
				5943	mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
				5944	kaddr[offset] \|= mask_to_set;
				5945	}
				5946	}
				5947
				5948
				5949	/**
				5950	* extent_buffer_bitmap_clear - clear an area of a bitmap
				5951	* @eb: the extent buffer
				5952	* @start: offset of the bitmap item in the extent buffer
				5953	* @pos: bit number of the first bit
				5954	* @len: number of bits to clear
				5955	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	5956	void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
				5957	unsigned long start, unsigned long pos,
				5958	unsigned long len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5959	{
				5960	u8 *kaddr;
				5961	struct page *page;
				5962	unsigned long i;
				5963	size_t offset;
				5964	const unsigned int size = pos + len;
				5965	int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
				5966	u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
				5967
				5968	eb_bitmap_offset(eb, start, pos, &i, &offset);
				5969	page = eb->pages[i];
				5970	WARN_ON(!PageUptodate(page));
				5971	kaddr = page_address(page);
				5972
				5973	while (len >= bits_to_clear) {
				5974	kaddr[offset] &= ~mask_to_clear;
				5975	len -= bits_to_clear;
				5976	bits_to_clear = BITS_PER_BYTE;
				5977	mask_to_clear = ~0;
				5978	if (++offset >= PAGE_SIZE && len > 0) {
				5979	offset = 0;
				5980	page = eb->pages[++i];
				5981	WARN_ON(!PageUptodate(page));
				5982	kaddr = page_address(page);
				5983	}
				5984	}
				5985	if (len) {
				5986	mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
				5987	kaddr[offset] &= ~mask_to_clear;
				5988	}
				5989	}
				5990
				5991	static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
				5992	{
				5993	unsigned long distance = (src > dst) ? src - dst : dst - src;
				5994	return distance < len;
				5995	}
				5996
				5997	static void copy_pages(struct page dst_page, struct page src_page,
				5998	unsigned long dst_off, unsigned long src_off,
				5999	unsigned long len)
				6000	{
				6001	char *dst_kaddr = page_address(dst_page);
				6002	char *src_kaddr;
				6003	int must_memmove = 0;
				6004
				6005	if (dst_page != src_page) {
				6006	src_kaddr = page_address(src_page);
				6007	} else {
				6008	src_kaddr = dst_kaddr;
				6009	if (areas_overlap(src_off, dst_off, len))
				6010	must_memmove = 1;
				6011	}
				6012
				6013	if (must_memmove)
				6014	memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
				6015	else
				6016	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
				6017	}
				6018
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	6019	void memcpy_extent_buffer(const struct extent_buffer *dst,
				6020	unsigned long dst_offset, unsigned long src_offset,
				6021	unsigned long len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6022	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6023	size_t cur;
				6024	size_t dst_off_in_page;
				6025	size_t src_off_in_page;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6026	unsigned long dst_i;
				6027	unsigned long src_i;
				6028
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	6029	if (check_eb_range(dst, dst_offset, len) \|\|
				6030	check_eb_range(dst, src_offset, len))
				6031	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6032
				6033	while (len > 0) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	6034	dst_off_in_page = offset_in_page(dst_offset);
				6035	src_off_in_page = offset_in_page(src_offset);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6036
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	6037	dst_i = dst_offset >> PAGE_SHIFT;
				6038	src_i = src_offset >> PAGE_SHIFT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6039
				6040	cur = min(len, (unsigned long)(PAGE_SIZE -
				6041	src_off_in_page));
				6042	cur = min_t(unsigned long, cur,
				6043	(unsigned long)(PAGE_SIZE - dst_off_in_page));
				6044
				6045	copy_pages(dst->pages[dst_i], dst->pages[src_i],
				6046	dst_off_in_page, src_off_in_page, cur);
				6047
				6048	src_offset += cur;
				6049	dst_offset += cur;
				6050	len -= cur;
				6051	}
				6052	}
				6053
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	6054	void memmove_extent_buffer(const struct extent_buffer *dst,
				6055	unsigned long dst_offset, unsigned long src_offset,
				6056	unsigned long len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6057	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6058	size_t cur;
				6059	size_t dst_off_in_page;
				6060	size_t src_off_in_page;
				6061	unsigned long dst_end = dst_offset + len - 1;
				6062	unsigned long src_end = src_offset + len - 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6063	unsigned long dst_i;
				6064	unsigned long src_i;
				6065
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	6066	if (check_eb_range(dst, dst_offset, len) \|\|
				6067	check_eb_range(dst, src_offset, len))
				6068	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6069	if (dst_offset < src_offset) {
				6070	memcpy_extent_buffer(dst, dst_offset, src_offset, len);
				6071	return;
				6072	}
				6073	while (len > 0) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	6074	dst_i = dst_end >> PAGE_SHIFT;
				6075	src_i = src_end >> PAGE_SHIFT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6076
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	6077	dst_off_in_page = offset_in_page(dst_end);
				6078	src_off_in_page = offset_in_page(src_end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6079
				6080	cur = min_t(unsigned long, len, src_off_in_page + 1);
				6081	cur = min(cur, dst_off_in_page + 1);
				6082	copy_pages(dst->pages[dst_i], dst->pages[src_i],
				6083	dst_off_in_page - cur + 1,
				6084	src_off_in_page - cur + 1, cur);
				6085
				6086	dst_end -= cur;
				6087	src_end -= cur;
				6088	len -= cur;
				6089	}
				6090	}
				6091
				6092	int try_release_extent_buffer(struct page *page)
				6093	{
				6094	struct extent_buffer *eb;
				6095
				6096	/*
				6097	* We need to make sure nobody is attaching this page to an eb right
				6098	* now.
				6099	*/
				6100	spin_lock(&page->mapping->private_lock);
				6101	if (!PagePrivate(page)) {
				6102	spin_unlock(&page->mapping->private_lock);
				6103	return 1;
				6104	}
				6105
				6106	eb = (struct extent_buffer *)page->private;
				6107	BUG_ON(!eb);
				6108
				6109	/*
				6110	* This is a little awful but should be ok, we need to make sure that
				6111	* the eb doesn't disappear out from under us while we're looking at
				6112	* this page.
				6113	*/
				6114	spin_lock(&eb->refs_lock);
				6115	if (atomic_read(&eb->refs) != 1 \|\| extent_buffer_under_io(eb)) {
				6116	spin_unlock(&eb->refs_lock);
				6117	spin_unlock(&page->mapping->private_lock);
				6118	return 0;
				6119	}
				6120	spin_unlock(&page->mapping->private_lock);
				6121
				6122	/*
				6123	* If tree ref isn't set then we know the ref on this eb is a real ref,
				6124	* so just return, this page will likely be freed soon anyway.
				6125	*/
				6126	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
				6127	spin_unlock(&eb->refs_lock);
				6128	return 0;
				6129	}
				6130
				6131	return release_extent_buffer(eb);
				6132	}