Blame - fs/btrfs/transaction.c - hafnium/third_party/linux

blob: e6cb95b81787f1280a11885d432481fa2963fc94 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2007 Oracle. All rights reserved.
				4	*/
				5
				6	#include <linux/fs.h>
				7	#include <linux/slab.h>
				8	#include <linux/sched.h>
				9	#include <linux/writeback.h>
				10	#include <linux/pagemap.h>
				11	#include <linux/blkdev.h>
				12	#include <linux/uuid.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	13	#include "misc.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	14	#include "ctree.h"
				15	#include "disk-io.h"
				16	#include "transaction.h"
				17	#include "locking.h"
				18	#include "tree-log.h"
				19	#include "inode-map.h"
				20	#include "volumes.h"
				21	#include "dev-replace.h"
				22	#include "qgroup.h"
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	23	#include "block-group.h"
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	24	#include "space-info.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	25
				26	#define BTRFS_ROOT_TRANS_TAG 0
				27
				28	static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
				29	[TRANS_STATE_RUNNING] = 0U,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	30	[TRANS_STATE_COMMIT_START] = (__TRANS_START \| __TRANS_ATTACH),
				31	[TRANS_STATE_COMMIT_DOING] = (__TRANS_START \|
				32	__TRANS_ATTACH \|
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	33	__TRANS_JOIN \|
				34	__TRANS_JOIN_NOSTART),
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	35	[TRANS_STATE_UNBLOCKED] = (__TRANS_START \|
				36	__TRANS_ATTACH \|
				37	__TRANS_JOIN \|
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	38	__TRANS_JOIN_NOLOCK \|
				39	__TRANS_JOIN_NOSTART),
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	40	[TRANS_STATE_COMPLETED] = (__TRANS_START \|
				41	__TRANS_ATTACH \|
				42	__TRANS_JOIN \|
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	43	__TRANS_JOIN_NOLOCK \|
				44	__TRANS_JOIN_NOSTART),
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	45	};
				46
				47	void btrfs_put_transaction(struct btrfs_transaction *transaction)
				48	{
				49	WARN_ON(refcount_read(&transaction->use_count) == 0);
				50	if (refcount_dec_and_test(&transaction->use_count)) {
				51	BUG_ON(!list_empty(&transaction->list));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	52	WARN_ON(!RB_EMPTY_ROOT(
				53	&transaction->delayed_refs.href_root.rb_root));
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	54	WARN_ON(!RB_EMPTY_ROOT(
				55	&transaction->delayed_refs.dirty_extent_root));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	56	if (transaction->delayed_refs.pending_csums)
				57	btrfs_err(transaction->fs_info,
				58	"pending csums is %llu",
				59	transaction->delayed_refs.pending_csums);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	60	/*
				61	* If any block groups are found in ->deleted_bgs then it's
				62	* because the transaction was aborted and a commit did not
				63	* happen (things failed before writing the new superblock
				64	* and calling btrfs_finish_extent_commit()), so we can not
				65	* discard the physical locations of the block groups.
				66	*/
				67	while (!list_empty(&transaction->deleted_bgs)) {
				68	struct btrfs_block_group_cache *cache;
				69
				70	cache = list_first_entry(&transaction->deleted_bgs,
				71	struct btrfs_block_group_cache,
				72	bg_list);
				73	list_del_init(&cache->bg_list);
				74	btrfs_put_block_group_trimming(cache);
				75	btrfs_put_block_group(cache);
				76	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	77	WARN_ON(!list_empty(&transaction->dev_update_list));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	78	kfree(transaction);
				79	}
				80	}
				81
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	82	static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	83	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	84	struct btrfs_transaction *cur_trans = trans->transaction;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	85	struct btrfs_fs_info *fs_info = trans->fs_info;
				86	struct btrfs_root root, tmp;
				87
				88	down_write(&fs_info->commit_root_sem);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	89	list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	90	dirty_list) {
				91	list_del_init(&root->dirty_list);
				92	free_extent_buffer(root->commit_root);
				93	root->commit_root = btrfs_root_node(root);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	94	if (is_fstree(root->root_key.objectid))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	95	btrfs_unpin_free_ino(root);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	96	extent_io_tree_release(&root->dirty_log_pages);
				97	btrfs_qgroup_clean_swapped_blocks(root);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	98	}
				99
				100	/* We can free old roots now. */
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	101	spin_lock(&cur_trans->dropped_roots_lock);
				102	while (!list_empty(&cur_trans->dropped_roots)) {
				103	root = list_first_entry(&cur_trans->dropped_roots,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	104	struct btrfs_root, root_list);
				105	list_del_init(&root->root_list);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	106	spin_unlock(&cur_trans->dropped_roots_lock);
				107	btrfs_free_log(trans, root);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	108	btrfs_drop_and_free_fs_root(fs_info, root);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	109	spin_lock(&cur_trans->dropped_roots_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	110	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	111	spin_unlock(&cur_trans->dropped_roots_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	112	up_write(&fs_info->commit_root_sem);
				113	}
				114
				115	static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
				116	unsigned int type)
				117	{
				118	if (type & TRANS_EXTWRITERS)
				119	atomic_inc(&trans->num_extwriters);
				120	}
				121
				122	static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
				123	unsigned int type)
				124	{
				125	if (type & TRANS_EXTWRITERS)
				126	atomic_dec(&trans->num_extwriters);
				127	}
				128
				129	static inline void extwriter_counter_init(struct btrfs_transaction *trans,
				130	unsigned int type)
				131	{
				132	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
				133	}
				134
				135	static inline int extwriter_counter_read(struct btrfs_transaction *trans)
				136	{
				137	return atomic_read(&trans->num_extwriters);
				138	}
				139
				140	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	141	* To be called after all the new block groups attached to the transaction
				142	* handle have been created (btrfs_create_pending_block_groups()).
				143	*/
				144	void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
				145	{
				146	struct btrfs_fs_info *fs_info = trans->fs_info;
				147
				148	if (!trans->chunk_bytes_reserved)
				149	return;
				150
				151	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
				152
				153	btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
				154	trans->chunk_bytes_reserved);
				155	trans->chunk_bytes_reserved = 0;
				156	}
				157
				158	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	159	* either allocate a new transaction or hop into the existing one
				160	*/
				161	static noinline int join_transaction(struct btrfs_fs_info *fs_info,
				162	unsigned int type)
				163	{
				164	struct btrfs_transaction *cur_trans;
				165
				166	spin_lock(&fs_info->trans_lock);
				167	loop:
				168	/* The file system has been taken offline. No new transactions. */
				169	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
				170	spin_unlock(&fs_info->trans_lock);
				171	return -EROFS;
				172	}
				173
				174	cur_trans = fs_info->running_transaction;
				175	if (cur_trans) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	176	if (TRANS_ABORTED(cur_trans)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	177	spin_unlock(&fs_info->trans_lock);
				178	return cur_trans->aborted;
				179	}
				180	if (btrfs_blocked_trans_types[cur_trans->state] & type) {
				181	spin_unlock(&fs_info->trans_lock);
				182	return -EBUSY;
				183	}
				184	refcount_inc(&cur_trans->use_count);
				185	atomic_inc(&cur_trans->num_writers);
				186	extwriter_counter_inc(cur_trans, type);
				187	spin_unlock(&fs_info->trans_lock);
				188	return 0;
				189	}
				190	spin_unlock(&fs_info->trans_lock);
				191
				192	/*
				193	* If we are ATTACH, we just want to catch the current transaction,
				194	* and commit it. If there is no transaction, just return ENOENT.
				195	*/
				196	if (type == TRANS_ATTACH)
				197	return -ENOENT;
				198
				199	/*
				200	* JOIN_NOLOCK only happens during the transaction commit, so
				201	* it is impossible that ->running_transaction is NULL
				202	*/
				203	BUG_ON(type == TRANS_JOIN_NOLOCK);
				204
				205	cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
				206	if (!cur_trans)
				207	return -ENOMEM;
				208
				209	spin_lock(&fs_info->trans_lock);
				210	if (fs_info->running_transaction) {
				211	/*
				212	* someone started a transaction after we unlocked. Make sure
				213	* to redo the checks above
				214	*/
				215	kfree(cur_trans);
				216	goto loop;
				217	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
				218	spin_unlock(&fs_info->trans_lock);
				219	kfree(cur_trans);
				220	return -EROFS;
				221	}
				222
				223	cur_trans->fs_info = fs_info;
				224	atomic_set(&cur_trans->num_writers, 1);
				225	extwriter_counter_init(cur_trans, type);
				226	init_waitqueue_head(&cur_trans->writer_wait);
				227	init_waitqueue_head(&cur_trans->commit_wait);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	228	cur_trans->state = TRANS_STATE_RUNNING;
				229	/*
				230	* One for this trans handle, one so it will live on until we
				231	* commit the transaction.
				232	*/
				233	refcount_set(&cur_trans->use_count, 2);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	234	cur_trans->flags = 0;
				235	cur_trans->start_time = ktime_get_seconds();
				236
				237	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
				238
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	239	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	240	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
				241	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
				242
				243	/*
				244	* although the tree mod log is per file system and not per transaction,
				245	* the log must never go across transaction boundaries.
				246	*/
				247	smp_mb();
				248	if (!list_empty(&fs_info->tree_mod_seq_list))
				249	WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
				250	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
				251	WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
				252	atomic64_set(&fs_info->tree_mod_seq, 0);
				253
				254	spin_lock_init(&cur_trans->delayed_refs.lock);
				255
				256	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	257	INIT_LIST_HEAD(&cur_trans->dev_update_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	258	INIT_LIST_HEAD(&cur_trans->switch_commits);
				259	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
				260	INIT_LIST_HEAD(&cur_trans->io_bgs);
				261	INIT_LIST_HEAD(&cur_trans->dropped_roots);
				262	mutex_init(&cur_trans->cache_write_mutex);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	263	spin_lock_init(&cur_trans->dirty_bgs_lock);
				264	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
				265	spin_lock_init(&cur_trans->dropped_roots_lock);
				266	list_add_tail(&cur_trans->list, &fs_info->trans_list);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	267	extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
				268	IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	269	fs_info->generation++;
				270	cur_trans->transid = fs_info->generation;
				271	fs_info->running_transaction = cur_trans;
				272	cur_trans->aborted = 0;
				273	spin_unlock(&fs_info->trans_lock);
				274
				275	return 0;
				276	}
				277
				278	/*
				279	* this does all the record keeping required to make sure that a reference
				280	* counted root is properly recorded in a given transaction. This is required
				281	* to make sure the old root from before we joined the transaction is deleted
				282	* when the transaction commits
				283	*/
				284	static int record_root_in_trans(struct btrfs_trans_handle *trans,
				285	struct btrfs_root *root,
				286	int force)
				287	{
				288	struct btrfs_fs_info *fs_info = root->fs_info;
				289
				290	if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
				291	root->last_trans < trans->transid) \|\| force) {
				292	WARN_ON(root == fs_info->extent_root);
				293	WARN_ON(!force && root->commit_root != root->node);
				294
				295	/*
				296	* see below for IN_TRANS_SETUP usage rules
				297	* we have the reloc mutex held now, so there
				298	* is only one writer in this function
				299	*/
				300	set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
				301
				302	/* make sure readers find IN_TRANS_SETUP before
				303	* they find our root->last_trans update
				304	*/
				305	smp_wmb();
				306
				307	spin_lock(&fs_info->fs_roots_radix_lock);
				308	if (root->last_trans == trans->transid && !force) {
				309	spin_unlock(&fs_info->fs_roots_radix_lock);
				310	return 0;
				311	}
				312	radix_tree_tag_set(&fs_info->fs_roots_radix,
				313	(unsigned long)root->root_key.objectid,
				314	BTRFS_ROOT_TRANS_TAG);
				315	spin_unlock(&fs_info->fs_roots_radix_lock);
				316	root->last_trans = trans->transid;
				317
				318	/* this is pretty tricky. We don't want to
				319	* take the relocation lock in btrfs_record_root_in_trans
				320	* unless we're really doing the first setup for this root in
				321	* this transaction.
				322	*
				323	* Normally we'd use root->last_trans as a flag to decide
				324	* if we want to take the expensive mutex.
				325	*
				326	* But, we have to set root->last_trans before we
				327	* init the relocation root, otherwise, we trip over warnings
				328	* in ctree.c. The solution used here is to flag ourselves
				329	* with root IN_TRANS_SETUP. When this is 1, we're still
				330	* fixing up the reloc trees and everyone must wait.
				331	*
				332	* When this is zero, they can trust root->last_trans and fly
				333	* through btrfs_record_root_in_trans without having to take the
				334	* lock. smp_wmb() makes sure that all the writes above are
				335	* done before we pop in the zero below
				336	*/
				337	btrfs_init_reloc_root(trans, root);
				338	smp_mb__before_atomic();
				339	clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
				340	}
				341	return 0;
				342	}
				343
				344
				345	void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
				346	struct btrfs_root *root)
				347	{
				348	struct btrfs_fs_info *fs_info = root->fs_info;
				349	struct btrfs_transaction *cur_trans = trans->transaction;
				350
				351	/* Add ourselves to the transaction dropped list */
				352	spin_lock(&cur_trans->dropped_roots_lock);
				353	list_add_tail(&root->root_list, &cur_trans->dropped_roots);
				354	spin_unlock(&cur_trans->dropped_roots_lock);
				355
				356	/* Make sure we don't try to update the root at commit time */
				357	spin_lock(&fs_info->fs_roots_radix_lock);
				358	radix_tree_tag_clear(&fs_info->fs_roots_radix,
				359	(unsigned long)root->root_key.objectid,
				360	BTRFS_ROOT_TRANS_TAG);
				361	spin_unlock(&fs_info->fs_roots_radix_lock);
				362	}
				363
				364	int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
				365	struct btrfs_root *root)
				366	{
				367	struct btrfs_fs_info *fs_info = root->fs_info;
				368
				369	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
				370	return 0;
				371
				372	/*
				373	* see record_root_in_trans for comments about IN_TRANS_SETUP usage
				374	* and barriers
				375	*/
				376	smp_rmb();
				377	if (root->last_trans == trans->transid &&
				378	!test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
				379	return 0;
				380
				381	mutex_lock(&fs_info->reloc_mutex);
				382	record_root_in_trans(trans, root, 0);
				383	mutex_unlock(&fs_info->reloc_mutex);
				384
				385	return 0;
				386	}
				387
				388	static inline int is_transaction_blocked(struct btrfs_transaction *trans)
				389	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	390	return (trans->state >= TRANS_STATE_COMMIT_START &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	391	trans->state < TRANS_STATE_UNBLOCKED &&
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	392	!TRANS_ABORTED(trans));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	393	}
				394
				395	/* wait for commit against the current transaction to become unblocked
				396	* when this is done, it is safe to start a new transaction, but the current
				397	* transaction might not be fully on disk.
				398	*/
				399	static void wait_current_trans(struct btrfs_fs_info *fs_info)
				400	{
				401	struct btrfs_transaction *cur_trans;
				402
				403	spin_lock(&fs_info->trans_lock);
				404	cur_trans = fs_info->running_transaction;
				405	if (cur_trans && is_transaction_blocked(cur_trans)) {
				406	refcount_inc(&cur_trans->use_count);
				407	spin_unlock(&fs_info->trans_lock);
				408
				409	wait_event(fs_info->transaction_wait,
				410	cur_trans->state >= TRANS_STATE_UNBLOCKED \|\|
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	411	TRANS_ABORTED(cur_trans));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	412	btrfs_put_transaction(cur_trans);
				413	} else {
				414	spin_unlock(&fs_info->trans_lock);
				415	}
				416	}
				417
				418	static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
				419	{
				420	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
				421	return 0;
				422
				423	if (type == TRANS_START)
				424	return 1;
				425
				426	return 0;
				427	}
				428
				429	static inline bool need_reserve_reloc_root(struct btrfs_root *root)
				430	{
				431	struct btrfs_fs_info *fs_info = root->fs_info;
				432
				433	if (!fs_info->reloc_ctl \|\|
				434	!test_bit(BTRFS_ROOT_REF_COWS, &root->state) \|\|
				435	root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID \|\|
				436	root->reloc_root)
				437	return false;
				438
				439	return true;
				440	}
				441
				442	static struct btrfs_trans_handle *
				443	start_transaction(struct btrfs_root *root, unsigned int num_items,
				444	unsigned int type, enum btrfs_reserve_flush_enum flush,
				445	bool enforce_qgroups)
				446	{
				447	struct btrfs_fs_info *fs_info = root->fs_info;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	448	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	449	struct btrfs_trans_handle *h;
				450	struct btrfs_transaction *cur_trans;
				451	u64 num_bytes = 0;
				452	u64 qgroup_reserved = 0;
				453	bool reloc_reserved = false;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	454	bool do_chunk_alloc = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	455	int ret;
				456
				457	/* Send isn't supposed to start transactions. */
				458	ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
				459
				460	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
				461	return ERR_PTR(-EROFS);
				462
				463	if (current->journal_info) {
				464	WARN_ON(type & TRANS_EXTWRITERS);
				465	h = current->journal_info;
				466	refcount_inc(&h->use_count);
				467	WARN_ON(refcount_read(&h->use_count) > 2);
				468	h->orig_rsv = h->block_rsv;
				469	h->block_rsv = NULL;
				470	goto got_it;
				471	}
				472
				473	/*
				474	* Do the reservation before we join the transaction so we can do all
				475	* the appropriate flushing if need be.
				476	*/
				477	if (num_items && root != fs_info->chunk_root) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	478	struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
				479	u64 delayed_refs_bytes = 0;
				480
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	481	qgroup_reserved = num_items * fs_info->nodesize;
				482	ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
				483	enforce_qgroups);
				484	if (ret)
				485	return ERR_PTR(ret);
				486
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	487	/*
				488	* We want to reserve all the bytes we may need all at once, so
				489	* we only do 1 enospc flushing cycle per transaction start. We
				490	* accomplish this by simply assuming we'll do 2 x num_items
				491	* worth of delayed refs updates in this trans handle, and
				492	* refill that amount for whatever is missing in the reserve.
				493	*/
				494	num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	495	if (flush == BTRFS_RESERVE_FLUSH_ALL &&
				496	delayed_refs_rsv->full == 0) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	497	delayed_refs_bytes = num_bytes;
				498	num_bytes <<= 1;
				499	}
				500
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	501	/*
				502	* Do the reservation for the relocation root creation
				503	*/
				504	if (need_reserve_reloc_root(root)) {
				505	num_bytes += fs_info->nodesize;
				506	reloc_reserved = true;
				507	}
				508
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	509	ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
				510	if (ret)
				511	goto reserve_fail;
				512	if (delayed_refs_bytes) {
				513	btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
				514	delayed_refs_bytes);
				515	num_bytes -= delayed_refs_bytes;
				516	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	517
				518	if (rsv->space_info->force_alloc)
				519	do_chunk_alloc = true;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	520	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
				521	!delayed_refs_rsv->full) {
				522	/*
				523	* Some people call with btrfs_start_transaction(root, 0)
				524	* because they can be throttled, but have some other mechanism
				525	* for reserving space. We still want these guys to refill the
				526	* delayed block_rsv so just add 1 items worth of reservation
				527	* here.
				528	*/
				529	ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	530	if (ret)
				531	goto reserve_fail;
				532	}
				533	again:
				534	h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
				535	if (!h) {
				536	ret = -ENOMEM;
				537	goto alloc_fail;
				538	}
				539
				540	/*
				541	* If we are JOIN_NOLOCK we're already committing a transaction and
				542	* waiting on this guy, so we don't need to do the sb_start_intwrite
				543	* because we're already holding a ref. We need this because we could
				544	* have raced in and did an fsync() on a file which can kick a commit
				545	* and then we deadlock with somebody doing a freeze.
				546	*
				547	* If we are ATTACH, it means we just want to catch the current
				548	* transaction and commit it, so we needn't do sb_start_intwrite().
				549	*/
				550	if (type & __TRANS_FREEZABLE)
				551	sb_start_intwrite(fs_info->sb);
				552
				553	if (may_wait_transaction(fs_info, type))
				554	wait_current_trans(fs_info);
				555
				556	do {
				557	ret = join_transaction(fs_info, type);
				558	if (ret == -EBUSY) {
				559	wait_current_trans(fs_info);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	560	if (unlikely(type == TRANS_ATTACH \|\|
				561	type == TRANS_JOIN_NOSTART))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	562	ret = -ENOENT;
				563	}
				564	} while (ret == -EBUSY);
				565
				566	if (ret < 0)
				567	goto join_fail;
				568
				569	cur_trans = fs_info->running_transaction;
				570
				571	h->transid = cur_trans->transid;
				572	h->transaction = cur_trans;
				573	h->root = root;
				574	refcount_set(&h->use_count, 1);
				575	h->fs_info = root->fs_info;
				576
				577	h->type = type;
				578	h->can_flush_pending_bgs = true;
				579	INIT_LIST_HEAD(&h->new_bgs);
				580
				581	smp_mb();
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	582	if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	583	may_wait_transaction(fs_info, type)) {
				584	current->journal_info = h;
				585	btrfs_commit_transaction(h);
				586	goto again;
				587	}
				588
				589	if (num_bytes) {
				590	trace_btrfs_space_reservation(fs_info, "transaction",
				591	h->transid, num_bytes, 1);
				592	h->block_rsv = &fs_info->trans_block_rsv;
				593	h->bytes_reserved = num_bytes;
				594	h->reloc_reserved = reloc_reserved;
				595	}
				596
				597	got_it:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	598	if (!current->journal_info)
				599	current->journal_info = h;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	600
				601	/*
				602	* If the space_info is marked ALLOC_FORCE then we'll get upgraded to
				603	* ALLOC_FORCE the first run through, and then we won't allocate for
				604	* anybody else who races in later. We don't care about the return
				605	* value here.
				606	*/
				607	if (do_chunk_alloc && num_bytes) {
				608	u64 flags = h->block_rsv->space_info->flags;
				609
				610	btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
				611	CHUNK_ALLOC_NO_FORCE);
				612	}
				613
				614	/*
				615	* btrfs_record_root_in_trans() needs to alloc new extents, and may
				616	* call btrfs_join_transaction() while we're also starting a
				617	* transaction.
				618	*
				619	* Thus it need to be called after current->journal_info initialized,
				620	* or we can deadlock.
				621	*/
				622	btrfs_record_root_in_trans(h, root);
				623
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	624	return h;
				625
				626	join_fail:
				627	if (type & __TRANS_FREEZABLE)
				628	sb_end_intwrite(fs_info->sb);
				629	kmem_cache_free(btrfs_trans_handle_cachep, h);
				630	alloc_fail:
				631	if (num_bytes)
				632	btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
				633	num_bytes);
				634	reserve_fail:
				635	btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
				636	return ERR_PTR(ret);
				637	}
				638
				639	struct btrfs_trans_handle btrfs_start_transaction(struct btrfs_root root,
				640	unsigned int num_items)
				641	{
				642	return start_transaction(root, num_items, TRANS_START,
				643	BTRFS_RESERVE_FLUSH_ALL, true);
				644	}
				645
				646	struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
				647	struct btrfs_root *root,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	648	unsigned int num_items)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	649	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	650	return start_transaction(root, num_items, TRANS_START,
				651	BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	652	}
				653
				654	struct btrfs_trans_handle btrfs_join_transaction(struct btrfs_root root)
				655	{
				656	return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
				657	true);
				658	}
				659
				660	struct btrfs_trans_handle btrfs_join_transaction_nolock(struct btrfs_root root)
				661	{
				662	return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
				663	BTRFS_RESERVE_NO_FLUSH, true);
				664	}
				665
				666	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	667	* Similar to regular join but it never starts a transaction when none is
				668	* running or after waiting for the current one to finish.
				669	*/
				670	struct btrfs_trans_handle btrfs_join_transaction_nostart(struct btrfs_root root)
				671	{
				672	return start_transaction(root, 0, TRANS_JOIN_NOSTART,
				673	BTRFS_RESERVE_NO_FLUSH, true);
				674	}
				675
				676	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	677	* btrfs_attach_transaction() - catch the running transaction
				678	*
				679	* It is used when we want to commit the current the transaction, but
				680	* don't want to start a new one.
				681	*
				682	* Note: If this function return -ENOENT, it just means there is no
				683	* running transaction. But it is possible that the inactive transaction
				684	* is still in the memory, not fully on disk. If you hope there is no
				685	* inactive transaction in the fs when -ENOENT is returned, you should
				686	* invoke
				687	* btrfs_attach_transaction_barrier()
				688	*/
				689	struct btrfs_trans_handle btrfs_attach_transaction(struct btrfs_root root)
				690	{
				691	return start_transaction(root, 0, TRANS_ATTACH,
				692	BTRFS_RESERVE_NO_FLUSH, true);
				693	}
				694
				695	/*
				696	* btrfs_attach_transaction_barrier() - catch the running transaction
				697	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	698	* It is similar to the above function, the difference is this one
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	699	* will wait for all the inactive transactions until they fully
				700	* complete.
				701	*/
				702	struct btrfs_trans_handle *
				703	btrfs_attach_transaction_barrier(struct btrfs_root *root)
				704	{
				705	struct btrfs_trans_handle *trans;
				706
				707	trans = start_transaction(root, 0, TRANS_ATTACH,
				708	BTRFS_RESERVE_NO_FLUSH, true);
				709	if (trans == ERR_PTR(-ENOENT))
				710	btrfs_wait_for_commit(root->fs_info, 0);
				711
				712	return trans;
				713	}
				714
				715	/* wait for a transaction commit to be fully complete */
				716	static noinline void wait_for_commit(struct btrfs_transaction *commit)
				717	{
				718	wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
				719	}
				720
				721	int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
				722	{
				723	struct btrfs_transaction cur_trans = NULL, t;
				724	int ret = 0;
				725
				726	if (transid) {
				727	if (transid <= fs_info->last_trans_committed)
				728	goto out;
				729
				730	/* find specified transaction */
				731	spin_lock(&fs_info->trans_lock);
				732	list_for_each_entry(t, &fs_info->trans_list, list) {
				733	if (t->transid == transid) {
				734	cur_trans = t;
				735	refcount_inc(&cur_trans->use_count);
				736	ret = 0;
				737	break;
				738	}
				739	if (t->transid > transid) {
				740	ret = 0;
				741	break;
				742	}
				743	}
				744	spin_unlock(&fs_info->trans_lock);
				745
				746	/*
				747	* The specified transaction doesn't exist, or we
				748	* raced with btrfs_commit_transaction
				749	*/
				750	if (!cur_trans) {
				751	if (transid > fs_info->last_trans_committed)
				752	ret = -EINVAL;
				753	goto out;
				754	}
				755	} else {
				756	/* find newest transaction that is committing \| committed */
				757	spin_lock(&fs_info->trans_lock);
				758	list_for_each_entry_reverse(t, &fs_info->trans_list,
				759	list) {
				760	if (t->state >= TRANS_STATE_COMMIT_START) {
				761	if (t->state == TRANS_STATE_COMPLETED)
				762	break;
				763	cur_trans = t;
				764	refcount_inc(&cur_trans->use_count);
				765	break;
				766	}
				767	}
				768	spin_unlock(&fs_info->trans_lock);
				769	if (!cur_trans)
				770	goto out; /* nothing committing\|committed */
				771	}
				772
				773	wait_for_commit(cur_trans);
				774	btrfs_put_transaction(cur_trans);
				775	out:
				776	return ret;
				777	}
				778
				779	void btrfs_throttle(struct btrfs_fs_info *fs_info)
				780	{
				781	wait_current_trans(fs_info);
				782	}
				783
				784	static int should_end_transaction(struct btrfs_trans_handle *trans)
				785	{
				786	struct btrfs_fs_info *fs_info = trans->fs_info;
				787
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	788	if (btrfs_check_space_for_delayed_refs(fs_info))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	789	return 1;
				790
				791	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
				792	}
				793
				794	int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
				795	{
				796	struct btrfs_transaction *cur_trans = trans->transaction;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	797
				798	smp_mb();
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	799	if (cur_trans->state >= TRANS_STATE_COMMIT_START \|\|
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	800	cur_trans->delayed_refs.flushing)
				801	return 1;
				802
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	803	return should_end_transaction(trans);
				804	}
				805
				806	static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
				807
				808	{
				809	struct btrfs_fs_info *fs_info = trans->fs_info;
				810
				811	if (!trans->block_rsv) {
				812	ASSERT(!trans->bytes_reserved);
				813	return;
				814	}
				815
				816	if (!trans->bytes_reserved)
				817	return;
				818
				819	ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
				820	trace_btrfs_space_reservation(fs_info, "transaction",
				821	trans->transid, trans->bytes_reserved, 0);
				822	btrfs_block_rsv_release(fs_info, trans->block_rsv,
				823	trans->bytes_reserved);
				824	trans->bytes_reserved = 0;
				825	}
				826
				827	static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
				828	int throttle)
				829	{
				830	struct btrfs_fs_info *info = trans->fs_info;
				831	struct btrfs_transaction *cur_trans = trans->transaction;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	832	int err = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	833
				834	if (refcount_read(&trans->use_count) > 1) {
				835	refcount_dec(&trans->use_count);
				836	trans->block_rsv = trans->orig_rsv;
				837	return 0;
				838	}
				839
				840	btrfs_trans_release_metadata(trans);
				841	trans->block_rsv = NULL;
				842
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	843	btrfs_create_pending_block_groups(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	844
				845	btrfs_trans_release_chunk_metadata(trans);
				846
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	847	if (trans->type & __TRANS_FREEZABLE)
				848	sb_end_intwrite(info->sb);
				849
				850	WARN_ON(cur_trans != info->running_transaction);
				851	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
				852	atomic_dec(&cur_trans->num_writers);
				853	extwriter_counter_dec(cur_trans, trans->type);
				854
				855	cond_wake_up(&cur_trans->writer_wait);
				856	btrfs_put_transaction(cur_trans);
				857
				858	if (current->journal_info == trans)
				859	current->journal_info = NULL;
				860
				861	if (throttle)
				862	btrfs_run_delayed_iputs(info);
				863
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	864	if (TRANS_ABORTED(trans) \|\|
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	865	test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
				866	wake_up_process(info->transaction_kthread);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	867	if (TRANS_ABORTED(trans))
				868	err = trans->aborted;
				869	else
				870	err = -EROFS;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	871	}
				872
				873	kmem_cache_free(btrfs_trans_handle_cachep, trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	874	return err;
				875	}
				876
				877	int btrfs_end_transaction(struct btrfs_trans_handle *trans)
				878	{
				879	return __btrfs_end_transaction(trans, 0);
				880	}
				881
				882	int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
				883	{
				884	return __btrfs_end_transaction(trans, 1);
				885	}
				886
				887	/*
				888	* when btree blocks are allocated, they have some corresponding bits set for
				889	* them in one of two extent_io trees. This is used to make sure all of
				890	* those extents are sent to disk but does not wait on them
				891	*/
				892	int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
				893	struct extent_io_tree *dirty_pages, int mark)
				894	{
				895	int err = 0;
				896	int werr = 0;
				897	struct address_space *mapping = fs_info->btree_inode->i_mapping;
				898	struct extent_state *cached_state = NULL;
				899	u64 start = 0;
				900	u64 end;
				901
				902	atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
				903	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				904	mark, &cached_state)) {
				905	bool wait_writeback = false;
				906
				907	err = convert_extent_bit(dirty_pages, start, end,
				908	EXTENT_NEED_WAIT,
				909	mark, &cached_state);
				910	/*
				911	* convert_extent_bit can return -ENOMEM, which is most of the
				912	* time a temporary error. So when it happens, ignore the error
				913	* and wait for writeback of this range to finish - because we
				914	* failed to set the bit EXTENT_NEED_WAIT for the range, a call
				915	* to __btrfs_wait_marked_extents() would not know that
				916	* writeback for this range started and therefore wouldn't
				917	* wait for it to finish - we don't want to commit a
				918	* superblock that points to btree nodes/leafs for which
				919	* writeback hasn't finished yet (and without errors).
				920	* We cleanup any entries left in the io tree when committing
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	921	* the transaction (through extent_io_tree_release()).
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	922	*/
				923	if (err == -ENOMEM) {
				924	err = 0;
				925	wait_writeback = true;
				926	}
				927	if (!err)
				928	err = filemap_fdatawrite_range(mapping, start, end);
				929	if (err)
				930	werr = err;
				931	else if (wait_writeback)
				932	werr = filemap_fdatawait_range(mapping, start, end);
				933	free_extent_state(cached_state);
				934	cached_state = NULL;
				935	cond_resched();
				936	start = end + 1;
				937	}
				938	atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
				939	return werr;
				940	}
				941
				942	/*
				943	* when btree blocks are allocated, they have some corresponding bits set for
				944	* them in one of two extent_io trees. This is used to make sure all of
				945	* those extents are on disk for transaction or log commit. We wait
				946	* on all the pages and clear them from the dirty pages state tree
				947	*/
				948	static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
				949	struct extent_io_tree *dirty_pages)
				950	{
				951	int err = 0;
				952	int werr = 0;
				953	struct address_space *mapping = fs_info->btree_inode->i_mapping;
				954	struct extent_state *cached_state = NULL;
				955	u64 start = 0;
				956	u64 end;
				957
				958	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				959	EXTENT_NEED_WAIT, &cached_state)) {
				960	/*
				961	* Ignore -ENOMEM errors returned by clear_extent_bit().
				962	* When committing the transaction, we'll remove any entries
				963	* left in the io tree. For a log commit, we don't remove them
				964	* after committing the log because the tree can be accessed
				965	* concurrently - we do it only at transaction commit time when
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	966	* it's safe to do it (through extent_io_tree_release()).
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	967	*/
				968	err = clear_extent_bit(dirty_pages, start, end,
				969	EXTENT_NEED_WAIT, 0, 0, &cached_state);
				970	if (err == -ENOMEM)
				971	err = 0;
				972	if (!err)
				973	err = filemap_fdatawait_range(mapping, start, end);
				974	if (err)
				975	werr = err;
				976	free_extent_state(cached_state);
				977	cached_state = NULL;
				978	cond_resched();
				979	start = end + 1;
				980	}
				981	if (err)
				982	werr = err;
				983	return werr;
				984	}
				985
				986	int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
				987	struct extent_io_tree *dirty_pages)
				988	{
				989	bool errors = false;
				990	int err;
				991
				992	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
				993	if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
				994	errors = true;
				995
				996	if (errors && !err)
				997	err = -EIO;
				998	return err;
				999	}
				1000
				1001	int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
				1002	{
				1003	struct btrfs_fs_info *fs_info = log_root->fs_info;
				1004	struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
				1005	bool errors = false;
				1006	int err;
				1007
				1008	ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
				1009
				1010	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
				1011	if ((mark & EXTENT_DIRTY) &&
				1012	test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
				1013	errors = true;
				1014
				1015	if ((mark & EXTENT_NEW) &&
				1016	test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
				1017	errors = true;
				1018
				1019	if (errors && !err)
				1020	err = -EIO;
				1021	return err;
				1022	}
				1023
				1024	/*
				1025	* When btree blocks are allocated the corresponding extents are marked dirty.
				1026	* This function ensures such extents are persisted on disk for transaction or
				1027	* log commit.
				1028	*
				1029	* @trans: transaction whose dirty pages we'd like to write
				1030	*/
				1031	static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
				1032	{
				1033	int ret;
				1034	int ret2;
				1035	struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
				1036	struct btrfs_fs_info *fs_info = trans->fs_info;
				1037	struct blk_plug plug;
				1038
				1039	blk_start_plug(&plug);
				1040	ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
				1041	blk_finish_plug(&plug);
				1042	ret2 = btrfs_wait_extents(fs_info, dirty_pages);
				1043
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1044	extent_io_tree_release(&trans->transaction->dirty_pages);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1045
				1046	if (ret)
				1047	return ret;
				1048	else if (ret2)
				1049	return ret2;
				1050	else
				1051	return 0;
				1052	}
				1053
				1054	/*
				1055	* this is used to update the root pointer in the tree of tree roots.
				1056	*
				1057	* But, in the case of the extent allocation tree, updating the root
				1058	* pointer may allocate blocks which may change the root of the extent
				1059	* allocation tree.
				1060	*
				1061	* So, this loops and repeats and makes sure the cowonly root didn't
				1062	* change while the root pointer was being updated in the metadata.
				1063	*/
				1064	static int update_cowonly_root(struct btrfs_trans_handle *trans,
				1065	struct btrfs_root *root)
				1066	{
				1067	int ret;
				1068	u64 old_root_bytenr;
				1069	u64 old_root_used;
				1070	struct btrfs_fs_info *fs_info = root->fs_info;
				1071	struct btrfs_root *tree_root = fs_info->tree_root;
				1072
				1073	old_root_used = btrfs_root_used(&root->root_item);
				1074
				1075	while (1) {
				1076	old_root_bytenr = btrfs_root_bytenr(&root->root_item);
				1077	if (old_root_bytenr == root->node->start &&
				1078	old_root_used == btrfs_root_used(&root->root_item))
				1079	break;
				1080
				1081	btrfs_set_root_node(&root->root_item, root->node);
				1082	ret = btrfs_update_root(trans, tree_root,
				1083	&root->root_key,
				1084	&root->root_item);
				1085	if (ret)
				1086	return ret;
				1087
				1088	old_root_used = btrfs_root_used(&root->root_item);
				1089	}
				1090
				1091	return 0;
				1092	}
				1093
				1094	/*
				1095	* update all the cowonly tree roots on disk
				1096	*
				1097	* The error handling in this function may not be obvious. Any of the
				1098	* failures will cause the file system to go offline. We still need
				1099	* to clean up the delayed refs.
				1100	*/
				1101	static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
				1102	{
				1103	struct btrfs_fs_info *fs_info = trans->fs_info;
				1104	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
				1105	struct list_head *io_bgs = &trans->transaction->io_bgs;
				1106	struct list_head *next;
				1107	struct extent_buffer *eb;
				1108	int ret;
				1109
				1110	eb = btrfs_lock_root_node(fs_info->tree_root);
				1111	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
				1112	0, &eb);
				1113	btrfs_tree_unlock(eb);
				1114	free_extent_buffer(eb);
				1115
				1116	if (ret)
				1117	return ret;
				1118
				1119	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1120	if (ret)
				1121	return ret;
				1122
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1123	ret = btrfs_run_dev_stats(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1124	if (ret)
				1125	return ret;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1126	ret = btrfs_run_dev_replace(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1127	if (ret)
				1128	return ret;
				1129	ret = btrfs_run_qgroups(trans);
				1130	if (ret)
				1131	return ret;
				1132
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1133	ret = btrfs_setup_space_cache(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1134	if (ret)
				1135	return ret;
				1136
				1137	/* run_qgroups might have added some more refs */
				1138	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1139	if (ret)
				1140	return ret;
				1141	again:
				1142	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
				1143	struct btrfs_root *root;
				1144	next = fs_info->dirty_cowonly_roots.next;
				1145	list_del_init(next);
				1146	root = list_entry(next, struct btrfs_root, dirty_list);
				1147	clear_bit(BTRFS_ROOT_DIRTY, &root->state);
				1148
				1149	if (root != fs_info->extent_root)
				1150	list_add_tail(&root->dirty_list,
				1151	&trans->transaction->switch_commits);
				1152	ret = update_cowonly_root(trans, root);
				1153	if (ret)
				1154	return ret;
				1155	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1156	if (ret)
				1157	return ret;
				1158	}
				1159
				1160	while (!list_empty(dirty_bgs) \|\| !list_empty(io_bgs)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1161	ret = btrfs_write_dirty_block_groups(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1162	if (ret)
				1163	return ret;
				1164	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1165	if (ret)
				1166	return ret;
				1167	}
				1168
				1169	if (!list_empty(&fs_info->dirty_cowonly_roots))
				1170	goto again;
				1171
				1172	list_add_tail(&fs_info->extent_root->dirty_list,
				1173	&trans->transaction->switch_commits);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1174
				1175	/* Update dev-replace pointer once everything is committed */
				1176	fs_info->dev_replace.committed_cursor_left =
				1177	fs_info->dev_replace.cursor_left_last_write_of_item;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1178
				1179	return 0;
				1180	}
				1181
				1182	/*
				1183	* dead roots are old snapshots that need to be deleted. This allocates
				1184	* a dirty root struct and adds it into the list of dead roots that need to
				1185	* be deleted
				1186	*/
				1187	void btrfs_add_dead_root(struct btrfs_root *root)
				1188	{
				1189	struct btrfs_fs_info *fs_info = root->fs_info;
				1190
				1191	spin_lock(&fs_info->trans_lock);
				1192	if (list_empty(&root->root_list))
				1193	list_add_tail(&root->root_list, &fs_info->dead_roots);
				1194	spin_unlock(&fs_info->trans_lock);
				1195	}
				1196
				1197	/*
				1198	* update all the cowonly tree roots on disk
				1199	*/
				1200	static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
				1201	{
				1202	struct btrfs_fs_info *fs_info = trans->fs_info;
				1203	struct btrfs_root *gang[8];
				1204	int i;
				1205	int ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1206
				1207	spin_lock(&fs_info->fs_roots_radix_lock);
				1208	while (1) {
				1209	ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
				1210	(void **)gang, 0,
				1211	ARRAY_SIZE(gang),
				1212	BTRFS_ROOT_TRANS_TAG);
				1213	if (ret == 0)
				1214	break;
				1215	for (i = 0; i < ret; i++) {
				1216	struct btrfs_root *root = gang[i];
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1217	int ret2;
				1218
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1219	radix_tree_tag_clear(&fs_info->fs_roots_radix,
				1220	(unsigned long)root->root_key.objectid,
				1221	BTRFS_ROOT_TRANS_TAG);
				1222	spin_unlock(&fs_info->fs_roots_radix_lock);
				1223
				1224	btrfs_free_log(trans, root);
				1225	btrfs_update_reloc_root(trans, root);
				1226
				1227	btrfs_save_ino_cache(root, trans);
				1228
				1229	/* see comments in should_cow_block() */
				1230	clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
				1231	smp_mb__after_atomic();
				1232
				1233	if (root->commit_root != root->node) {
				1234	list_add_tail(&root->dirty_list,
				1235	&trans->transaction->switch_commits);
				1236	btrfs_set_root_node(&root->root_item,
				1237	root->node);
				1238	}
				1239
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1240	ret2 = btrfs_update_root(trans, fs_info->tree_root,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1241	&root->root_key,
				1242	&root->root_item);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1243	if (ret2)
				1244	return ret2;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1245	spin_lock(&fs_info->fs_roots_radix_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1246	btrfs_qgroup_free_meta_all_pertrans(root);
				1247	}
				1248	}
				1249	spin_unlock(&fs_info->fs_roots_radix_lock);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1250	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1251	}
				1252
				1253	/*
				1254	* defrag a given btree.
				1255	* Every leaf in the btree is read and defragged.
				1256	*/
				1257	int btrfs_defrag_root(struct btrfs_root *root)
				1258	{
				1259	struct btrfs_fs_info *info = root->fs_info;
				1260	struct btrfs_trans_handle *trans;
				1261	int ret;
				1262
				1263	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
				1264	return 0;
				1265
				1266	while (1) {
				1267	trans = btrfs_start_transaction(root, 0);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1268	if (IS_ERR(trans)) {
				1269	ret = PTR_ERR(trans);
				1270	break;
				1271	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1272
				1273	ret = btrfs_defrag_leaves(trans, root);
				1274
				1275	btrfs_end_transaction(trans);
				1276	btrfs_btree_balance_dirty(info);
				1277	cond_resched();
				1278
				1279	if (btrfs_fs_closing(info) \|\| ret != -EAGAIN)
				1280	break;
				1281
				1282	if (btrfs_defrag_cancelled(info)) {
				1283	btrfs_debug(info, "defrag_root cancelled");
				1284	ret = -EAGAIN;
				1285	break;
				1286	}
				1287	}
				1288	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
				1289	return ret;
				1290	}
				1291
				1292	/*
				1293	* Do all special snapshot related qgroup dirty hack.
				1294	*
				1295	* Will do all needed qgroup inherit and dirty hack like switch commit
				1296	* roots inside one transaction and write all btree into disk, to make
				1297	* qgroup works.
				1298	*/
				1299	static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
				1300	struct btrfs_root *src,
				1301	struct btrfs_root *parent,
				1302	struct btrfs_qgroup_inherit *inherit,
				1303	u64 dst_objectid)
				1304	{
				1305	struct btrfs_fs_info *fs_info = src->fs_info;
				1306	int ret;
				1307
				1308	/*
				1309	* Save some performance in the case that qgroups are not
				1310	* enabled. If this check races with the ioctl, rescan will
				1311	* kick in anyway.
				1312	*/
				1313	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
				1314	return 0;
				1315
				1316	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1317	* Ensure dirty @src will be committed. Or, after coming
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1318	* commit_fs_roots() and switch_commit_roots(), any dirty but not
				1319	* recorded root will never be updated again, causing an outdated root
				1320	* item.
				1321	*/
				1322	record_root_in_trans(trans, src, 1);
				1323
				1324	/*
				1325	* We are going to commit transaction, see btrfs_commit_transaction()
				1326	* comment for reason locking tree_log_mutex
				1327	*/
				1328	mutex_lock(&fs_info->tree_log_mutex);
				1329
				1330	ret = commit_fs_roots(trans);
				1331	if (ret)
				1332	goto out;
				1333	ret = btrfs_qgroup_account_extents(trans);
				1334	if (ret < 0)
				1335	goto out;
				1336
				1337	/* Now qgroup are all updated, we can inherit it to new qgroups */
				1338	ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
				1339	inherit);
				1340	if (ret < 0)
				1341	goto out;
				1342
				1343	/*
				1344	* Now we do a simplified commit transaction, which will:
				1345	* 1) commit all subvolume and extent tree
				1346	* To ensure all subvolume and extent tree have a valid
				1347	* commit_root to accounting later insert_dir_item()
				1348	* 2) write all btree blocks onto disk
				1349	* This is to make sure later btree modification will be cowed
				1350	* Or commit_root can be populated and cause wrong qgroup numbers
				1351	* In this simplified commit, we don't really care about other trees
				1352	* like chunk and root tree, as they won't affect qgroup.
				1353	* And we don't write super to avoid half committed status.
				1354	*/
				1355	ret = commit_cowonly_roots(trans);
				1356	if (ret)
				1357	goto out;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1358	switch_commit_roots(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1359	ret = btrfs_write_and_wait_transaction(trans);
				1360	if (ret)
				1361	btrfs_handle_fs_error(fs_info, ret,
				1362	"Error while writing out transaction for qgroup");
				1363
				1364	out:
				1365	mutex_unlock(&fs_info->tree_log_mutex);
				1366
				1367	/*
				1368	* Force parent root to be updated, as we recorded it before so its
				1369	* last_trans == cur_transid.
				1370	* Or it won't be committed again onto disk after later
				1371	* insert_dir_item()
				1372	*/
				1373	if (!ret)
				1374	record_root_in_trans(trans, parent, 1);
				1375	return ret;
				1376	}
				1377
				1378	/*
				1379	* new snapshots need to be created at a very specific time in the
				1380	* transaction commit. This does the actual creation.
				1381	*
				1382	* Note:
				1383	* If the error which may affect the commitment of the current transaction
				1384	* happens, we should return the error number. If the error which just affect
				1385	* the creation of the pending snapshots, just return 0.
				1386	*/
				1387	static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				1388	struct btrfs_pending_snapshot *pending)
				1389	{
				1390
				1391	struct btrfs_fs_info *fs_info = trans->fs_info;
				1392	struct btrfs_key key;
				1393	struct btrfs_root_item *new_root_item;
				1394	struct btrfs_root *tree_root = fs_info->tree_root;
				1395	struct btrfs_root *root = pending->root;
				1396	struct btrfs_root *parent_root;
				1397	struct btrfs_block_rsv *rsv;
				1398	struct inode *parent_inode;
				1399	struct btrfs_path *path;
				1400	struct btrfs_dir_item *dir_item;
				1401	struct dentry *dentry;
				1402	struct extent_buffer *tmp;
				1403	struct extent_buffer *old;
				1404	struct timespec64 cur_time;
				1405	int ret = 0;
				1406	u64 to_reserve = 0;
				1407	u64 index = 0;
				1408	u64 objectid;
				1409	u64 root_flags;
				1410	uuid_le new_uuid;
				1411
				1412	ASSERT(pending->path);
				1413	path = pending->path;
				1414
				1415	ASSERT(pending->root_item);
				1416	new_root_item = pending->root_item;
				1417
				1418	pending->error = btrfs_find_free_objectid(tree_root, &objectid);
				1419	if (pending->error)
				1420	goto no_free_objectid;
				1421
				1422	/*
				1423	* Make qgroup to skip current new snapshot's qgroupid, as it is
				1424	* accounted by later btrfs_qgroup_inherit().
				1425	*/
				1426	btrfs_set_skip_qgroup(trans, objectid);
				1427
				1428	btrfs_reloc_pre_snapshot(pending, &to_reserve);
				1429
				1430	if (to_reserve > 0) {
				1431	pending->error = btrfs_block_rsv_add(root,
				1432	&pending->block_rsv,
				1433	to_reserve,
				1434	BTRFS_RESERVE_NO_FLUSH);
				1435	if (pending->error)
				1436	goto clear_skip_qgroup;
				1437	}
				1438
				1439	key.objectid = objectid;
				1440	key.offset = (u64)-1;
				1441	key.type = BTRFS_ROOT_ITEM_KEY;
				1442
				1443	rsv = trans->block_rsv;
				1444	trans->block_rsv = &pending->block_rsv;
				1445	trans->bytes_reserved = trans->block_rsv->reserved;
				1446	trace_btrfs_space_reservation(fs_info, "transaction",
				1447	trans->transid,
				1448	trans->bytes_reserved, 1);
				1449	dentry = pending->dentry;
				1450	parent_inode = pending->dir;
				1451	parent_root = BTRFS_I(parent_inode)->root;
				1452	record_root_in_trans(trans, parent_root, 0);
				1453
				1454	cur_time = current_time(parent_inode);
				1455
				1456	/*
				1457	* insert the directory item
				1458	*/
				1459	ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
				1460	BUG_ON(ret); /* -ENOMEM */
				1461
				1462	/* check if there is a file/dir which has the same name. */
				1463	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
				1464	btrfs_ino(BTRFS_I(parent_inode)),
				1465	dentry->d_name.name,
				1466	dentry->d_name.len, 0);
				1467	if (dir_item != NULL && !IS_ERR(dir_item)) {
				1468	pending->error = -EEXIST;
				1469	goto dir_item_existed;
				1470	} else if (IS_ERR(dir_item)) {
				1471	ret = PTR_ERR(dir_item);
				1472	btrfs_abort_transaction(trans, ret);
				1473	goto fail;
				1474	}
				1475	btrfs_release_path(path);
				1476
				1477	/*
				1478	* pull in the delayed directory update
				1479	* and the delayed inode item
				1480	* otherwise we corrupt the FS during
				1481	* snapshot
				1482	*/
				1483	ret = btrfs_run_delayed_items(trans);
				1484	if (ret) { /* Transaction aborted */
				1485	btrfs_abort_transaction(trans, ret);
				1486	goto fail;
				1487	}
				1488
				1489	record_root_in_trans(trans, root, 0);
				1490	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
				1491	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
				1492	btrfs_check_and_init_root_item(new_root_item);
				1493
				1494	root_flags = btrfs_root_flags(new_root_item);
				1495	if (pending->readonly)
				1496	root_flags \|= BTRFS_ROOT_SUBVOL_RDONLY;
				1497	else
				1498	root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
				1499	btrfs_set_root_flags(new_root_item, root_flags);
				1500
				1501	btrfs_set_root_generation_v2(new_root_item,
				1502	trans->transid);
				1503	uuid_le_gen(&new_uuid);
				1504	memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
				1505	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
				1506	BTRFS_UUID_SIZE);
				1507	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
				1508	memset(new_root_item->received_uuid, 0,
				1509	sizeof(new_root_item->received_uuid));
				1510	memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
				1511	memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
				1512	btrfs_set_root_stransid(new_root_item, 0);
				1513	btrfs_set_root_rtransid(new_root_item, 0);
				1514	}
				1515	btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
				1516	btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
				1517	btrfs_set_root_otransid(new_root_item, trans->transid);
				1518
				1519	old = btrfs_lock_root_node(root);
				1520	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
				1521	if (ret) {
				1522	btrfs_tree_unlock(old);
				1523	free_extent_buffer(old);
				1524	btrfs_abort_transaction(trans, ret);
				1525	goto fail;
				1526	}
				1527
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1528	btrfs_set_lock_blocking_write(old);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1529
				1530	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
				1531	/* clean up in any case */
				1532	btrfs_tree_unlock(old);
				1533	free_extent_buffer(old);
				1534	if (ret) {
				1535	btrfs_abort_transaction(trans, ret);
				1536	goto fail;
				1537	}
				1538	/* see comments in should_cow_block() */
				1539	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
				1540	smp_wmb();
				1541
				1542	btrfs_set_root_node(new_root_item, tmp);
				1543	/* record when the snapshot was created in key.offset */
				1544	key.offset = trans->transid;
				1545	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
				1546	btrfs_tree_unlock(tmp);
				1547	free_extent_buffer(tmp);
				1548	if (ret) {
				1549	btrfs_abort_transaction(trans, ret);
				1550	goto fail;
				1551	}
				1552
				1553	/*
				1554	* insert root back/forward references
				1555	*/
				1556	ret = btrfs_add_root_ref(trans, objectid,
				1557	parent_root->root_key.objectid,
				1558	btrfs_ino(BTRFS_I(parent_inode)), index,
				1559	dentry->d_name.name, dentry->d_name.len);
				1560	if (ret) {
				1561	btrfs_abort_transaction(trans, ret);
				1562	goto fail;
				1563	}
				1564
				1565	key.offset = (u64)-1;
				1566	pending->snap = btrfs_read_fs_root_no_name(fs_info, &key);
				1567	if (IS_ERR(pending->snap)) {
				1568	ret = PTR_ERR(pending->snap);
				1569	btrfs_abort_transaction(trans, ret);
				1570	goto fail;
				1571	}
				1572
				1573	ret = btrfs_reloc_post_snapshot(trans, pending);
				1574	if (ret) {
				1575	btrfs_abort_transaction(trans, ret);
				1576	goto fail;
				1577	}
				1578
				1579	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1580	if (ret) {
				1581	btrfs_abort_transaction(trans, ret);
				1582	goto fail;
				1583	}
				1584
				1585	/*
				1586	* Do special qgroup accounting for snapshot, as we do some qgroup
				1587	* snapshot hack to do fast snapshot.
				1588	* To co-operate with that hack, we do hack again.
				1589	* Or snapshot will be greatly slowed down by a subtree qgroup rescan
				1590	*/
				1591	ret = qgroup_account_snapshot(trans, root, parent_root,
				1592	pending->inherit, objectid);
				1593	if (ret < 0)
				1594	goto fail;
				1595
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1596	ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
				1597	dentry->d_name.len, BTRFS_I(parent_inode),
				1598	&key, BTRFS_FT_DIR, index);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1599	/* We have check then name at the beginning, so it is impossible. */
				1600	BUG_ON(ret == -EEXIST \|\| ret == -EOVERFLOW);
				1601	if (ret) {
				1602	btrfs_abort_transaction(trans, ret);
				1603	goto fail;
				1604	}
				1605
				1606	btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
				1607	dentry->d_name.len * 2);
				1608	parent_inode->i_mtime = parent_inode->i_ctime =
				1609	current_time(parent_inode);
				1610	ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
				1611	if (ret) {
				1612	btrfs_abort_transaction(trans, ret);
				1613	goto fail;
				1614	}
				1615	ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL,
				1616	objectid);
				1617	if (ret) {
				1618	btrfs_abort_transaction(trans, ret);
				1619	goto fail;
				1620	}
				1621	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
				1622	ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
				1623	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				1624	objectid);
				1625	if (ret && ret != -EEXIST) {
				1626	btrfs_abort_transaction(trans, ret);
				1627	goto fail;
				1628	}
				1629	}
				1630
				1631	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1632	if (ret) {
				1633	btrfs_abort_transaction(trans, ret);
				1634	goto fail;
				1635	}
				1636
				1637	fail:
				1638	pending->error = ret;
				1639	dir_item_existed:
				1640	trans->block_rsv = rsv;
				1641	trans->bytes_reserved = 0;
				1642	clear_skip_qgroup:
				1643	btrfs_clear_skip_qgroup(trans);
				1644	no_free_objectid:
				1645	kfree(new_root_item);
				1646	pending->root_item = NULL;
				1647	btrfs_free_path(path);
				1648	pending->path = NULL;
				1649
				1650	return ret;
				1651	}
				1652
				1653	/*
				1654	* create all the snapshots we've scheduled for creation
				1655	*/
				1656	static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
				1657	{
				1658	struct btrfs_pending_snapshot pending, next;
				1659	struct list_head *head = &trans->transaction->pending_snapshots;
				1660	int ret = 0;
				1661
				1662	list_for_each_entry_safe(pending, next, head, list) {
				1663	list_del(&pending->list);
				1664	ret = create_pending_snapshot(trans, pending);
				1665	if (ret)
				1666	break;
				1667	}
				1668	return ret;
				1669	}
				1670
				1671	static void update_super_roots(struct btrfs_fs_info *fs_info)
				1672	{
				1673	struct btrfs_root_item *root_item;
				1674	struct btrfs_super_block *super;
				1675
				1676	super = fs_info->super_copy;
				1677
				1678	root_item = &fs_info->chunk_root->root_item;
				1679	super->chunk_root = root_item->bytenr;
				1680	super->chunk_root_generation = root_item->generation;
				1681	super->chunk_root_level = root_item->level;
				1682
				1683	root_item = &fs_info->tree_root->root_item;
				1684	super->root = root_item->bytenr;
				1685	super->generation = root_item->generation;
				1686	super->root_level = root_item->level;
				1687	if (btrfs_test_opt(fs_info, SPACE_CACHE))
				1688	super->cache_generation = root_item->generation;
				1689	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
				1690	super->uuid_tree_generation = root_item->generation;
				1691	}
				1692
				1693	int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
				1694	{
				1695	struct btrfs_transaction *trans;
				1696	int ret = 0;
				1697
				1698	spin_lock(&info->trans_lock);
				1699	trans = info->running_transaction;
				1700	if (trans)
				1701	ret = (trans->state >= TRANS_STATE_COMMIT_START);
				1702	spin_unlock(&info->trans_lock);
				1703	return ret;
				1704	}
				1705
				1706	int btrfs_transaction_blocked(struct btrfs_fs_info *info)
				1707	{
				1708	struct btrfs_transaction *trans;
				1709	int ret = 0;
				1710
				1711	spin_lock(&info->trans_lock);
				1712	trans = info->running_transaction;
				1713	if (trans)
				1714	ret = is_transaction_blocked(trans);
				1715	spin_unlock(&info->trans_lock);
				1716	return ret;
				1717	}
				1718
				1719	/*
				1720	* wait for the current transaction commit to start and block subsequent
				1721	* transaction joins
				1722	*/
				1723	static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
				1724	struct btrfs_transaction *trans)
				1725	{
				1726	wait_event(fs_info->transaction_blocked_wait,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1727	trans->state >= TRANS_STATE_COMMIT_START \|\|
				1728	TRANS_ABORTED(trans));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1729	}
				1730
				1731	/*
				1732	* wait for the current transaction to start and then become unblocked.
				1733	* caller holds ref.
				1734	*/
				1735	static void wait_current_trans_commit_start_and_unblock(
				1736	struct btrfs_fs_info *fs_info,
				1737	struct btrfs_transaction *trans)
				1738	{
				1739	wait_event(fs_info->transaction_wait,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1740	trans->state >= TRANS_STATE_UNBLOCKED \|\|
				1741	TRANS_ABORTED(trans));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1742	}
				1743
				1744	/*
				1745	* commit transactions asynchronously. once btrfs_commit_transaction_async
				1746	* returns, any subsequent transaction will not be allowed to join.
				1747	*/
				1748	struct btrfs_async_commit {
				1749	struct btrfs_trans_handle *newtrans;
				1750	struct work_struct work;
				1751	};
				1752
				1753	static void do_async_commit(struct work_struct *work)
				1754	{
				1755	struct btrfs_async_commit *ac =
				1756	container_of(work, struct btrfs_async_commit, work);
				1757
				1758	/*
				1759	* We've got freeze protection passed with the transaction.
				1760	* Tell lockdep about it.
				1761	*/
				1762	if (ac->newtrans->type & __TRANS_FREEZABLE)
				1763	__sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
				1764
				1765	current->journal_info = ac->newtrans;
				1766
				1767	btrfs_commit_transaction(ac->newtrans);
				1768	kfree(ac);
				1769	}
				1770
				1771	int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				1772	int wait_for_unblock)
				1773	{
				1774	struct btrfs_fs_info *fs_info = trans->fs_info;
				1775	struct btrfs_async_commit *ac;
				1776	struct btrfs_transaction *cur_trans;
				1777
				1778	ac = kmalloc(sizeof(*ac), GFP_NOFS);
				1779	if (!ac)
				1780	return -ENOMEM;
				1781
				1782	INIT_WORK(&ac->work, do_async_commit);
				1783	ac->newtrans = btrfs_join_transaction(trans->root);
				1784	if (IS_ERR(ac->newtrans)) {
				1785	int err = PTR_ERR(ac->newtrans);
				1786	kfree(ac);
				1787	return err;
				1788	}
				1789
				1790	/* take transaction reference */
				1791	cur_trans = trans->transaction;
				1792	refcount_inc(&cur_trans->use_count);
				1793
				1794	btrfs_end_transaction(trans);
				1795
				1796	/*
				1797	* Tell lockdep we've released the freeze rwsem, since the
				1798	* async commit thread will be the one to unlock it.
				1799	*/
				1800	if (ac->newtrans->type & __TRANS_FREEZABLE)
				1801	__sb_writers_release(fs_info->sb, SB_FREEZE_FS);
				1802
				1803	schedule_work(&ac->work);
				1804
				1805	/* wait for transaction to start and unblock */
				1806	if (wait_for_unblock)
				1807	wait_current_trans_commit_start_and_unblock(fs_info, cur_trans);
				1808	else
				1809	wait_current_trans_commit_start(fs_info, cur_trans);
				1810
				1811	if (current->journal_info == trans)
				1812	current->journal_info = NULL;
				1813
				1814	btrfs_put_transaction(cur_trans);
				1815	return 0;
				1816	}
				1817
				1818
				1819	static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
				1820	{
				1821	struct btrfs_fs_info *fs_info = trans->fs_info;
				1822	struct btrfs_transaction *cur_trans = trans->transaction;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1823
				1824	WARN_ON(refcount_read(&trans->use_count) > 1);
				1825
				1826	btrfs_abort_transaction(trans, err);
				1827
				1828	spin_lock(&fs_info->trans_lock);
				1829
				1830	/*
				1831	* If the transaction is removed from the list, it means this
				1832	* transaction has been committed successfully, so it is impossible
				1833	* to call the cleanup function.
				1834	*/
				1835	BUG_ON(list_empty(&cur_trans->list));
				1836
				1837	list_del_init(&cur_trans->list);
				1838	if (cur_trans == fs_info->running_transaction) {
				1839	cur_trans->state = TRANS_STATE_COMMIT_DOING;
				1840	spin_unlock(&fs_info->trans_lock);
				1841	wait_event(cur_trans->writer_wait,
				1842	atomic_read(&cur_trans->num_writers) == 1);
				1843
				1844	spin_lock(&fs_info->trans_lock);
				1845	}
				1846	spin_unlock(&fs_info->trans_lock);
				1847
				1848	btrfs_cleanup_one_transaction(trans->transaction, fs_info);
				1849
				1850	spin_lock(&fs_info->trans_lock);
				1851	if (cur_trans == fs_info->running_transaction)
				1852	fs_info->running_transaction = NULL;
				1853	spin_unlock(&fs_info->trans_lock);
				1854
				1855	if (trans->type & __TRANS_FREEZABLE)
				1856	sb_end_intwrite(fs_info->sb);
				1857	btrfs_put_transaction(cur_trans);
				1858	btrfs_put_transaction(cur_trans);
				1859
				1860	trace_btrfs_transaction_commit(trans->root);
				1861
				1862	if (current->journal_info == trans)
				1863	current->journal_info = NULL;
				1864	btrfs_scrub_cancel(fs_info);
				1865
				1866	kmem_cache_free(btrfs_trans_handle_cachep, trans);
				1867	}
				1868
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1869	/*
				1870	* Release reserved delayed ref space of all pending block groups of the
				1871	* transaction and remove them from the list
				1872	*/
				1873	static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1874	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1875	struct btrfs_fs_info *fs_info = trans->fs_info;
				1876	struct btrfs_block_group_cache block_group, tmp;
				1877
				1878	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
				1879	btrfs_delayed_refs_rsv_release(fs_info, 1);
				1880	list_del_init(&block_group->bg_list);
				1881	}
				1882	}
				1883
				1884	static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
				1885	{
				1886	struct btrfs_fs_info *fs_info = trans->fs_info;
				1887
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1888	/*
				1889	* We use writeback_inodes_sb here because if we used
				1890	* btrfs_start_delalloc_roots we would deadlock with fs freeze.
				1891	* Currently are holding the fs freeze lock, if we do an async flush
				1892	* we'll do btrfs_join_transaction() and deadlock because we need to
				1893	* wait for the fs freeze lock. Using the direct flushing we benefit
				1894	* from already being in a transaction and our join_transaction doesn't
				1895	* have to re-take the fs freeze lock.
				1896	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1897	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1898	writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1899	} else {
				1900	struct btrfs_pending_snapshot *pending;
				1901	struct list_head *head = &trans->transaction->pending_snapshots;
				1902
				1903	/*
				1904	* Flush dellaloc for any root that is going to be snapshotted.
				1905	* This is done to avoid a corrupted version of files, in the
				1906	* snapshots, that had both buffered and direct IO writes (even
				1907	* if they were done sequentially) due to an unordered update of
				1908	* the inode's size on disk.
				1909	*/
				1910	list_for_each_entry(pending, head, list) {
				1911	int ret;
				1912
				1913	ret = btrfs_start_delalloc_snapshot(pending->root);
				1914	if (ret)
				1915	return ret;
				1916	}
				1917	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1918	return 0;
				1919	}
				1920
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1921	static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1922	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1923	struct btrfs_fs_info *fs_info = trans->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1924
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1925	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
				1926	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
				1927	} else {
				1928	struct btrfs_pending_snapshot *pending;
				1929	struct list_head *head = &trans->transaction->pending_snapshots;
				1930
				1931	/*
				1932	* Wait for any dellaloc that we started previously for the roots
				1933	* that are going to be snapshotted. This is to avoid a corrupted
				1934	* version of files in the snapshots that had both buffered and
				1935	* direct IO writes (even if they were done sequentially).
				1936	*/
				1937	list_for_each_entry(pending, head, list)
				1938	btrfs_wait_ordered_extents(pending->root,
				1939	U64_MAX, 0, U64_MAX);
				1940	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1941	}
				1942
				1943	int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
				1944	{
				1945	struct btrfs_fs_info *fs_info = trans->fs_info;
				1946	struct btrfs_transaction *cur_trans = trans->transaction;
				1947	struct btrfs_transaction *prev_trans = NULL;
				1948	int ret;
				1949
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1950	/*
				1951	* Some places just start a transaction to commit it. We need to make
				1952	* sure that if this commit fails that the abort code actually marks the
				1953	* transaction as failed, so set trans->dirty to make the abort code do
				1954	* the right thing.
				1955	*/
				1956	trans->dirty = true;
				1957
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1958	/* Stop the commit early if ->aborted is set */
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1959	if (TRANS_ABORTED(cur_trans)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1960	ret = cur_trans->aborted;
				1961	btrfs_end_transaction(trans);
				1962	return ret;
				1963	}
				1964
				1965	btrfs_trans_release_metadata(trans);
				1966	trans->block_rsv = NULL;
				1967
				1968	/* make a pass through all the delayed refs we have so far
				1969	* any runnings procs may add more while we are here
				1970	*/
				1971	ret = btrfs_run_delayed_refs(trans, 0);
				1972	if (ret) {
				1973	btrfs_end_transaction(trans);
				1974	return ret;
				1975	}
				1976
				1977	cur_trans = trans->transaction;
				1978
				1979	/*
				1980	* set the flushing flag so procs in this transaction have to
				1981	* start sending their work down.
				1982	*/
				1983	cur_trans->delayed_refs.flushing = 1;
				1984	smp_wmb();
				1985
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1986	btrfs_create_pending_block_groups(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1987
				1988	ret = btrfs_run_delayed_refs(trans, 0);
				1989	if (ret) {
				1990	btrfs_end_transaction(trans);
				1991	return ret;
				1992	}
				1993
				1994	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
				1995	int run_it = 0;
				1996
				1997	/* this mutex is also taken before trying to set
				1998	* block groups readonly. We need to make sure
				1999	* that nobody has set a block group readonly
				2000	* after a extents from that block group have been
				2001	* allocated for cache files. btrfs_set_block_group_ro
				2002	* will wait for the transaction to commit if it
				2003	* finds BTRFS_TRANS_DIRTY_BG_RUN set.
				2004	*
				2005	* The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
				2006	* only one process starts all the block group IO. It wouldn't
				2007	* hurt to have more than one go through, but there's no
				2008	* real advantage to it either.
				2009	*/
				2010	mutex_lock(&fs_info->ro_block_group_mutex);
				2011	if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
				2012	&cur_trans->flags))
				2013	run_it = 1;
				2014	mutex_unlock(&fs_info->ro_block_group_mutex);
				2015
				2016	if (run_it) {
				2017	ret = btrfs_start_dirty_block_groups(trans);
				2018	if (ret) {
				2019	btrfs_end_transaction(trans);
				2020	return ret;
				2021	}
				2022	}
				2023	}
				2024
				2025	spin_lock(&fs_info->trans_lock);
				2026	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
				2027	spin_unlock(&fs_info->trans_lock);
				2028	refcount_inc(&cur_trans->use_count);
				2029	ret = btrfs_end_transaction(trans);
				2030
				2031	wait_for_commit(cur_trans);
				2032
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2033	if (TRANS_ABORTED(cur_trans))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2034	ret = cur_trans->aborted;
				2035
				2036	btrfs_put_transaction(cur_trans);
				2037
				2038	return ret;
				2039	}
				2040
				2041	cur_trans->state = TRANS_STATE_COMMIT_START;
				2042	wake_up(&fs_info->transaction_blocked_wait);
				2043
				2044	if (cur_trans->list.prev != &fs_info->trans_list) {
				2045	prev_trans = list_entry(cur_trans->list.prev,
				2046	struct btrfs_transaction, list);
				2047	if (prev_trans->state != TRANS_STATE_COMPLETED) {
				2048	refcount_inc(&prev_trans->use_count);
				2049	spin_unlock(&fs_info->trans_lock);
				2050
				2051	wait_for_commit(prev_trans);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2052	ret = READ_ONCE(prev_trans->aborted);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2053
				2054	btrfs_put_transaction(prev_trans);
				2055	if (ret)
				2056	goto cleanup_transaction;
				2057	} else {
				2058	spin_unlock(&fs_info->trans_lock);
				2059	}
				2060	} else {
				2061	spin_unlock(&fs_info->trans_lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2062	/*
				2063	* The previous transaction was aborted and was already removed
				2064	* from the list of transactions at fs_info->trans_list. So we
				2065	* abort to prevent writing a new superblock that reflects a
				2066	* corrupt state (pointing to trees with unwritten nodes/leafs).
				2067	*/
				2068	if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
				2069	ret = -EROFS;
				2070	goto cleanup_transaction;
				2071	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2072	}
				2073
				2074	extwriter_counter_dec(cur_trans, trans->type);
				2075
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2076	ret = btrfs_start_delalloc_flush(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2077	if (ret)
				2078	goto cleanup_transaction;
				2079
				2080	ret = btrfs_run_delayed_items(trans);
				2081	if (ret)
				2082	goto cleanup_transaction;
				2083
				2084	wait_event(cur_trans->writer_wait,
				2085	extwriter_counter_read(cur_trans) == 0);
				2086
				2087	/* some pending stuffs might be added after the previous flush. */
				2088	ret = btrfs_run_delayed_items(trans);
				2089	if (ret)
				2090	goto cleanup_transaction;
				2091
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2092	btrfs_wait_delalloc_flush(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2093
				2094	btrfs_scrub_pause(fs_info);
				2095	/*
				2096	* Ok now we need to make sure to block out any other joins while we
				2097	* commit the transaction. We could have started a join before setting
				2098	* COMMIT_DOING so make sure to wait for num_writers to == 1 again.
				2099	*/
				2100	spin_lock(&fs_info->trans_lock);
				2101	cur_trans->state = TRANS_STATE_COMMIT_DOING;
				2102	spin_unlock(&fs_info->trans_lock);
				2103	wait_event(cur_trans->writer_wait,
				2104	atomic_read(&cur_trans->num_writers) == 1);
				2105
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2106	if (TRANS_ABORTED(cur_trans)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2107	ret = cur_trans->aborted;
				2108	goto scrub_continue;
				2109	}
				2110	/*
				2111	* the reloc mutex makes sure that we stop
				2112	* the balancing code from coming in and moving
				2113	* extents around in the middle of the commit
				2114	*/
				2115	mutex_lock(&fs_info->reloc_mutex);
				2116
				2117	/*
				2118	* We needn't worry about the delayed items because we will
				2119	* deal with them in create_pending_snapshot(), which is the
				2120	* core function of the snapshot creation.
				2121	*/
				2122	ret = create_pending_snapshots(trans);
				2123	if (ret) {
				2124	mutex_unlock(&fs_info->reloc_mutex);
				2125	goto scrub_continue;
				2126	}
				2127
				2128	/*
				2129	* We insert the dir indexes of the snapshots and update the inode
				2130	* of the snapshots' parents after the snapshot creation, so there
				2131	* are some delayed items which are not dealt with. Now deal with
				2132	* them.
				2133	*
				2134	* We needn't worry that this operation will corrupt the snapshots,
				2135	* because all the tree which are snapshoted will be forced to COW
				2136	* the nodes and leaves.
				2137	*/
				2138	ret = btrfs_run_delayed_items(trans);
				2139	if (ret) {
				2140	mutex_unlock(&fs_info->reloc_mutex);
				2141	goto scrub_continue;
				2142	}
				2143
				2144	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				2145	if (ret) {
				2146	mutex_unlock(&fs_info->reloc_mutex);
				2147	goto scrub_continue;
				2148	}
				2149
				2150	/*
				2151	* make sure none of the code above managed to slip in a
				2152	* delayed item
				2153	*/
				2154	btrfs_assert_delayed_root_empty(fs_info);
				2155
				2156	WARN_ON(cur_trans != trans->transaction);
				2157
				2158	/* btrfs_commit_tree_roots is responsible for getting the
				2159	* various roots consistent with each other. Every pointer
				2160	* in the tree of tree roots has to point to the most up to date
				2161	* root for every subvolume and other tree. So, we have to keep
				2162	* the tree logging code from jumping in and changing any
				2163	* of the trees.
				2164	*
				2165	* At this point in the commit, there can't be any tree-log
				2166	* writers, but a little lower down we drop the trans mutex
				2167	* and let new people in. By holding the tree_log_mutex
				2168	* from now until after the super is written, we avoid races
				2169	* with the tree-log code.
				2170	*/
				2171	mutex_lock(&fs_info->tree_log_mutex);
				2172
				2173	ret = commit_fs_roots(trans);
				2174	if (ret) {
				2175	mutex_unlock(&fs_info->tree_log_mutex);
				2176	mutex_unlock(&fs_info->reloc_mutex);
				2177	goto scrub_continue;
				2178	}
				2179
				2180	/*
				2181	* Since the transaction is done, we can apply the pending changes
				2182	* before the next transaction.
				2183	*/
				2184	btrfs_apply_pending_changes(fs_info);
				2185
				2186	/* commit_fs_roots gets rid of all the tree log roots, it is now
				2187	* safe to free the root of tree log roots
				2188	*/
				2189	btrfs_free_log_root_tree(trans, fs_info);
				2190
				2191	/*
				2192	* commit_fs_roots() can call btrfs_save_ino_cache(), which generates
				2193	* new delayed refs. Must handle them or qgroup can be wrong.
				2194	*/
				2195	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				2196	if (ret) {
				2197	mutex_unlock(&fs_info->tree_log_mutex);
				2198	mutex_unlock(&fs_info->reloc_mutex);
				2199	goto scrub_continue;
				2200	}
				2201
				2202	/*
				2203	* Since fs roots are all committed, we can get a quite accurate
				2204	* new_roots. So let's do quota accounting.
				2205	*/
				2206	ret = btrfs_qgroup_account_extents(trans);
				2207	if (ret < 0) {
				2208	mutex_unlock(&fs_info->tree_log_mutex);
				2209	mutex_unlock(&fs_info->reloc_mutex);
				2210	goto scrub_continue;
				2211	}
				2212
				2213	ret = commit_cowonly_roots(trans);
				2214	if (ret) {
				2215	mutex_unlock(&fs_info->tree_log_mutex);
				2216	mutex_unlock(&fs_info->reloc_mutex);
				2217	goto scrub_continue;
				2218	}
				2219
				2220	/*
				2221	* The tasks which save the space cache and inode cache may also
				2222	* update ->aborted, check it.
				2223	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2224	if (TRANS_ABORTED(cur_trans)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2225	ret = cur_trans->aborted;
				2226	mutex_unlock(&fs_info->tree_log_mutex);
				2227	mutex_unlock(&fs_info->reloc_mutex);
				2228	goto scrub_continue;
				2229	}
				2230
				2231	btrfs_prepare_extent_commit(fs_info);
				2232
				2233	cur_trans = fs_info->running_transaction;
				2234
				2235	btrfs_set_root_node(&fs_info->tree_root->root_item,
				2236	fs_info->tree_root->node);
				2237	list_add_tail(&fs_info->tree_root->dirty_list,
				2238	&cur_trans->switch_commits);
				2239
				2240	btrfs_set_root_node(&fs_info->chunk_root->root_item,
				2241	fs_info->chunk_root->node);
				2242	list_add_tail(&fs_info->chunk_root->dirty_list,
				2243	&cur_trans->switch_commits);
				2244
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2245	switch_commit_roots(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2246
				2247	ASSERT(list_empty(&cur_trans->dirty_bgs));
				2248	ASSERT(list_empty(&cur_trans->io_bgs));
				2249	update_super_roots(fs_info);
				2250
				2251	btrfs_set_super_log_root(fs_info->super_copy, 0);
				2252	btrfs_set_super_log_root_level(fs_info->super_copy, 0);
				2253	memcpy(fs_info->super_for_commit, fs_info->super_copy,
				2254	sizeof(*fs_info->super_copy));
				2255
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2256	btrfs_commit_device_sizes(cur_trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2257
				2258	clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
				2259	clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
				2260
				2261	btrfs_trans_release_chunk_metadata(trans);
				2262
				2263	spin_lock(&fs_info->trans_lock);
				2264	cur_trans->state = TRANS_STATE_UNBLOCKED;
				2265	fs_info->running_transaction = NULL;
				2266	spin_unlock(&fs_info->trans_lock);
				2267	mutex_unlock(&fs_info->reloc_mutex);
				2268
				2269	wake_up(&fs_info->transaction_wait);
				2270
				2271	ret = btrfs_write_and_wait_transaction(trans);
				2272	if (ret) {
				2273	btrfs_handle_fs_error(fs_info, ret,
				2274	"Error while writing out transaction");
				2275	mutex_unlock(&fs_info->tree_log_mutex);
				2276	goto scrub_continue;
				2277	}
				2278
				2279	ret = write_all_supers(fs_info, 0);
				2280	/*
				2281	* the super is written, we can safely allow the tree-loggers
				2282	* to go about their business
				2283	*/
				2284	mutex_unlock(&fs_info->tree_log_mutex);
				2285	if (ret)
				2286	goto scrub_continue;
				2287
				2288	btrfs_finish_extent_commit(trans);
				2289
				2290	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
				2291	btrfs_clear_space_info_full(fs_info);
				2292
				2293	fs_info->last_trans_committed = cur_trans->transid;
				2294	/*
				2295	* We needn't acquire the lock here because there is no other task
				2296	* which can change it.
				2297	*/
				2298	cur_trans->state = TRANS_STATE_COMPLETED;
				2299	wake_up(&cur_trans->commit_wait);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2300
				2301	spin_lock(&fs_info->trans_lock);
				2302	list_del_init(&cur_trans->list);
				2303	spin_unlock(&fs_info->trans_lock);
				2304
				2305	btrfs_put_transaction(cur_trans);
				2306	btrfs_put_transaction(cur_trans);
				2307
				2308	if (trans->type & __TRANS_FREEZABLE)
				2309	sb_end_intwrite(fs_info->sb);
				2310
				2311	trace_btrfs_transaction_commit(trans->root);
				2312
				2313	btrfs_scrub_continue(fs_info);
				2314
				2315	if (current->journal_info == trans)
				2316	current->journal_info = NULL;
				2317
				2318	kmem_cache_free(btrfs_trans_handle_cachep, trans);
				2319
				2320	return ret;
				2321
				2322	scrub_continue:
				2323	btrfs_scrub_continue(fs_info);
				2324	cleanup_transaction:
				2325	btrfs_trans_release_metadata(trans);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2326	btrfs_cleanup_pending_block_groups(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2327	btrfs_trans_release_chunk_metadata(trans);
				2328	trans->block_rsv = NULL;
				2329	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
				2330	if (current->journal_info == trans)
				2331	current->journal_info = NULL;
				2332	cleanup_transaction(trans, ret);
				2333
				2334	return ret;
				2335	}
				2336
				2337	/*
				2338	* return < 0 if error
				2339	* 0 if there are no more dead_roots at the time of call
				2340	* 1 there are more to be processed, call me again
				2341	*
				2342	* The return value indicates there are certainly more snapshots to delete, but
				2343	* if there comes a new one during processing, it may return 0. We don't mind,
				2344	* because btrfs_commit_super will poke cleaner thread and it will process it a
				2345	* few seconds later.
				2346	*/
				2347	int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
				2348	{
				2349	int ret;
				2350	struct btrfs_fs_info *fs_info = root->fs_info;
				2351
				2352	spin_lock(&fs_info->trans_lock);
				2353	if (list_empty(&fs_info->dead_roots)) {
				2354	spin_unlock(&fs_info->trans_lock);
				2355	return 0;
				2356	}
				2357	root = list_first_entry(&fs_info->dead_roots,
				2358	struct btrfs_root, root_list);
				2359	list_del_init(&root->root_list);
				2360	spin_unlock(&fs_info->trans_lock);
				2361
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2362	btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2363
				2364	btrfs_kill_all_delayed_nodes(root);
				2365
				2366	if (btrfs_header_backref_rev(root->node) <
				2367	BTRFS_MIXED_BACKREF_REV)
				2368	ret = btrfs_drop_snapshot(root, NULL, 0, 0);
				2369	else
				2370	ret = btrfs_drop_snapshot(root, NULL, 1, 0);
				2371
				2372	return (ret < 0) ? 0 : 1;
				2373	}
				2374
				2375	void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
				2376	{
				2377	unsigned long prev;
				2378	unsigned long bit;
				2379
				2380	prev = xchg(&fs_info->pending_changes, 0);
				2381	if (!prev)
				2382	return;
				2383
				2384	bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
				2385	if (prev & bit)
				2386	btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
				2387	prev &= ~bit;
				2388
				2389	bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
				2390	if (prev & bit)
				2391	btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
				2392	prev &= ~bit;
				2393
				2394	bit = 1 << BTRFS_PENDING_COMMIT;
				2395	if (prev & bit)
				2396	btrfs_debug(fs_info, "pending commit done");
				2397	prev &= ~bit;
				2398
				2399	if (prev)
				2400	btrfs_warn(fs_info,
				2401	"unknown pending changes left 0x%lx, ignoring", prev);
				2402	}