Blame - fs/btrfs/tree-log.c - hafnium/third_party/linux

blob: 8ea4b3da85d1aa48a7acb1d83a7731f1273e798f [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2008 Oracle. All rights reserved.
				4	*/
				5
				6	#include <linux/sched.h>
				7	#include <linux/slab.h>
				8	#include <linux/blkdev.h>
				9	#include <linux/list_sort.h>
				10	#include <linux/iversion.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	11	#include "misc.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	12	#include "ctree.h"
				13	#include "tree-log.h"
				14	#include "disk-io.h"
				15	#include "locking.h"
				16	#include "print-tree.h"
				17	#include "backref.h"
				18	#include "compression.h"
				19	#include "qgroup.h"
				20	#include "inode-map.h"
				21
				22	/* magic values for the inode_only field in btrfs_log_inode:
				23	*
				24	* LOG_INODE_ALL means to log everything
				25	* LOG_INODE_EXISTS means to log just enough to recreate the inode
				26	* during log replay
				27	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	28	enum {
				29	LOG_INODE_ALL,
				30	LOG_INODE_EXISTS,
				31	LOG_OTHER_INODE,
				32	LOG_OTHER_INODE_ALL,
				33	};
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	34
				35	/*
				36	* directory trouble cases
				37	*
				38	* 1) on rename or unlink, if the inode being unlinked isn't in the fsync
				39	* log, we must force a full commit before doing an fsync of the directory
				40	* where the unlink was done.
				41	* ---> record transid of last unlink/rename per directory
				42	*
				43	* mkdir foo/some_dir
				44	* normal commit
				45	* rename foo/some_dir foo2/some_dir
				46	* mkdir foo/some_dir
				47	* fsync foo/some_dir/some_file
				48	*
				49	* The fsync above will unlink the original some_dir without recording
				50	* it in its new location (foo2). After a crash, some_dir will be gone
				51	* unless the fsync of some_file forces a full commit
				52	*
				53	* 2) we must log any new names for any file or dir that is in the fsync
				54	* log. ---> check inode while renaming/linking.
				55	*
				56	* 2a) we must log any new names for any file or dir during rename
				57	* when the directory they are being removed from was logged.
				58	* ---> check inode and old parent dir during rename
				59	*
				60	* 2a is actually the more important variant. Without the extra logging
				61	* a crash might unlink the old name without recreating the new one
				62	*
				63	* 3) after a crash, we must go through any directories with a link count
				64	* of zero and redo the rm -rf
				65	*
				66	* mkdir f1/foo
				67	* normal commit
				68	* rm -rf f1/foo
				69	* fsync(f1)
				70	*
				71	* The directory f1/foo was fully removed from the FS, but fsync was never
				72	* called on f1/foo, only its parent dir (f1). After a crash the rm -rf must
				73	* be replayed. This must be able to recurse down the entire
				74	* directory tree. The inode link count fixup code takes care of the
				75	* ugly details.
				76	*/
				77
				78	/*
				79	* stages for the tree walking. The first
				80	* stage (0) is to only pin down the blocks we find
				81	* the second stage (1) is to make sure that all the inodes
				82	* we find in the log are created in the subvolume.
				83	*
				84	* The last stage is to deal with directories and links and extents
				85	* and all the other fun semantics
				86	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	87	enum {
				88	LOG_WALK_PIN_ONLY,
				89	LOG_WALK_REPLAY_INODES,
				90	LOG_WALK_REPLAY_DIR_INDEX,
				91	LOG_WALK_REPLAY_ALL,
				92	};
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	93
				94	static int btrfs_log_inode(struct btrfs_trans_handle *trans,
				95	struct btrfs_root root, struct btrfs_inode inode,
				96	int inode_only,
				97	const loff_t start,
				98	const loff_t end,
				99	struct btrfs_log_ctx *ctx);
				100	static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				101	struct btrfs_root *root,
				102	struct btrfs_path *path, u64 objectid);
				103	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				104	struct btrfs_root *root,
				105	struct btrfs_root *log,
				106	struct btrfs_path *path,
				107	u64 dirid, int del_all);
				108
				109	/*
				110	* tree logging is a special write ahead log used to make sure that
				111	* fsyncs and O_SYNCs can happen without doing full tree commits.
				112	*
				113	* Full tree commits are expensive because they require commonly
				114	* modified blocks to be recowed, creating many dirty pages in the
				115	* extent tree and 4x-6x higher write load than ext3.
				116	*
				117	* Instead of doing a tree commit on every fsync, we use the
				118	* key ranges and transaction ids to find items for a given file or directory
				119	* that have changed in this transaction. Those items are copied into
				120	* a special tree (one per subvolume root), that tree is written to disk
				121	* and then the fsync is considered complete.
				122	*
				123	* After a crash, items are copied out of the log-tree back into the
				124	* subvolume tree. Any file data extents found are recorded in the extent
				125	* allocation tree, and the log-tree freed.
				126	*
				127	* The log tree is read three times: once to pin down all the extents it is
				128	* using in ram, once to create all the inodes logged in the tree,
				129	* and once to do all the other items.
				130	*/
				131
				132	/*
				133	* start a sub transaction and setup the log tree
				134	* this increments the log tree writer count to make the people
				135	* syncing the tree wait for us to finish
				136	*/
				137	static int start_log_trans(struct btrfs_trans_handle *trans,
				138	struct btrfs_root *root,
				139	struct btrfs_log_ctx *ctx)
				140	{
				141	struct btrfs_fs_info *fs_info = root->fs_info;
				142	int ret = 0;
				143
				144	mutex_lock(&root->log_mutex);
				145
				146	if (root->log_root) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	147	if (btrfs_need_log_full_commit(trans)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	148	ret = -EAGAIN;
				149	goto out;
				150	}
				151
				152	if (!root->log_start_pid) {
				153	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
				154	root->log_start_pid = current->pid;
				155	} else if (root->log_start_pid != current->pid) {
				156	set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
				157	}
				158	} else {
				159	mutex_lock(&fs_info->tree_log_mutex);
				160	if (!fs_info->log_root_tree)
				161	ret = btrfs_init_log_root_tree(trans, fs_info);
				162	mutex_unlock(&fs_info->tree_log_mutex);
				163	if (ret)
				164	goto out;
				165
				166	ret = btrfs_add_log_tree(trans, root);
				167	if (ret)
				168	goto out;
				169
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	170	set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	171	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
				172	root->log_start_pid = current->pid;
				173	}
				174
				175	atomic_inc(&root->log_batch);
				176	atomic_inc(&root->log_writers);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	177	if (ctx && !ctx->logging_new_name) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	178	int index = root->log_transid % 2;
				179	list_add_tail(&ctx->list, &root->log_ctxs[index]);
				180	ctx->log_transid = root->log_transid;
				181	}
				182
				183	out:
				184	mutex_unlock(&root->log_mutex);
				185	return ret;
				186	}
				187
				188	/*
				189	* returns 0 if there was a log transaction running and we were able
				190	* to join, or returns -ENOENT if there were not transactions
				191	* in progress
				192	*/
				193	static int join_running_log_trans(struct btrfs_root *root)
				194	{
				195	int ret = -ENOENT;
				196
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	197	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
				198	return ret;
				199
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	200	mutex_lock(&root->log_mutex);
				201	if (root->log_root) {
				202	ret = 0;
				203	atomic_inc(&root->log_writers);
				204	}
				205	mutex_unlock(&root->log_mutex);
				206	return ret;
				207	}
				208
				209	/*
				210	* This either makes the current running log transaction wait
				211	* until you call btrfs_end_log_trans() or it makes any future
				212	* log transactions wait until you call btrfs_end_log_trans()
				213	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	214	void btrfs_pin_log_trans(struct btrfs_root *root)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	215	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	216	mutex_lock(&root->log_mutex);
				217	atomic_inc(&root->log_writers);
				218	mutex_unlock(&root->log_mutex);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	219	}
				220
				221	/*
				222	* indicate we're done making changes to the log tree
				223	* and wake up anyone waiting to do a sync
				224	*/
				225	void btrfs_end_log_trans(struct btrfs_root *root)
				226	{
				227	if (atomic_dec_and_test(&root->log_writers)) {
				228	/* atomic_dec_and_test implies a barrier */
				229	cond_wake_up_nomb(&root->log_writer_wait);
				230	}
				231	}
				232
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	233	static int btrfs_write_tree_block(struct extent_buffer *buf)
				234	{
				235	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
				236	buf->start + buf->len - 1);
				237	}
				238
				239	static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
				240	{
				241	filemap_fdatawait_range(buf->pages[0]->mapping,
				242	buf->start, buf->start + buf->len - 1);
				243	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	244
				245	/*
				246	* the walk control struct is used to pass state down the chain when
				247	* processing the log tree. The stage field tells us which part
				248	* of the log tree processing we are currently doing. The others
				249	* are state fields used for that specific part
				250	*/
				251	struct walk_control {
				252	/* should we free the extent on disk when done? This is used
				253	* at transaction commit time while freeing a log tree
				254	*/
				255	int free;
				256
				257	/* should we write out the extent buffer? This is used
				258	* while flushing the log tree to disk during a sync
				259	*/
				260	int write;
				261
				262	/* should we wait for the extent buffer io to finish? Also used
				263	* while flushing the log tree to disk for a sync
				264	*/
				265	int wait;
				266
				267	/* pin only walk, we record which extents on disk belong to the
				268	* log trees
				269	*/
				270	int pin;
				271
				272	/* what stage of the replay code we're currently in */
				273	int stage;
				274
				275	/*
				276	* Ignore any items from the inode currently being processed. Needs
				277	* to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
				278	* the LOG_WALK_REPLAY_INODES stage.
				279	*/
				280	bool ignore_cur_inode;
				281
				282	/* the root we are currently replaying */
				283	struct btrfs_root *replay_dest;
				284
				285	/* the trans handle for the current replay */
				286	struct btrfs_trans_handle *trans;
				287
				288	/* the function that gets used to process blocks we find in the
				289	* tree. Note the extent_buffer might not be up to date when it is
				290	* passed in, and it must be checked or read if you need the data
				291	* inside it
				292	*/
				293	int (process_func)(struct btrfs_root log, struct extent_buffer *eb,
				294	struct walk_control *wc, u64 gen, int level);
				295	};
				296
				297	/*
				298	* process_func used to pin down extents, write them or wait on them
				299	*/
				300	static int process_one_buffer(struct btrfs_root *log,
				301	struct extent_buffer *eb,
				302	struct walk_control *wc, u64 gen, int level)
				303	{
				304	struct btrfs_fs_info *fs_info = log->fs_info;
				305	int ret = 0;
				306
				307	/*
				308	* If this fs is mixed then we need to be able to process the leaves to
				309	* pin down any logged extents, so we have to read the block.
				310	*/
				311	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
				312	ret = btrfs_read_buffer(eb, gen, level, NULL);
				313	if (ret)
				314	return ret;
				315	}
				316
				317	if (wc->pin)
				318	ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
				319	eb->len);
				320
				321	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
				322	if (wc->pin && btrfs_header_level(eb) == 0)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	323	ret = btrfs_exclude_logged_extents(eb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	324	if (wc->write)
				325	btrfs_write_tree_block(eb);
				326	if (wc->wait)
				327	btrfs_wait_tree_block_writeback(eb);
				328	}
				329	return ret;
				330	}
				331
				332	/*
				333	* Item overwrite used by replay and tree logging. eb, slot and key all refer
				334	* to the src data we are copying out.
				335	*
				336	* root is the tree we are copying into, and path is a scratch
				337	* path for use in this function (it should be released on entry and
				338	* will be released on exit).
				339	*
				340	* If the key is already in the destination tree the existing item is
				341	* overwritten. If the existing item isn't big enough, it is extended.
				342	* If it is too large, it is truncated.
				343	*
				344	* If the key isn't in the destination yet, a new item is inserted.
				345	*/
				346	static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				347	struct btrfs_root *root,
				348	struct btrfs_path *path,
				349	struct extent_buffer *eb, int slot,
				350	struct btrfs_key *key)
				351	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	352	int ret;
				353	u32 item_size;
				354	u64 saved_i_size = 0;
				355	int save_old_i_size = 0;
				356	unsigned long src_ptr;
				357	unsigned long dst_ptr;
				358	int overwrite_root = 0;
				359	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
				360
				361	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				362	overwrite_root = 1;
				363
				364	item_size = btrfs_item_size_nr(eb, slot);
				365	src_ptr = btrfs_item_ptr_offset(eb, slot);
				366
				367	/* look for the key in the destination tree */
				368	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				369	if (ret < 0)
				370	return ret;
				371
				372	if (ret == 0) {
				373	char *src_copy;
				374	char *dst_copy;
				375	u32 dst_size = btrfs_item_size_nr(path->nodes[0],
				376	path->slots[0]);
				377	if (dst_size != item_size)
				378	goto insert;
				379
				380	if (item_size == 0) {
				381	btrfs_release_path(path);
				382	return 0;
				383	}
				384	dst_copy = kmalloc(item_size, GFP_NOFS);
				385	src_copy = kmalloc(item_size, GFP_NOFS);
				386	if (!dst_copy \|\| !src_copy) {
				387	btrfs_release_path(path);
				388	kfree(dst_copy);
				389	kfree(src_copy);
				390	return -ENOMEM;
				391	}
				392
				393	read_extent_buffer(eb, src_copy, src_ptr, item_size);
				394
				395	dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				396	read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				397	item_size);
				398	ret = memcmp(dst_copy, src_copy, item_size);
				399
				400	kfree(dst_copy);
				401	kfree(src_copy);
				402	/*
				403	* they have the same contents, just return, this saves
				404	* us from cowing blocks in the destination tree and doing
				405	* extra writes that may not have been done by a previous
				406	* sync
				407	*/
				408	if (ret == 0) {
				409	btrfs_release_path(path);
				410	return 0;
				411	}
				412
				413	/*
				414	* We need to load the old nbytes into the inode so when we
				415	* replay the extents we've logged we get the right nbytes.
				416	*/
				417	if (inode_item) {
				418	struct btrfs_inode_item *item;
				419	u64 nbytes;
				420	u32 mode;
				421
				422	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				423	struct btrfs_inode_item);
				424	nbytes = btrfs_inode_nbytes(path->nodes[0], item);
				425	item = btrfs_item_ptr(eb, slot,
				426	struct btrfs_inode_item);
				427	btrfs_set_inode_nbytes(eb, item, nbytes);
				428
				429	/*
				430	* If this is a directory we need to reset the i_size to
				431	* 0 so that we can set it up properly when replaying
				432	* the rest of the items in this log.
				433	*/
				434	mode = btrfs_inode_mode(eb, item);
				435	if (S_ISDIR(mode))
				436	btrfs_set_inode_size(eb, item, 0);
				437	}
				438	} else if (inode_item) {
				439	struct btrfs_inode_item *item;
				440	u32 mode;
				441
				442	/*
				443	* New inode, set nbytes to 0 so that the nbytes comes out
				444	* properly when we replay the extents.
				445	*/
				446	item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
				447	btrfs_set_inode_nbytes(eb, item, 0);
				448
				449	/*
				450	* If this is a directory we need to reset the i_size to 0 so
				451	* that we can set it up properly when replaying the rest of
				452	* the items in this log.
				453	*/
				454	mode = btrfs_inode_mode(eb, item);
				455	if (S_ISDIR(mode))
				456	btrfs_set_inode_size(eb, item, 0);
				457	}
				458	insert:
				459	btrfs_release_path(path);
				460	/* try to insert the key into the destination tree */
				461	path->skip_release_on_error = 1;
				462	ret = btrfs_insert_empty_item(trans, root, path,
				463	key, item_size);
				464	path->skip_release_on_error = 0;
				465
				466	/* make sure any existing item is the correct size */
				467	if (ret == -EEXIST \|\| ret == -EOVERFLOW) {
				468	u32 found_size;
				469	found_size = btrfs_item_size_nr(path->nodes[0],
				470	path->slots[0]);
				471	if (found_size > item_size)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	472	btrfs_truncate_item(path, item_size, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	473	else if (found_size < item_size)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	474	btrfs_extend_item(path, item_size - found_size);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	475	} else if (ret) {
				476	return ret;
				477	}
				478	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
				479	path->slots[0]);
				480
				481	/* don't overwrite an existing inode if the generation number
				482	* was logged as zero. This is done when the tree logging code
				483	* is just logging an inode to make sure it exists after recovery.
				484	*
				485	* Also, don't overwrite i_size on directories during replay.
				486	* log replay inserts and removes directory items based on the
				487	* state of the tree found in the subvolume, and i_size is modified
				488	* as it goes
				489	*/
				490	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
				491	struct btrfs_inode_item *src_item;
				492	struct btrfs_inode_item *dst_item;
				493
				494	src_item = (struct btrfs_inode_item *)src_ptr;
				495	dst_item = (struct btrfs_inode_item *)dst_ptr;
				496
				497	if (btrfs_inode_generation(eb, src_item) == 0) {
				498	struct extent_buffer *dst_eb = path->nodes[0];
				499	const u64 ino_size = btrfs_inode_size(eb, src_item);
				500
				501	/*
				502	* For regular files an ino_size == 0 is used only when
				503	* logging that an inode exists, as part of a directory
				504	* fsync, and the inode wasn't fsynced before. In this
				505	* case don't set the size of the inode in the fs/subvol
				506	* tree, otherwise we would be throwing valid data away.
				507	*/
				508	if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
				509	S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
				510	ino_size != 0) {
				511	struct btrfs_map_token token;
				512
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	513	btrfs_init_map_token(&token, dst_eb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	514	btrfs_set_token_inode_size(dst_eb, dst_item,
				515	ino_size, &token);
				516	}
				517	goto no_copy;
				518	}
				519
				520	if (overwrite_root &&
				521	S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
				522	S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
				523	save_old_i_size = 1;
				524	saved_i_size = btrfs_inode_size(path->nodes[0],
				525	dst_item);
				526	}
				527	}
				528
				529	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
				530	src_ptr, item_size);
				531
				532	if (save_old_i_size) {
				533	struct btrfs_inode_item *dst_item;
				534	dst_item = (struct btrfs_inode_item *)dst_ptr;
				535	btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
				536	}
				537
				538	/* make sure the generation is filled in */
				539	if (key->type == BTRFS_INODE_ITEM_KEY) {
				540	struct btrfs_inode_item *dst_item;
				541	dst_item = (struct btrfs_inode_item *)dst_ptr;
				542	if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
				543	btrfs_set_inode_generation(path->nodes[0], dst_item,
				544	trans->transid);
				545	}
				546	}
				547	no_copy:
				548	btrfs_mark_buffer_dirty(path->nodes[0]);
				549	btrfs_release_path(path);
				550	return 0;
				551	}
				552
				553	/*
				554	* simple helper to read an inode off the disk from a given root
				555	* This can only be called for subvolume roots and not for the log
				556	*/
				557	static noinline struct inode read_one_inode(struct btrfs_root root,
				558	u64 objectid)
				559	{
				560	struct btrfs_key key;
				561	struct inode *inode;
				562
				563	key.objectid = objectid;
				564	key.type = BTRFS_INODE_ITEM_KEY;
				565	key.offset = 0;
				566	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
				567	if (IS_ERR(inode))
				568	inode = NULL;
				569	return inode;
				570	}
				571
				572	/* replays a single extent in 'eb' at 'slot' with 'key' into the
				573	* subvolume 'root'. path is released on entry and should be released
				574	* on exit.
				575	*
				576	* extents in the log tree have not been allocated out of the extent
				577	* tree yet. So, this completes the allocation, taking a reference
				578	* as required if the extent already exists or creating a new extent
				579	* if it isn't in the extent allocation tree yet.
				580	*
				581	* The extent is inserted into the file, dropping any existing extents
				582	* from the file that overlap the new one.
				583	*/
				584	static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				585	struct btrfs_root *root,
				586	struct btrfs_path *path,
				587	struct extent_buffer *eb, int slot,
				588	struct btrfs_key *key)
				589	{
				590	struct btrfs_fs_info *fs_info = root->fs_info;
				591	int found_type;
				592	u64 extent_end;
				593	u64 start = key->offset;
				594	u64 nbytes = 0;
				595	struct btrfs_file_extent_item *item;
				596	struct inode *inode = NULL;
				597	unsigned long size;
				598	int ret = 0;
				599
				600	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
				601	found_type = btrfs_file_extent_type(eb, item);
				602
				603	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				604	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
				605	nbytes = btrfs_file_extent_num_bytes(eb, item);
				606	extent_end = start + nbytes;
				607
				608	/*
				609	* We don't add to the inodes nbytes if we are prealloc or a
				610	* hole.
				611	*/
				612	if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
				613	nbytes = 0;
				614	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				615	size = btrfs_file_extent_ram_bytes(eb, item);
				616	nbytes = btrfs_file_extent_ram_bytes(eb, item);
				617	extent_end = ALIGN(start + size,
				618	fs_info->sectorsize);
				619	} else {
				620	ret = 0;
				621	goto out;
				622	}
				623
				624	inode = read_one_inode(root, key->objectid);
				625	if (!inode) {
				626	ret = -EIO;
				627	goto out;
				628	}
				629
				630	/*
				631	* first check to see if we already have this extent in the
				632	* file. This must be done before the btrfs_drop_extents run
				633	* so we don't try to drop this extent.
				634	*/
				635	ret = btrfs_lookup_file_extent(trans, root, path,
				636	btrfs_ino(BTRFS_I(inode)), start, 0);
				637
				638	if (ret == 0 &&
				639	(found_type == BTRFS_FILE_EXTENT_REG \|\|
				640	found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
				641	struct btrfs_file_extent_item cmp1;
				642	struct btrfs_file_extent_item cmp2;
				643	struct btrfs_file_extent_item *existing;
				644	struct extent_buffer *leaf;
				645
				646	leaf = path->nodes[0];
				647	existing = btrfs_item_ptr(leaf, path->slots[0],
				648	struct btrfs_file_extent_item);
				649
				650	read_extent_buffer(eb, &cmp1, (unsigned long)item,
				651	sizeof(cmp1));
				652	read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				653	sizeof(cmp2));
				654
				655	/*
				656	* we already have a pointer to this exact extent,
				657	* we don't have to do anything
				658	*/
				659	if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
				660	btrfs_release_path(path);
				661	goto out;
				662	}
				663	}
				664	btrfs_release_path(path);
				665
				666	/* drop any overlapping extents */
				667	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
				668	if (ret)
				669	goto out;
				670
				671	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				672	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
				673	u64 offset;
				674	unsigned long dest_offset;
				675	struct btrfs_key ins;
				676
				677	if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
				678	btrfs_fs_incompat(fs_info, NO_HOLES))
				679	goto update_inode;
				680
				681	ret = btrfs_insert_empty_item(trans, root, path, key,
				682	sizeof(*item));
				683	if (ret)
				684	goto out;
				685	dest_offset = btrfs_item_ptr_offset(path->nodes[0],
				686	path->slots[0]);
				687	copy_extent_buffer(path->nodes[0], eb, dest_offset,
				688	(unsigned long)item, sizeof(*item));
				689
				690	ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
				691	ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
				692	ins.type = BTRFS_EXTENT_ITEM_KEY;
				693	offset = key->offset - btrfs_file_extent_offset(eb, item);
				694
				695	/*
				696	* Manually record dirty extent, as here we did a shallow
				697	* file extent item copy and skip normal backref update,
				698	* but modifying extent tree all by ourselves.
				699	* So need to manually record dirty extent for qgroup,
				700	* as the owner of the file extent changed from log tree
				701	* (doesn't affect qgroup) to fs/file tree(affects qgroup)
				702	*/
				703	ret = btrfs_qgroup_trace_extent(trans,
				704	btrfs_file_extent_disk_bytenr(eb, item),
				705	btrfs_file_extent_disk_num_bytes(eb, item),
				706	GFP_NOFS);
				707	if (ret < 0)
				708	goto out;
				709
				710	if (ins.objectid > 0) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	711	struct btrfs_ref ref = { 0 };
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	712	u64 csum_start;
				713	u64 csum_end;
				714	LIST_HEAD(ordered_sums);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	715
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	716	/*
				717	* is this extent already allocated in the extent
				718	* allocation tree? If so, just add a reference
				719	*/
				720	ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
				721	ins.offset);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	722	if (ret < 0) {
				723	goto out;
				724	} else if (ret == 0) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	725	btrfs_init_generic_ref(&ref,
				726	BTRFS_ADD_DELAYED_REF,
				727	ins.objectid, ins.offset, 0);
				728	btrfs_init_data_ref(&ref,
				729	root->root_key.objectid,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	730	key->objectid, offset);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	731	ret = btrfs_inc_extent_ref(trans, &ref);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	732	if (ret)
				733	goto out;
				734	} else {
				735	/*
				736	* insert the extent pointer in the extent
				737	* allocation tree
				738	*/
				739	ret = btrfs_alloc_logged_file_extent(trans,
				740	root->root_key.objectid,
				741	key->objectid, offset, &ins);
				742	if (ret)
				743	goto out;
				744	}
				745	btrfs_release_path(path);
				746
				747	if (btrfs_file_extent_compression(eb, item)) {
				748	csum_start = ins.objectid;
				749	csum_end = csum_start + ins.offset;
				750	} else {
				751	csum_start = ins.objectid +
				752	btrfs_file_extent_offset(eb, item);
				753	csum_end = csum_start +
				754	btrfs_file_extent_num_bytes(eb, item);
				755	}
				756
				757	ret = btrfs_lookup_csums_range(root->log_root,
				758	csum_start, csum_end - 1,
				759	&ordered_sums, 0);
				760	if (ret)
				761	goto out;
				762	/*
				763	* Now delete all existing cums in the csum root that
				764	* cover our range. We do this because we can have an
				765	* extent that is completely referenced by one file
				766	* extent item and partially referenced by another
				767	* file extent item (like after using the clone or
				768	* extent_same ioctls). In this case if we end up doing
				769	* the replay of the one that partially references the
				770	* extent first, and we do not do the csum deletion
				771	* below, we can get 2 csum items in the csum tree that
				772	* overlap each other. For example, imagine our log has
				773	* the two following file extent items:
				774	*
				775	* key (257 EXTENT_DATA 409600)
				776	* extent data disk byte 12845056 nr 102400
				777	* extent data offset 20480 nr 20480 ram 102400
				778	*
				779	* key (257 EXTENT_DATA 819200)
				780	* extent data disk byte 12845056 nr 102400
				781	* extent data offset 0 nr 102400 ram 102400
				782	*
				783	* Where the second one fully references the 100K extent
				784	* that starts at disk byte 12845056, and the log tree
				785	* has a single csum item that covers the entire range
				786	* of the extent:
				787	*
				788	* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
				789	*
				790	* After the first file extent item is replayed, the
				791	* csum tree gets the following csum item:
				792	*
				793	* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
				794	*
				795	* Which covers the 20K sub-range starting at offset 20K
				796	* of our extent. Now when we replay the second file
				797	* extent item, if we do not delete existing csum items
				798	* that cover any of its blocks, we end up getting two
				799	* csum items in our csum tree that overlap each other:
				800	*
				801	* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
				802	* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
				803	*
				804	* Which is a problem, because after this anyone trying
				805	* to lookup up for the checksum of any block of our
				806	* extent starting at an offset of 40K or higher, will
				807	* end up looking at the second csum item only, which
				808	* does not contain the checksum for any block starting
				809	* at offset 40K or higher of our extent.
				810	*/
				811	while (!list_empty(&ordered_sums)) {
				812	struct btrfs_ordered_sum *sums;
				813	sums = list_entry(ordered_sums.next,
				814	struct btrfs_ordered_sum,
				815	list);
				816	if (!ret)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	817	ret = btrfs_del_csums(trans,
				818	fs_info->csum_root,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	819	sums->bytenr,
				820	sums->len);
				821	if (!ret)
				822	ret = btrfs_csum_file_blocks(trans,
				823	fs_info->csum_root, sums);
				824	list_del(&sums->list);
				825	kfree(sums);
				826	}
				827	if (ret)
				828	goto out;
				829	} else {
				830	btrfs_release_path(path);
				831	}
				832	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				833	/* inline extents are easy, we just overwrite them */
				834	ret = overwrite_item(trans, root, path, eb, slot, key);
				835	if (ret)
				836	goto out;
				837	}
				838
				839	inode_add_bytes(inode, nbytes);
				840	update_inode:
				841	ret = btrfs_update_inode(trans, root, inode);
				842	out:
				843	if (inode)
				844	iput(inode);
				845	return ret;
				846	}
				847
				848	/*
				849	* when cleaning up conflicts between the directory names in the
				850	* subvolume, directory names in the log and directory names in the
				851	* inode back references, we may have to unlink inodes from directories.
				852	*
				853	* This is a helper function to do the unlink of a specific directory
				854	* item
				855	*/
				856	static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				857	struct btrfs_root *root,
				858	struct btrfs_path *path,
				859	struct btrfs_inode *dir,
				860	struct btrfs_dir_item *di)
				861	{
				862	struct inode *inode;
				863	char *name;
				864	int name_len;
				865	struct extent_buffer *leaf;
				866	struct btrfs_key location;
				867	int ret;
				868
				869	leaf = path->nodes[0];
				870
				871	btrfs_dir_item_key_to_cpu(leaf, di, &location);
				872	name_len = btrfs_dir_name_len(leaf, di);
				873	name = kmalloc(name_len, GFP_NOFS);
				874	if (!name)
				875	return -ENOMEM;
				876
				877	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
				878	btrfs_release_path(path);
				879
				880	inode = read_one_inode(root, location.objectid);
				881	if (!inode) {
				882	ret = -EIO;
				883	goto out;
				884	}
				885
				886	ret = link_to_fixup_dir(trans, root, path, location.objectid);
				887	if (ret)
				888	goto out;
				889
				890	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				891	name_len);
				892	if (ret)
				893	goto out;
				894	else
				895	ret = btrfs_run_delayed_items(trans);
				896	out:
				897	kfree(name);
				898	iput(inode);
				899	return ret;
				900	}
				901
				902	/*
				903	* helper function to see if a given name and sequence number found
				904	* in an inode back reference are already in a directory and correctly
				905	* point to this inode
				906	*/
				907	static noinline int inode_in_dir(struct btrfs_root *root,
				908	struct btrfs_path *path,
				909	u64 dirid, u64 objectid, u64 index,
				910	const char *name, int name_len)
				911	{
				912	struct btrfs_dir_item *di;
				913	struct btrfs_key location;
				914	int match = 0;
				915
				916	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
				917	index, name, name_len, 0);
				918	if (di && !IS_ERR(di)) {
				919	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				920	if (location.objectid != objectid)
				921	goto out;
				922	} else
				923	goto out;
				924	btrfs_release_path(path);
				925
				926	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
				927	if (di && !IS_ERR(di)) {
				928	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				929	if (location.objectid != objectid)
				930	goto out;
				931	} else
				932	goto out;
				933	match = 1;
				934	out:
				935	btrfs_release_path(path);
				936	return match;
				937	}
				938
				939	/*
				940	* helper function to check a log tree for a named back reference in
				941	* an inode. This is used to decide if a back reference that is
				942	* found in the subvolume conflicts with what we find in the log.
				943	*
				944	* inode backreferences may have multiple refs in a single item,
				945	* during replay we process one reference at a time, and we don't
				946	* want to delete valid links to a file from the subvolume if that
				947	* link is also in the log.
				948	*/
				949	static noinline int backref_in_log(struct btrfs_root *log,
				950	struct btrfs_key *key,
				951	u64 ref_objectid,
				952	const char *name, int namelen)
				953	{
				954	struct btrfs_path *path;
				955	struct btrfs_inode_ref *ref;
				956	unsigned long ptr;
				957	unsigned long ptr_end;
				958	unsigned long name_ptr;
				959	int found_name_len;
				960	int item_size;
				961	int ret;
				962	int match = 0;
				963
				964	path = btrfs_alloc_path();
				965	if (!path)
				966	return -ENOMEM;
				967
				968	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
				969	if (ret != 0)
				970	goto out;
				971
				972	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				973
				974	if (key->type == BTRFS_INODE_EXTREF_KEY) {
				975	if (btrfs_find_name_in_ext_backref(path->nodes[0],
				976	path->slots[0],
				977	ref_objectid,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	978	name, namelen))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	979	match = 1;
				980
				981	goto out;
				982	}
				983
				984	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
				985	ptr_end = ptr + item_size;
				986	while (ptr < ptr_end) {
				987	ref = (struct btrfs_inode_ref *)ptr;
				988	found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
				989	if (found_name_len == namelen) {
				990	name_ptr = (unsigned long)(ref + 1);
				991	ret = memcmp_extent_buffer(path->nodes[0], name,
				992	name_ptr, namelen);
				993	if (ret == 0) {
				994	match = 1;
				995	goto out;
				996	}
				997	}
				998	ptr = (unsigned long)(ref + 1) + found_name_len;
				999	}
				1000	out:
				1001	btrfs_free_path(path);
				1002	return match;
				1003	}
				1004
				1005	static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				1006	struct btrfs_root *root,
				1007	struct btrfs_path *path,
				1008	struct btrfs_root *log_root,
				1009	struct btrfs_inode *dir,
				1010	struct btrfs_inode *inode,
				1011	u64 inode_objectid, u64 parent_objectid,
				1012	u64 ref_index, char *name, int namelen,
				1013	int *search_done)
				1014	{
				1015	int ret;
				1016	char *victim_name;
				1017	int victim_name_len;
				1018	struct extent_buffer *leaf;
				1019	struct btrfs_dir_item *di;
				1020	struct btrfs_key search_key;
				1021	struct btrfs_inode_extref *extref;
				1022
				1023	again:
				1024	/* Search old style refs */
				1025	search_key.objectid = inode_objectid;
				1026	search_key.type = BTRFS_INODE_REF_KEY;
				1027	search_key.offset = parent_objectid;
				1028	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
				1029	if (ret == 0) {
				1030	struct btrfs_inode_ref *victim_ref;
				1031	unsigned long ptr;
				1032	unsigned long ptr_end;
				1033
				1034	leaf = path->nodes[0];
				1035
				1036	/* are we trying to overwrite a back ref for the root directory
				1037	* if so, just jump out, we're done
				1038	*/
				1039	if (search_key.objectid == search_key.offset)
				1040	return 1;
				1041
				1042	/* check all the names in this back reference to see
				1043	* if they are in the log. if so, we allow them to stay
				1044	* otherwise they must be unlinked as a conflict
				1045	*/
				1046	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				1047	ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
				1048	while (ptr < ptr_end) {
				1049	victim_ref = (struct btrfs_inode_ref *)ptr;
				1050	victim_name_len = btrfs_inode_ref_name_len(leaf,
				1051	victim_ref);
				1052	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				1053	if (!victim_name)
				1054	return -ENOMEM;
				1055
				1056	read_extent_buffer(leaf, victim_name,
				1057	(unsigned long)(victim_ref + 1),
				1058	victim_name_len);
				1059
				1060	if (!backref_in_log(log_root, &search_key,
				1061	parent_objectid,
				1062	victim_name,
				1063	victim_name_len)) {
				1064	inc_nlink(&inode->vfs_inode);
				1065	btrfs_release_path(path);
				1066
				1067	ret = btrfs_unlink_inode(trans, root, dir, inode,
				1068	victim_name, victim_name_len);
				1069	kfree(victim_name);
				1070	if (ret)
				1071	return ret;
				1072	ret = btrfs_run_delayed_items(trans);
				1073	if (ret)
				1074	return ret;
				1075	*search_done = 1;
				1076	goto again;
				1077	}
				1078	kfree(victim_name);
				1079
				1080	ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
				1081	}
				1082
				1083	/*
				1084	* NOTE: we have searched root tree and checked the
				1085	* corresponding ref, it does not need to check again.
				1086	*/
				1087	*search_done = 1;
				1088	}
				1089	btrfs_release_path(path);
				1090
				1091	/* Same search but for extended refs */
				1092	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
				1093	inode_objectid, parent_objectid, 0,
				1094	0);
				1095	if (!IS_ERR_OR_NULL(extref)) {
				1096	u32 item_size;
				1097	u32 cur_offset = 0;
				1098	unsigned long base;
				1099	struct inode *victim_parent;
				1100
				1101	leaf = path->nodes[0];
				1102
				1103	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1104	base = btrfs_item_ptr_offset(leaf, path->slots[0]);
				1105
				1106	while (cur_offset < item_size) {
				1107	extref = (struct btrfs_inode_extref *)(base + cur_offset);
				1108
				1109	victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
				1110
				1111	if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				1112	goto next;
				1113
				1114	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				1115	if (!victim_name)
				1116	return -ENOMEM;
				1117	read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
				1118	victim_name_len);
				1119
				1120	search_key.objectid = inode_objectid;
				1121	search_key.type = BTRFS_INODE_EXTREF_KEY;
				1122	search_key.offset = btrfs_extref_hash(parent_objectid,
				1123	victim_name,
				1124	victim_name_len);
				1125	ret = 0;
				1126	if (!backref_in_log(log_root, &search_key,
				1127	parent_objectid, victim_name,
				1128	victim_name_len)) {
				1129	ret = -ENOENT;
				1130	victim_parent = read_one_inode(root,
				1131	parent_objectid);
				1132	if (victim_parent) {
				1133	inc_nlink(&inode->vfs_inode);
				1134	btrfs_release_path(path);
				1135
				1136	ret = btrfs_unlink_inode(trans, root,
				1137	BTRFS_I(victim_parent),
				1138	inode,
				1139	victim_name,
				1140	victim_name_len);
				1141	if (!ret)
				1142	ret = btrfs_run_delayed_items(
				1143	trans);
				1144	}
				1145	iput(victim_parent);
				1146	kfree(victim_name);
				1147	if (ret)
				1148	return ret;
				1149	*search_done = 1;
				1150	goto again;
				1151	}
				1152	kfree(victim_name);
				1153	next:
				1154	cur_offset += victim_name_len + sizeof(*extref);
				1155	}
				1156	*search_done = 1;
				1157	}
				1158	btrfs_release_path(path);
				1159
				1160	/* look for a conflicting sequence number */
				1161	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
				1162	ref_index, name, namelen, 0);
				1163	if (di && !IS_ERR(di)) {
				1164	ret = drop_one_dir_item(trans, root, path, dir, di);
				1165	if (ret)
				1166	return ret;
				1167	}
				1168	btrfs_release_path(path);
				1169
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1170	/* look for a conflicting name */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1171	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				1172	name, namelen, 0);
				1173	if (di && !IS_ERR(di)) {
				1174	ret = drop_one_dir_item(trans, root, path, dir, di);
				1175	if (ret)
				1176	return ret;
				1177	}
				1178	btrfs_release_path(path);
				1179
				1180	return 0;
				1181	}
				1182
				1183	static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
				1184	u32 namelen, char name, u64 index,
				1185	u64 *parent_objectid)
				1186	{
				1187	struct btrfs_inode_extref *extref;
				1188
				1189	extref = (struct btrfs_inode_extref *)ref_ptr;
				1190
				1191	*namelen = btrfs_inode_extref_name_len(eb, extref);
				1192	name = kmalloc(namelen, GFP_NOFS);
				1193	if (*name == NULL)
				1194	return -ENOMEM;
				1195
				1196	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
				1197	*namelen);
				1198
				1199	if (index)
				1200	*index = btrfs_inode_extref_index(eb, extref);
				1201	if (parent_objectid)
				1202	*parent_objectid = btrfs_inode_extref_parent(eb, extref);
				1203
				1204	return 0;
				1205	}
				1206
				1207	static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
				1208	u32 namelen, char name, u64 index)
				1209	{
				1210	struct btrfs_inode_ref *ref;
				1211
				1212	ref = (struct btrfs_inode_ref *)ref_ptr;
				1213
				1214	*namelen = btrfs_inode_ref_name_len(eb, ref);
				1215	name = kmalloc(namelen, GFP_NOFS);
				1216	if (*name == NULL)
				1217	return -ENOMEM;
				1218
				1219	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
				1220
				1221	if (index)
				1222	*index = btrfs_inode_ref_index(eb, ref);
				1223
				1224	return 0;
				1225	}
				1226
				1227	/*
				1228	* Take an inode reference item from the log tree and iterate all names from the
				1229	* inode reference item in the subvolume tree with the same key (if it exists).
				1230	* For any name that is not in the inode reference item from the log tree, do a
				1231	* proper unlink of that name (that is, remove its entry from the inode
				1232	* reference item and both dir index keys).
				1233	*/
				1234	static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				1235	struct btrfs_root *root,
				1236	struct btrfs_path *path,
				1237	struct btrfs_inode *inode,
				1238	struct extent_buffer *log_eb,
				1239	int log_slot,
				1240	struct btrfs_key *key)
				1241	{
				1242	int ret;
				1243	unsigned long ref_ptr;
				1244	unsigned long ref_end;
				1245	struct extent_buffer *eb;
				1246
				1247	again:
				1248	btrfs_release_path(path);
				1249	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				1250	if (ret > 0) {
				1251	ret = 0;
				1252	goto out;
				1253	}
				1254	if (ret < 0)
				1255	goto out;
				1256
				1257	eb = path->nodes[0];
				1258	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
				1259	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
				1260	while (ref_ptr < ref_end) {
				1261	char *name = NULL;
				1262	int namelen;
				1263	u64 parent_id;
				1264
				1265	if (key->type == BTRFS_INODE_EXTREF_KEY) {
				1266	ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
				1267	NULL, &parent_id);
				1268	} else {
				1269	parent_id = key->offset;
				1270	ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
				1271	NULL);
				1272	}
				1273	if (ret)
				1274	goto out;
				1275
				1276	if (key->type == BTRFS_INODE_EXTREF_KEY)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1277	ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
				1278	parent_id, name,
				1279	namelen);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1280	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1281	ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
				1282	name, namelen);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1283
				1284	if (!ret) {
				1285	struct inode *dir;
				1286
				1287	btrfs_release_path(path);
				1288	dir = read_one_inode(root, parent_id);
				1289	if (!dir) {
				1290	ret = -ENOENT;
				1291	kfree(name);
				1292	goto out;
				1293	}
				1294	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
				1295	inode, name, namelen);
				1296	kfree(name);
				1297	iput(dir);
				1298	if (ret)
				1299	goto out;
				1300	goto again;
				1301	}
				1302
				1303	kfree(name);
				1304	ref_ptr += namelen;
				1305	if (key->type == BTRFS_INODE_EXTREF_KEY)
				1306	ref_ptr += sizeof(struct btrfs_inode_extref);
				1307	else
				1308	ref_ptr += sizeof(struct btrfs_inode_ref);
				1309	}
				1310	ret = 0;
				1311	out:
				1312	btrfs_release_path(path);
				1313	return ret;
				1314	}
				1315
				1316	static int btrfs_inode_ref_exists(struct inode inode, struct inode dir,
				1317	const u8 ref_type, const char *name,
				1318	const int namelen)
				1319	{
				1320	struct btrfs_key key;
				1321	struct btrfs_path *path;
				1322	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
				1323	int ret;
				1324
				1325	path = btrfs_alloc_path();
				1326	if (!path)
				1327	return -ENOMEM;
				1328
				1329	key.objectid = btrfs_ino(BTRFS_I(inode));
				1330	key.type = ref_type;
				1331	if (key.type == BTRFS_INODE_REF_KEY)
				1332	key.offset = parent_id;
				1333	else
				1334	key.offset = btrfs_extref_hash(parent_id, name, namelen);
				1335
				1336	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
				1337	if (ret < 0)
				1338	goto out;
				1339	if (ret > 0) {
				1340	ret = 0;
				1341	goto out;
				1342	}
				1343	if (key.type == BTRFS_INODE_EXTREF_KEY)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1344	ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				1345	path->slots[0], parent_id, name, namelen);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1346	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1347	ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
				1348	name, namelen);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1349
				1350	out:
				1351	btrfs_free_path(path);
				1352	return ret;
				1353	}
				1354
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1355	static int add_link(struct btrfs_trans_handle trans, struct btrfs_root root,
				1356	struct inode dir, struct inode inode, const char *name,
				1357	int namelen, u64 ref_index)
				1358	{
				1359	struct btrfs_dir_item *dir_item;
				1360	struct btrfs_key key;
				1361	struct btrfs_path *path;
				1362	struct inode *other_inode = NULL;
				1363	int ret;
				1364
				1365	path = btrfs_alloc_path();
				1366	if (!path)
				1367	return -ENOMEM;
				1368
				1369	dir_item = btrfs_lookup_dir_item(NULL, root, path,
				1370	btrfs_ino(BTRFS_I(dir)),
				1371	name, namelen, 0);
				1372	if (!dir_item) {
				1373	btrfs_release_path(path);
				1374	goto add_link;
				1375	} else if (IS_ERR(dir_item)) {
				1376	ret = PTR_ERR(dir_item);
				1377	goto out;
				1378	}
				1379
				1380	/*
				1381	* Our inode's dentry collides with the dentry of another inode which is
				1382	* in the log but not yet processed since it has a higher inode number.
				1383	* So delete that other dentry.
				1384	*/
				1385	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
				1386	btrfs_release_path(path);
				1387	other_inode = read_one_inode(root, key.objectid);
				1388	if (!other_inode) {
				1389	ret = -ENOENT;
				1390	goto out;
				1391	}
				1392	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
				1393	name, namelen);
				1394	if (ret)
				1395	goto out;
				1396	/*
				1397	* If we dropped the link count to 0, bump it so that later the iput()
				1398	* on the inode will not free it. We will fixup the link count later.
				1399	*/
				1400	if (other_inode->i_nlink == 0)
				1401	inc_nlink(other_inode);
				1402
				1403	ret = btrfs_run_delayed_items(trans);
				1404	if (ret)
				1405	goto out;
				1406	add_link:
				1407	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
				1408	name, namelen, 0, ref_index);
				1409	out:
				1410	iput(other_inode);
				1411	btrfs_free_path(path);
				1412
				1413	return ret;
				1414	}
				1415
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1416	/*
				1417	* replay one inode back reference item found in the log tree.
				1418	* eb, slot and key refer to the buffer and key found in the log tree.
				1419	* root is the destination we are replaying into, and path is for temp
				1420	* use by this function. (it should be released on return).
				1421	*/
				1422	static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				1423	struct btrfs_root *root,
				1424	struct btrfs_root *log,
				1425	struct btrfs_path *path,
				1426	struct extent_buffer *eb, int slot,
				1427	struct btrfs_key *key)
				1428	{
				1429	struct inode *dir = NULL;
				1430	struct inode *inode = NULL;
				1431	unsigned long ref_ptr;
				1432	unsigned long ref_end;
				1433	char *name = NULL;
				1434	int namelen;
				1435	int ret;
				1436	int search_done = 0;
				1437	int log_ref_ver = 0;
				1438	u64 parent_objectid;
				1439	u64 inode_objectid;
				1440	u64 ref_index = 0;
				1441	int ref_struct_size;
				1442
				1443	ref_ptr = btrfs_item_ptr_offset(eb, slot);
				1444	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
				1445
				1446	if (key->type == BTRFS_INODE_EXTREF_KEY) {
				1447	struct btrfs_inode_extref *r;
				1448
				1449	ref_struct_size = sizeof(struct btrfs_inode_extref);
				1450	log_ref_ver = 1;
				1451	r = (struct btrfs_inode_extref *)ref_ptr;
				1452	parent_objectid = btrfs_inode_extref_parent(eb, r);
				1453	} else {
				1454	ref_struct_size = sizeof(struct btrfs_inode_ref);
				1455	parent_objectid = key->offset;
				1456	}
				1457	inode_objectid = key->objectid;
				1458
				1459	/*
				1460	* it is possible that we didn't log all the parent directories
				1461	* for a given inode. If we don't find the dir, just don't
				1462	* copy the back ref in. The link count fixup code will take
				1463	* care of the rest
				1464	*/
				1465	dir = read_one_inode(root, parent_objectid);
				1466	if (!dir) {
				1467	ret = -ENOENT;
				1468	goto out;
				1469	}
				1470
				1471	inode = read_one_inode(root, inode_objectid);
				1472	if (!inode) {
				1473	ret = -EIO;
				1474	goto out;
				1475	}
				1476
				1477	while (ref_ptr < ref_end) {
				1478	if (log_ref_ver) {
				1479	ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
				1480	&ref_index, &parent_objectid);
				1481	/*
				1482	* parent object can change from one array
				1483	* item to another.
				1484	*/
				1485	if (!dir)
				1486	dir = read_one_inode(root, parent_objectid);
				1487	if (!dir) {
				1488	ret = -ENOENT;
				1489	goto out;
				1490	}
				1491	} else {
				1492	ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
				1493	&ref_index);
				1494	}
				1495	if (ret)
				1496	goto out;
				1497
				1498	/* if we already have a perfect match, we're done */
				1499	if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				1500	btrfs_ino(BTRFS_I(inode)), ref_index,
				1501	name, namelen)) {
				1502	/*
				1503	* look for a conflicting back reference in the
				1504	* metadata. if we find one we have to unlink that name
				1505	* of the file before we add our new link. Later on, we
				1506	* overwrite any existing back reference, and we don't
				1507	* want to create dangling pointers in the directory.
				1508	*/
				1509
				1510	if (!search_done) {
				1511	ret = __add_inode_ref(trans, root, path, log,
				1512	BTRFS_I(dir),
				1513	BTRFS_I(inode),
				1514	inode_objectid,
				1515	parent_objectid,
				1516	ref_index, name, namelen,
				1517	&search_done);
				1518	if (ret) {
				1519	if (ret == 1)
				1520	ret = 0;
				1521	goto out;
				1522	}
				1523	}
				1524
				1525	/*
				1526	* If a reference item already exists for this inode
				1527	* with the same parent and name, but different index,
				1528	* drop it and the corresponding directory index entries
				1529	* from the parent before adding the new reference item
				1530	* and dir index entries, otherwise we would fail with
				1531	* -EEXIST returned from btrfs_add_link() below.
				1532	*/
				1533	ret = btrfs_inode_ref_exists(inode, dir, key->type,
				1534	name, namelen);
				1535	if (ret > 0) {
				1536	ret = btrfs_unlink_inode(trans, root,
				1537	BTRFS_I(dir),
				1538	BTRFS_I(inode),
				1539	name, namelen);
				1540	/*
				1541	* If we dropped the link count to 0, bump it so
				1542	* that later the iput() on the inode will not
				1543	* free it. We will fixup the link count later.
				1544	*/
				1545	if (!ret && inode->i_nlink == 0)
				1546	inc_nlink(inode);
				1547	}
				1548	if (ret < 0)
				1549	goto out;
				1550
				1551	/* insert our name */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1552	ret = add_link(trans, root, dir, inode, name, namelen,
				1553	ref_index);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1554	if (ret)
				1555	goto out;
				1556
				1557	btrfs_update_inode(trans, root, inode);
				1558	}
				1559
				1560	ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
				1561	kfree(name);
				1562	name = NULL;
				1563	if (log_ref_ver) {
				1564	iput(dir);
				1565	dir = NULL;
				1566	}
				1567	}
				1568
				1569	/*
				1570	* Before we overwrite the inode reference item in the subvolume tree
				1571	* with the item from the log tree, we must unlink all names from the
				1572	* parent directory that are in the subvolume's tree inode reference
				1573	* item, otherwise we end up with an inconsistent subvolume tree where
				1574	* dir index entries exist for a name but there is no inode reference
				1575	* item with the same name.
				1576	*/
				1577	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				1578	key);
				1579	if (ret)
				1580	goto out;
				1581
				1582	/* finally write the back reference in the inode */
				1583	ret = overwrite_item(trans, root, path, eb, slot, key);
				1584	out:
				1585	btrfs_release_path(path);
				1586	kfree(name);
				1587	iput(dir);
				1588	iput(inode);
				1589	return ret;
				1590	}
				1591
				1592	static int insert_orphan_item(struct btrfs_trans_handle *trans,
				1593	struct btrfs_root *root, u64 ino)
				1594	{
				1595	int ret;
				1596
				1597	ret = btrfs_insert_orphan_item(trans, root, ino);
				1598	if (ret == -EEXIST)
				1599	ret = 0;
				1600
				1601	return ret;
				1602	}
				1603
				1604	static int count_inode_extrefs(struct btrfs_root *root,
				1605	struct btrfs_inode inode, struct btrfs_path path)
				1606	{
				1607	int ret = 0;
				1608	int name_len;
				1609	unsigned int nlink = 0;
				1610	u32 item_size;
				1611	u32 cur_offset = 0;
				1612	u64 inode_objectid = btrfs_ino(inode);
				1613	u64 offset = 0;
				1614	unsigned long ptr;
				1615	struct btrfs_inode_extref *extref;
				1616	struct extent_buffer *leaf;
				1617
				1618	while (1) {
				1619	ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
				1620	&extref, &offset);
				1621	if (ret)
				1622	break;
				1623
				1624	leaf = path->nodes[0];
				1625	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1626	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				1627	cur_offset = 0;
				1628
				1629	while (cur_offset < item_size) {
				1630	extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
				1631	name_len = btrfs_inode_extref_name_len(leaf, extref);
				1632
				1633	nlink++;
				1634
				1635	cur_offset += name_len + sizeof(*extref);
				1636	}
				1637
				1638	offset++;
				1639	btrfs_release_path(path);
				1640	}
				1641	btrfs_release_path(path);
				1642
				1643	if (ret < 0 && ret != -ENOENT)
				1644	return ret;
				1645	return nlink;
				1646	}
				1647
				1648	static int count_inode_refs(struct btrfs_root *root,
				1649	struct btrfs_inode inode, struct btrfs_path path)
				1650	{
				1651	int ret;
				1652	struct btrfs_key key;
				1653	unsigned int nlink = 0;
				1654	unsigned long ptr;
				1655	unsigned long ptr_end;
				1656	int name_len;
				1657	u64 ino = btrfs_ino(inode);
				1658
				1659	key.objectid = ino;
				1660	key.type = BTRFS_INODE_REF_KEY;
				1661	key.offset = (u64)-1;
				1662
				1663	while (1) {
				1664	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1665	if (ret < 0)
				1666	break;
				1667	if (ret > 0) {
				1668	if (path->slots[0] == 0)
				1669	break;
				1670	path->slots[0]--;
				1671	}
				1672	process_slot:
				1673	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1674	path->slots[0]);
				1675	if (key.objectid != ino \|\|
				1676	key.type != BTRFS_INODE_REF_KEY)
				1677	break;
				1678	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				1679	ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
				1680	path->slots[0]);
				1681	while (ptr < ptr_end) {
				1682	struct btrfs_inode_ref *ref;
				1683
				1684	ref = (struct btrfs_inode_ref *)ptr;
				1685	name_len = btrfs_inode_ref_name_len(path->nodes[0],
				1686	ref);
				1687	ptr = (unsigned long)(ref + 1) + name_len;
				1688	nlink++;
				1689	}
				1690
				1691	if (key.offset == 0)
				1692	break;
				1693	if (path->slots[0] > 0) {
				1694	path->slots[0]--;
				1695	goto process_slot;
				1696	}
				1697	key.offset--;
				1698	btrfs_release_path(path);
				1699	}
				1700	btrfs_release_path(path);
				1701
				1702	return nlink;
				1703	}
				1704
				1705	/*
				1706	* There are a few corners where the link count of the file can't
				1707	* be properly maintained during replay. So, instead of adding
				1708	* lots of complexity to the log code, we just scan the backrefs
				1709	* for any file that has been through replay.
				1710	*
				1711	* The scan will update the link count on the inode to reflect the
				1712	* number of back refs found. If it goes down to zero, the iput
				1713	* will free the inode.
				1714	*/
				1715	static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
				1716	struct btrfs_root *root,
				1717	struct inode *inode)
				1718	{
				1719	struct btrfs_path *path;
				1720	int ret;
				1721	u64 nlink = 0;
				1722	u64 ino = btrfs_ino(BTRFS_I(inode));
				1723
				1724	path = btrfs_alloc_path();
				1725	if (!path)
				1726	return -ENOMEM;
				1727
				1728	ret = count_inode_refs(root, BTRFS_I(inode), path);
				1729	if (ret < 0)
				1730	goto out;
				1731
				1732	nlink = ret;
				1733
				1734	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
				1735	if (ret < 0)
				1736	goto out;
				1737
				1738	nlink += ret;
				1739
				1740	ret = 0;
				1741
				1742	if (nlink != inode->i_nlink) {
				1743	set_nlink(inode, nlink);
				1744	btrfs_update_inode(trans, root, inode);
				1745	}
				1746	BTRFS_I(inode)->index_cnt = (u64)-1;
				1747
				1748	if (inode->i_nlink == 0) {
				1749	if (S_ISDIR(inode->i_mode)) {
				1750	ret = replay_dir_deletes(trans, root, NULL, path,
				1751	ino, 1);
				1752	if (ret)
				1753	goto out;
				1754	}
				1755	ret = insert_orphan_item(trans, root, ino);
				1756	}
				1757
				1758	out:
				1759	btrfs_free_path(path);
				1760	return ret;
				1761	}
				1762
				1763	static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
				1764	struct btrfs_root *root,
				1765	struct btrfs_path *path)
				1766	{
				1767	int ret;
				1768	struct btrfs_key key;
				1769	struct inode *inode;
				1770
				1771	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1772	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1773	key.offset = (u64)-1;
				1774	while (1) {
				1775	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1776	if (ret < 0)
				1777	break;
				1778
				1779	if (ret == 1) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1780	ret = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1781	if (path->slots[0] == 0)
				1782	break;
				1783	path->slots[0]--;
				1784	}
				1785
				1786	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1787	if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID \|\|
				1788	key.type != BTRFS_ORPHAN_ITEM_KEY)
				1789	break;
				1790
				1791	ret = btrfs_del_item(trans, root, path);
				1792	if (ret)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1793	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1794
				1795	btrfs_release_path(path);
				1796	inode = read_one_inode(root, key.offset);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1797	if (!inode) {
				1798	ret = -EIO;
				1799	break;
				1800	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1801
				1802	ret = fixup_inode_link_count(trans, root, inode);
				1803	iput(inode);
				1804	if (ret)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1805	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1806
				1807	/*
				1808	* fixup on a directory may create new entries,
				1809	* make sure we always look for the highset possible
				1810	* offset
				1811	*/
				1812	key.offset = (u64)-1;
				1813	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1814	btrfs_release_path(path);
				1815	return ret;
				1816	}
				1817
				1818
				1819	/*
				1820	* record a given inode in the fixup dir so we can check its link
				1821	* count when replay is done. The link count is incremented here
				1822	* so the inode won't go away until we check it
				1823	*/
				1824	static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				1825	struct btrfs_root *root,
				1826	struct btrfs_path *path,
				1827	u64 objectid)
				1828	{
				1829	struct btrfs_key key;
				1830	int ret = 0;
				1831	struct inode *inode;
				1832
				1833	inode = read_one_inode(root, objectid);
				1834	if (!inode)
				1835	return -EIO;
				1836
				1837	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1838	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1839	key.offset = objectid;
				1840
				1841	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
				1842
				1843	btrfs_release_path(path);
				1844	if (ret == 0) {
				1845	if (!inode->i_nlink)
				1846	set_nlink(inode, 1);
				1847	else
				1848	inc_nlink(inode);
				1849	ret = btrfs_update_inode(trans, root, inode);
				1850	} else if (ret == -EEXIST) {
				1851	ret = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1852	}
				1853	iput(inode);
				1854
				1855	return ret;
				1856	}
				1857
				1858	/*
				1859	* when replaying the log for a directory, we only insert names
				1860	* for inodes that actually exist. This means an fsync on a directory
				1861	* does not implicitly fsync all the new files in it
				1862	*/
				1863	static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				1864	struct btrfs_root *root,
				1865	u64 dirid, u64 index,
				1866	char *name, int name_len,
				1867	struct btrfs_key *location)
				1868	{
				1869	struct inode *inode;
				1870	struct inode *dir;
				1871	int ret;
				1872
				1873	inode = read_one_inode(root, location->objectid);
				1874	if (!inode)
				1875	return -ENOENT;
				1876
				1877	dir = read_one_inode(root, dirid);
				1878	if (!dir) {
				1879	iput(inode);
				1880	return -EIO;
				1881	}
				1882
				1883	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
				1884	name_len, 1, index);
				1885
				1886	/* FIXME, put inode into FIXUP list */
				1887
				1888	iput(inode);
				1889	iput(dir);
				1890	return ret;
				1891	}
				1892
				1893	/*
				1894	* Return true if an inode reference exists in the log for the given name,
				1895	* inode and parent inode.
				1896	*/
				1897	static bool name_in_log_ref(struct btrfs_root *log_root,
				1898	const char *name, const int name_len,
				1899	const u64 dirid, const u64 ino)
				1900	{
				1901	struct btrfs_key search_key;
				1902
				1903	search_key.objectid = ino;
				1904	search_key.type = BTRFS_INODE_REF_KEY;
				1905	search_key.offset = dirid;
				1906	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
				1907	return true;
				1908
				1909	search_key.type = BTRFS_INODE_EXTREF_KEY;
				1910	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
				1911	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
				1912	return true;
				1913
				1914	return false;
				1915	}
				1916
				1917	/*
				1918	* take a single entry in a log directory item and replay it into
				1919	* the subvolume.
				1920	*
				1921	* if a conflicting item exists in the subdirectory already,
				1922	* the inode it points to is unlinked and put into the link count
				1923	* fix up tree.
				1924	*
				1925	* If a name from the log points to a file or directory that does
				1926	* not exist in the FS, it is skipped. fsyncs on directories
				1927	* do not force down inodes inside that directory, just changes to the
				1928	* names or unlinks in a directory.
				1929	*
				1930	* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
				1931	* non-existing inode) and 1 if the name was replayed.
				1932	*/
				1933	static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				1934	struct btrfs_root *root,
				1935	struct btrfs_path *path,
				1936	struct extent_buffer *eb,
				1937	struct btrfs_dir_item *di,
				1938	struct btrfs_key *key)
				1939	{
				1940	char *name;
				1941	int name_len;
				1942	struct btrfs_dir_item *dst_di;
				1943	struct btrfs_key found_key;
				1944	struct btrfs_key log_key;
				1945	struct inode *dir;
				1946	u8 log_type;
				1947	int exists;
				1948	int ret = 0;
				1949	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
				1950	bool name_added = false;
				1951
				1952	dir = read_one_inode(root, key->objectid);
				1953	if (!dir)
				1954	return -EIO;
				1955
				1956	name_len = btrfs_dir_name_len(eb, di);
				1957	name = kmalloc(name_len, GFP_NOFS);
				1958	if (!name) {
				1959	ret = -ENOMEM;
				1960	goto out;
				1961	}
				1962
				1963	log_type = btrfs_dir_type(eb, di);
				1964	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1965	name_len);
				1966
				1967	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
				1968	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
				1969	if (exists == 0)
				1970	exists = 1;
				1971	else
				1972	exists = 0;
				1973	btrfs_release_path(path);
				1974
				1975	if (key->type == BTRFS_DIR_ITEM_KEY) {
				1976	dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				1977	name, name_len, 1);
				1978	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
				1979	dst_di = btrfs_lookup_dir_index_item(trans, root, path,
				1980	key->objectid,
				1981	key->offset, name,
				1982	name_len, 1);
				1983	} else {
				1984	/* Corruption */
				1985	ret = -EINVAL;
				1986	goto out;
				1987	}
				1988	if (IS_ERR_OR_NULL(dst_di)) {
				1989	/* we need a sequence number to insert, so we only
				1990	* do inserts for the BTRFS_DIR_INDEX_KEY types
				1991	*/
				1992	if (key->type != BTRFS_DIR_INDEX_KEY)
				1993	goto out;
				1994	goto insert;
				1995	}
				1996
				1997	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
				1998	/* the existing item matches the logged item */
				1999	if (found_key.objectid == log_key.objectid &&
				2000	found_key.type == log_key.type &&
				2001	found_key.offset == log_key.offset &&
				2002	btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
				2003	update_size = false;
				2004	goto out;
				2005	}
				2006
				2007	/*
				2008	* don't drop the conflicting directory entry if the inode
				2009	* for the new entry doesn't exist
				2010	*/
				2011	if (!exists)
				2012	goto out;
				2013
				2014	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
				2015	if (ret)
				2016	goto out;
				2017
				2018	if (key->type == BTRFS_DIR_INDEX_KEY)
				2019	goto insert;
				2020	out:
				2021	btrfs_release_path(path);
				2022	if (!ret && update_size) {
				2023	btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
				2024	ret = btrfs_update_inode(trans, root, dir);
				2025	}
				2026	kfree(name);
				2027	iput(dir);
				2028	if (!ret && name_added)
				2029	ret = 1;
				2030	return ret;
				2031
				2032	insert:
				2033	if (name_in_log_ref(root->log_root, name, name_len,
				2034	key->objectid, log_key.objectid)) {
				2035	/* The dentry will be added later. */
				2036	ret = 0;
				2037	update_size = false;
				2038	goto out;
				2039	}
				2040	btrfs_release_path(path);
				2041	ret = insert_one_name(trans, root, key->objectid, key->offset,
				2042	name, name_len, &log_key);
				2043	if (ret && ret != -ENOENT && ret != -EEXIST)
				2044	goto out;
				2045	if (!ret)
				2046	name_added = true;
				2047	update_size = false;
				2048	ret = 0;
				2049	goto out;
				2050	}
				2051
				2052	/*
				2053	* find all the names in a directory item and reconcile them into
				2054	* the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
				2055	* one name in a directory item, but the same code gets used for
				2056	* both directory index types
				2057	*/
				2058	static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
				2059	struct btrfs_root *root,
				2060	struct btrfs_path *path,
				2061	struct extent_buffer *eb, int slot,
				2062	struct btrfs_key *key)
				2063	{
				2064	int ret = 0;
				2065	u32 item_size = btrfs_item_size_nr(eb, slot);
				2066	struct btrfs_dir_item *di;
				2067	int name_len;
				2068	unsigned long ptr;
				2069	unsigned long ptr_end;
				2070	struct btrfs_path *fixup_path = NULL;
				2071
				2072	ptr = btrfs_item_ptr_offset(eb, slot);
				2073	ptr_end = ptr + item_size;
				2074	while (ptr < ptr_end) {
				2075	di = (struct btrfs_dir_item *)ptr;
				2076	name_len = btrfs_dir_name_len(eb, di);
				2077	ret = replay_one_name(trans, root, path, eb, di, key);
				2078	if (ret < 0)
				2079	break;
				2080	ptr = (unsigned long)(di + 1);
				2081	ptr += name_len;
				2082
				2083	/*
				2084	* If this entry refers to a non-directory (directories can not
				2085	* have a link count > 1) and it was added in the transaction
				2086	* that was not committed, make sure we fixup the link count of
				2087	* the inode it the entry points to. Otherwise something like
				2088	* the following would result in a directory pointing to an
				2089	* inode with a wrong link that does not account for this dir
				2090	* entry:
				2091	*
				2092	* mkdir testdir
				2093	* touch testdir/foo
				2094	* touch testdir/bar
				2095	* sync
				2096	*
				2097	* ln testdir/bar testdir/bar_link
				2098	* ln testdir/foo testdir/foo_link
				2099	* xfs_io -c "fsync" testdir/bar
				2100	*
				2101	* <power failure>
				2102	*
				2103	* mount fs, log replay happens
				2104	*
				2105	* File foo would remain with a link count of 1 when it has two
				2106	* entries pointing to it in the directory testdir. This would
				2107	* make it impossible to ever delete the parent directory has
				2108	* it would result in stale dentries that can never be deleted.
				2109	*/
				2110	if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
				2111	struct btrfs_key di_key;
				2112
				2113	if (!fixup_path) {
				2114	fixup_path = btrfs_alloc_path();
				2115	if (!fixup_path) {
				2116	ret = -ENOMEM;
				2117	break;
				2118	}
				2119	}
				2120
				2121	btrfs_dir_item_key_to_cpu(eb, di, &di_key);
				2122	ret = link_to_fixup_dir(trans, root, fixup_path,
				2123	di_key.objectid);
				2124	if (ret)
				2125	break;
				2126	}
				2127	ret = 0;
				2128	}
				2129	btrfs_free_path(fixup_path);
				2130	return ret;
				2131	}
				2132
				2133	/*
				2134	* directory replay has two parts. There are the standard directory
				2135	* items in the log copied from the subvolume, and range items
				2136	* created in the log while the subvolume was logged.
				2137	*
				2138	* The range items tell us which parts of the key space the log
				2139	* is authoritative for. During replay, if a key in the subvolume
				2140	* directory is in a logged range item, but not actually in the log
				2141	* that means it was deleted from the directory before the fsync
				2142	* and should be removed.
				2143	*/
				2144	static noinline int find_dir_range(struct btrfs_root *root,
				2145	struct btrfs_path *path,
				2146	u64 dirid, int key_type,
				2147	u64 start_ret, u64 end_ret)
				2148	{
				2149	struct btrfs_key key;
				2150	u64 found_end;
				2151	struct btrfs_dir_log_item *item;
				2152	int ret;
				2153	int nritems;
				2154
				2155	if (*start_ret == (u64)-1)
				2156	return 1;
				2157
				2158	key.objectid = dirid;
				2159	key.type = key_type;
				2160	key.offset = *start_ret;
				2161
				2162	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				2163	if (ret < 0)
				2164	goto out;
				2165	if (ret > 0) {
				2166	if (path->slots[0] == 0)
				2167	goto out;
				2168	path->slots[0]--;
				2169	}
				2170	if (ret != 0)
				2171	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				2172
				2173	if (key.type != key_type \|\| key.objectid != dirid) {
				2174	ret = 1;
				2175	goto next;
				2176	}
				2177	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2178	struct btrfs_dir_log_item);
				2179	found_end = btrfs_dir_log_end(path->nodes[0], item);
				2180
				2181	if (start_ret >= key.offset && start_ret <= found_end) {
				2182	ret = 0;
				2183	*start_ret = key.offset;
				2184	*end_ret = found_end;
				2185	goto out;
				2186	}
				2187	ret = 1;
				2188	next:
				2189	/* check the next slot in the tree to see if it is a valid item */
				2190	nritems = btrfs_header_nritems(path->nodes[0]);
				2191	path->slots[0]++;
				2192	if (path->slots[0] >= nritems) {
				2193	ret = btrfs_next_leaf(root, path);
				2194	if (ret)
				2195	goto out;
				2196	}
				2197
				2198	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				2199
				2200	if (key.type != key_type \|\| key.objectid != dirid) {
				2201	ret = 1;
				2202	goto out;
				2203	}
				2204	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2205	struct btrfs_dir_log_item);
				2206	found_end = btrfs_dir_log_end(path->nodes[0], item);
				2207	*start_ret = key.offset;
				2208	*end_ret = found_end;
				2209	ret = 0;
				2210	out:
				2211	btrfs_release_path(path);
				2212	return ret;
				2213	}
				2214
				2215	/*
				2216	* this looks for a given directory item in the log. If the directory
				2217	* item is not in the log, the item is removed and the inode it points
				2218	* to is unlinked
				2219	*/
				2220	static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				2221	struct btrfs_root *root,
				2222	struct btrfs_root *log,
				2223	struct btrfs_path *path,
				2224	struct btrfs_path *log_path,
				2225	struct inode *dir,
				2226	struct btrfs_key *dir_key)
				2227	{
				2228	int ret;
				2229	struct extent_buffer *eb;
				2230	int slot;
				2231	u32 item_size;
				2232	struct btrfs_dir_item *di;
				2233	struct btrfs_dir_item *log_di;
				2234	int name_len;
				2235	unsigned long ptr;
				2236	unsigned long ptr_end;
				2237	char *name;
				2238	struct inode *inode;
				2239	struct btrfs_key location;
				2240
				2241	again:
				2242	eb = path->nodes[0];
				2243	slot = path->slots[0];
				2244	item_size = btrfs_item_size_nr(eb, slot);
				2245	ptr = btrfs_item_ptr_offset(eb, slot);
				2246	ptr_end = ptr + item_size;
				2247	while (ptr < ptr_end) {
				2248	di = (struct btrfs_dir_item *)ptr;
				2249	name_len = btrfs_dir_name_len(eb, di);
				2250	name = kmalloc(name_len, GFP_NOFS);
				2251	if (!name) {
				2252	ret = -ENOMEM;
				2253	goto out;
				2254	}
				2255	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				2256	name_len);
				2257	log_di = NULL;
				2258	if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
				2259	log_di = btrfs_lookup_dir_item(trans, log, log_path,
				2260	dir_key->objectid,
				2261	name, name_len, 0);
				2262	} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
				2263	log_di = btrfs_lookup_dir_index_item(trans, log,
				2264	log_path,
				2265	dir_key->objectid,
				2266	dir_key->offset,
				2267	name, name_len, 0);
				2268	}
				2269	if (!log_di \|\| log_di == ERR_PTR(-ENOENT)) {
				2270	btrfs_dir_item_key_to_cpu(eb, di, &location);
				2271	btrfs_release_path(path);
				2272	btrfs_release_path(log_path);
				2273	inode = read_one_inode(root, location.objectid);
				2274	if (!inode) {
				2275	kfree(name);
				2276	return -EIO;
				2277	}
				2278
				2279	ret = link_to_fixup_dir(trans, root,
				2280	path, location.objectid);
				2281	if (ret) {
				2282	kfree(name);
				2283	iput(inode);
				2284	goto out;
				2285	}
				2286
				2287	inc_nlink(inode);
				2288	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
				2289	BTRFS_I(inode), name, name_len);
				2290	if (!ret)
				2291	ret = btrfs_run_delayed_items(trans);
				2292	kfree(name);
				2293	iput(inode);
				2294	if (ret)
				2295	goto out;
				2296
				2297	/* there might still be more names under this key
				2298	* check and repeat if required
				2299	*/
				2300	ret = btrfs_search_slot(NULL, root, dir_key, path,
				2301	0, 0);
				2302	if (ret == 0)
				2303	goto again;
				2304	ret = 0;
				2305	goto out;
				2306	} else if (IS_ERR(log_di)) {
				2307	kfree(name);
				2308	return PTR_ERR(log_di);
				2309	}
				2310	btrfs_release_path(log_path);
				2311	kfree(name);
				2312
				2313	ptr = (unsigned long)(di + 1);
				2314	ptr += name_len;
				2315	}
				2316	ret = 0;
				2317	out:
				2318	btrfs_release_path(path);
				2319	btrfs_release_path(log_path);
				2320	return ret;
				2321	}
				2322
				2323	static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
				2324	struct btrfs_root *root,
				2325	struct btrfs_root *log,
				2326	struct btrfs_path *path,
				2327	const u64 ino)
				2328	{
				2329	struct btrfs_key search_key;
				2330	struct btrfs_path *log_path;
				2331	int i;
				2332	int nritems;
				2333	int ret;
				2334
				2335	log_path = btrfs_alloc_path();
				2336	if (!log_path)
				2337	return -ENOMEM;
				2338
				2339	search_key.objectid = ino;
				2340	search_key.type = BTRFS_XATTR_ITEM_KEY;
				2341	search_key.offset = 0;
				2342	again:
				2343	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
				2344	if (ret < 0)
				2345	goto out;
				2346	process_leaf:
				2347	nritems = btrfs_header_nritems(path->nodes[0]);
				2348	for (i = path->slots[0]; i < nritems; i++) {
				2349	struct btrfs_key key;
				2350	struct btrfs_dir_item *di;
				2351	struct btrfs_dir_item *log_di;
				2352	u32 total_size;
				2353	u32 cur;
				2354
				2355	btrfs_item_key_to_cpu(path->nodes[0], &key, i);
				2356	if (key.objectid != ino \|\| key.type != BTRFS_XATTR_ITEM_KEY) {
				2357	ret = 0;
				2358	goto out;
				2359	}
				2360
				2361	di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
				2362	total_size = btrfs_item_size_nr(path->nodes[0], i);
				2363	cur = 0;
				2364	while (cur < total_size) {
				2365	u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
				2366	u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
				2367	u32 this_len = sizeof(*di) + name_len + data_len;
				2368	char *name;
				2369
				2370	name = kmalloc(name_len, GFP_NOFS);
				2371	if (!name) {
				2372	ret = -ENOMEM;
				2373	goto out;
				2374	}
				2375	read_extent_buffer(path->nodes[0], name,
				2376	(unsigned long)(di + 1), name_len);
				2377
				2378	log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
				2379	name, name_len, 0);
				2380	btrfs_release_path(log_path);
				2381	if (!log_di) {
				2382	/* Doesn't exist in log tree, so delete it. */
				2383	btrfs_release_path(path);
				2384	di = btrfs_lookup_xattr(trans, root, path, ino,
				2385	name, name_len, -1);
				2386	kfree(name);
				2387	if (IS_ERR(di)) {
				2388	ret = PTR_ERR(di);
				2389	goto out;
				2390	}
				2391	ASSERT(di);
				2392	ret = btrfs_delete_one_dir_name(trans, root,
				2393	path, di);
				2394	if (ret)
				2395	goto out;
				2396	btrfs_release_path(path);
				2397	search_key = key;
				2398	goto again;
				2399	}
				2400	kfree(name);
				2401	if (IS_ERR(log_di)) {
				2402	ret = PTR_ERR(log_di);
				2403	goto out;
				2404	}
				2405	cur += this_len;
				2406	di = (struct btrfs_dir_item )((char )di + this_len);
				2407	}
				2408	}
				2409	ret = btrfs_next_leaf(root, path);
				2410	if (ret > 0)
				2411	ret = 0;
				2412	else if (ret == 0)
				2413	goto process_leaf;
				2414	out:
				2415	btrfs_free_path(log_path);
				2416	btrfs_release_path(path);
				2417	return ret;
				2418	}
				2419
				2420
				2421	/*
				2422	* deletion replay happens before we copy any new directory items
				2423	* out of the log or out of backreferences from inodes. It
				2424	* scans the log to find ranges of keys that log is authoritative for,
				2425	* and then scans the directory to find items in those ranges that are
				2426	* not present in the log.
				2427	*
				2428	* Anything we don't find in the log is unlinked and removed from the
				2429	* directory.
				2430	*/
				2431	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				2432	struct btrfs_root *root,
				2433	struct btrfs_root *log,
				2434	struct btrfs_path *path,
				2435	u64 dirid, int del_all)
				2436	{
				2437	u64 range_start;
				2438	u64 range_end;
				2439	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
				2440	int ret = 0;
				2441	struct btrfs_key dir_key;
				2442	struct btrfs_key found_key;
				2443	struct btrfs_path *log_path;
				2444	struct inode *dir;
				2445
				2446	dir_key.objectid = dirid;
				2447	dir_key.type = BTRFS_DIR_ITEM_KEY;
				2448	log_path = btrfs_alloc_path();
				2449	if (!log_path)
				2450	return -ENOMEM;
				2451
				2452	dir = read_one_inode(root, dirid);
				2453	/* it isn't an error if the inode isn't there, that can happen
				2454	* because we replay the deletes before we copy in the inode item
				2455	* from the log
				2456	*/
				2457	if (!dir) {
				2458	btrfs_free_path(log_path);
				2459	return 0;
				2460	}
				2461	again:
				2462	range_start = 0;
				2463	range_end = 0;
				2464	while (1) {
				2465	if (del_all)
				2466	range_end = (u64)-1;
				2467	else {
				2468	ret = find_dir_range(log, path, dirid, key_type,
				2469	&range_start, &range_end);
				2470	if (ret != 0)
				2471	break;
				2472	}
				2473
				2474	dir_key.offset = range_start;
				2475	while (1) {
				2476	int nritems;
				2477	ret = btrfs_search_slot(NULL, root, &dir_key, path,
				2478	0, 0);
				2479	if (ret < 0)
				2480	goto out;
				2481
				2482	nritems = btrfs_header_nritems(path->nodes[0]);
				2483	if (path->slots[0] >= nritems) {
				2484	ret = btrfs_next_leaf(root, path);
				2485	if (ret == 1)
				2486	break;
				2487	else if (ret < 0)
				2488	goto out;
				2489	}
				2490	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2491	path->slots[0]);
				2492	if (found_key.objectid != dirid \|\|
				2493	found_key.type != dir_key.type)
				2494	goto next_type;
				2495
				2496	if (found_key.offset > range_end)
				2497	break;
				2498
				2499	ret = check_item_in_log(trans, root, log, path,
				2500	log_path, dir,
				2501	&found_key);
				2502	if (ret)
				2503	goto out;
				2504	if (found_key.offset == (u64)-1)
				2505	break;
				2506	dir_key.offset = found_key.offset + 1;
				2507	}
				2508	btrfs_release_path(path);
				2509	if (range_end == (u64)-1)
				2510	break;
				2511	range_start = range_end + 1;
				2512	}
				2513
				2514	next_type:
				2515	ret = 0;
				2516	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
				2517	key_type = BTRFS_DIR_LOG_INDEX_KEY;
				2518	dir_key.type = BTRFS_DIR_INDEX_KEY;
				2519	btrfs_release_path(path);
				2520	goto again;
				2521	}
				2522	out:
				2523	btrfs_release_path(path);
				2524	btrfs_free_path(log_path);
				2525	iput(dir);
				2526	return ret;
				2527	}
				2528
				2529	/*
				2530	* the process_func used to replay items from the log tree. This
				2531	* gets called in two different stages. The first stage just looks
				2532	* for inodes and makes sure they are all copied into the subvolume.
				2533	*
				2534	* The second stage copies all the other item types from the log into
				2535	* the subvolume. The two stage approach is slower, but gets rid of
				2536	* lots of complexity around inodes referencing other inodes that exist
				2537	* only in the log (references come from either directory items or inode
				2538	* back refs).
				2539	*/
				2540	static int replay_one_buffer(struct btrfs_root log, struct extent_buffer eb,
				2541	struct walk_control *wc, u64 gen, int level)
				2542	{
				2543	int nritems;
				2544	struct btrfs_path *path;
				2545	struct btrfs_root *root = wc->replay_dest;
				2546	struct btrfs_key key;
				2547	int i;
				2548	int ret;
				2549
				2550	ret = btrfs_read_buffer(eb, gen, level, NULL);
				2551	if (ret)
				2552	return ret;
				2553
				2554	level = btrfs_header_level(eb);
				2555
				2556	if (level != 0)
				2557	return 0;
				2558
				2559	path = btrfs_alloc_path();
				2560	if (!path)
				2561	return -ENOMEM;
				2562
				2563	nritems = btrfs_header_nritems(eb);
				2564	for (i = 0; i < nritems; i++) {
				2565	btrfs_item_key_to_cpu(eb, &key, i);
				2566
				2567	/* inode keys are done during the first stage */
				2568	if (key.type == BTRFS_INODE_ITEM_KEY &&
				2569	wc->stage == LOG_WALK_REPLAY_INODES) {
				2570	struct btrfs_inode_item *inode_item;
				2571	u32 mode;
				2572
				2573	inode_item = btrfs_item_ptr(eb, i,
				2574	struct btrfs_inode_item);
				2575	/*
				2576	* If we have a tmpfile (O_TMPFILE) that got fsync'ed
				2577	* and never got linked before the fsync, skip it, as
				2578	* replaying it is pointless since it would be deleted
				2579	* later. We skip logging tmpfiles, but it's always
				2580	* possible we are replaying a log created with a kernel
				2581	* that used to log tmpfiles.
				2582	*/
				2583	if (btrfs_inode_nlink(eb, inode_item) == 0) {
				2584	wc->ignore_cur_inode = true;
				2585	continue;
				2586	} else {
				2587	wc->ignore_cur_inode = false;
				2588	}
				2589	ret = replay_xattr_deletes(wc->trans, root, log,
				2590	path, key.objectid);
				2591	if (ret)
				2592	break;
				2593	mode = btrfs_inode_mode(eb, inode_item);
				2594	if (S_ISDIR(mode)) {
				2595	ret = replay_dir_deletes(wc->trans,
				2596	root, log, path, key.objectid, 0);
				2597	if (ret)
				2598	break;
				2599	}
				2600	ret = overwrite_item(wc->trans, root, path,
				2601	eb, i, &key);
				2602	if (ret)
				2603	break;
				2604
				2605	/*
				2606	* Before replaying extents, truncate the inode to its
				2607	* size. We need to do it now and not after log replay
				2608	* because before an fsync we can have prealloc extents
				2609	* added beyond the inode's i_size. If we did it after,
				2610	* through orphan cleanup for example, we would drop
				2611	* those prealloc extents just after replaying them.
				2612	*/
				2613	if (S_ISREG(mode)) {
				2614	struct inode *inode;
				2615	u64 from;
				2616
				2617	inode = read_one_inode(root, key.objectid);
				2618	if (!inode) {
				2619	ret = -EIO;
				2620	break;
				2621	}
				2622	from = ALIGN(i_size_read(inode),
				2623	root->fs_info->sectorsize);
				2624	ret = btrfs_drop_extents(wc->trans, root, inode,
				2625	from, (u64)-1, 1);
				2626	if (!ret) {
				2627	/* Update the inode's nbytes. */
				2628	ret = btrfs_update_inode(wc->trans,
				2629	root, inode);
				2630	}
				2631	iput(inode);
				2632	if (ret)
				2633	break;
				2634	}
				2635
				2636	ret = link_to_fixup_dir(wc->trans, root,
				2637	path, key.objectid);
				2638	if (ret)
				2639	break;
				2640	}
				2641
				2642	if (wc->ignore_cur_inode)
				2643	continue;
				2644
				2645	if (key.type == BTRFS_DIR_INDEX_KEY &&
				2646	wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
				2647	ret = replay_one_dir_item(wc->trans, root, path,
				2648	eb, i, &key);
				2649	if (ret)
				2650	break;
				2651	}
				2652
				2653	if (wc->stage < LOG_WALK_REPLAY_ALL)
				2654	continue;
				2655
				2656	/* these keys are simply copied */
				2657	if (key.type == BTRFS_XATTR_ITEM_KEY) {
				2658	ret = overwrite_item(wc->trans, root, path,
				2659	eb, i, &key);
				2660	if (ret)
				2661	break;
				2662	} else if (key.type == BTRFS_INODE_REF_KEY \|\|
				2663	key.type == BTRFS_INODE_EXTREF_KEY) {
				2664	ret = add_inode_ref(wc->trans, root, log, path,
				2665	eb, i, &key);
				2666	if (ret && ret != -ENOENT)
				2667	break;
				2668	ret = 0;
				2669	} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
				2670	ret = replay_one_extent(wc->trans, root, path,
				2671	eb, i, &key);
				2672	if (ret)
				2673	break;
				2674	} else if (key.type == BTRFS_DIR_ITEM_KEY) {
				2675	ret = replay_one_dir_item(wc->trans, root, path,
				2676	eb, i, &key);
				2677	if (ret)
				2678	break;
				2679	}
				2680	}
				2681	btrfs_free_path(path);
				2682	return ret;
				2683	}
				2684
				2685	static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				2686	struct btrfs_root *root,
				2687	struct btrfs_path path, int level,
				2688	struct walk_control *wc)
				2689	{
				2690	struct btrfs_fs_info *fs_info = root->fs_info;
				2691	u64 root_owner;
				2692	u64 bytenr;
				2693	u64 ptr_gen;
				2694	struct extent_buffer *next;
				2695	struct extent_buffer *cur;
				2696	struct extent_buffer *parent;
				2697	u32 blocksize;
				2698	int ret = 0;
				2699
				2700	WARN_ON(*level < 0);
				2701	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				2702
				2703	while (*level > 0) {
				2704	struct btrfs_key first_key;
				2705
				2706	WARN_ON(*level < 0);
				2707	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				2708	cur = path->nodes[*level];
				2709
				2710	WARN_ON(btrfs_header_level(cur) != *level);
				2711
				2712	if (path->slots[*level] >=
				2713	btrfs_header_nritems(cur))
				2714	break;
				2715
				2716	bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
				2717	ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
				2718	btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
				2719	blocksize = fs_info->nodesize;
				2720
				2721	parent = path->nodes[*level];
				2722	root_owner = btrfs_header_owner(parent);
				2723
				2724	next = btrfs_find_create_tree_block(fs_info, bytenr);
				2725	if (IS_ERR(next))
				2726	return PTR_ERR(next);
				2727
				2728	if (*level == 1) {
				2729	ret = wc->process_func(root, next, wc, ptr_gen,
				2730	*level - 1);
				2731	if (ret) {
				2732	free_extent_buffer(next);
				2733	return ret;
				2734	}
				2735
				2736	path->slots[*level]++;
				2737	if (wc->free) {
				2738	ret = btrfs_read_buffer(next, ptr_gen,
				2739	*level - 1, &first_key);
				2740	if (ret) {
				2741	free_extent_buffer(next);
				2742	return ret;
				2743	}
				2744
				2745	if (trans) {
				2746	btrfs_tree_lock(next);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2747	btrfs_set_lock_blocking_write(next);
				2748	btrfs_clean_tree_block(next);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2749	btrfs_wait_tree_block_writeback(next);
				2750	btrfs_tree_unlock(next);
				2751	} else {
				2752	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
				2753	clear_extent_buffer_dirty(next);
				2754	}
				2755
				2756	WARN_ON(root_owner !=
				2757	BTRFS_TREE_LOG_OBJECTID);
				2758	ret = btrfs_free_and_pin_reserved_extent(
				2759	fs_info, bytenr,
				2760	blocksize);
				2761	if (ret) {
				2762	free_extent_buffer(next);
				2763	return ret;
				2764	}
				2765	}
				2766	free_extent_buffer(next);
				2767	continue;
				2768	}
				2769	ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
				2770	if (ret) {
				2771	free_extent_buffer(next);
				2772	return ret;
				2773	}
				2774
				2775	WARN_ON(*level <= 0);
				2776	if (path->nodes[*level-1])
				2777	free_extent_buffer(path->nodes[*level-1]);
				2778	path->nodes[*level-1] = next;
				2779	*level = btrfs_header_level(next);
				2780	path->slots[*level] = 0;
				2781	cond_resched();
				2782	}
				2783	WARN_ON(*level < 0);
				2784	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				2785
				2786	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
				2787
				2788	cond_resched();
				2789	return 0;
				2790	}
				2791
				2792	static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				2793	struct btrfs_root *root,
				2794	struct btrfs_path path, int level,
				2795	struct walk_control *wc)
				2796	{
				2797	struct btrfs_fs_info *fs_info = root->fs_info;
				2798	u64 root_owner;
				2799	int i;
				2800	int slot;
				2801	int ret;
				2802
				2803	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
				2804	slot = path->slots[i];
				2805	if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
				2806	path->slots[i]++;
				2807	*level = i;
				2808	WARN_ON(*level == 0);
				2809	return 0;
				2810	} else {
				2811	struct extent_buffer *parent;
				2812	if (path->nodes[*level] == root->node)
				2813	parent = path->nodes[*level];
				2814	else
				2815	parent = path->nodes[*level + 1];
				2816
				2817	root_owner = btrfs_header_owner(parent);
				2818	ret = wc->process_func(root, path->nodes[*level], wc,
				2819	btrfs_header_generation(path->nodes[*level]),
				2820	*level);
				2821	if (ret)
				2822	return ret;
				2823
				2824	if (wc->free) {
				2825	struct extent_buffer *next;
				2826
				2827	next = path->nodes[*level];
				2828
				2829	if (trans) {
				2830	btrfs_tree_lock(next);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2831	btrfs_set_lock_blocking_write(next);
				2832	btrfs_clean_tree_block(next);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2833	btrfs_wait_tree_block_writeback(next);
				2834	btrfs_tree_unlock(next);
				2835	} else {
				2836	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
				2837	clear_extent_buffer_dirty(next);
				2838	}
				2839
				2840	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				2841	ret = btrfs_free_and_pin_reserved_extent(
				2842	fs_info,
				2843	path->nodes[*level]->start,
				2844	path->nodes[*level]->len);
				2845	if (ret)
				2846	return ret;
				2847	}
				2848	free_extent_buffer(path->nodes[*level]);
				2849	path->nodes[*level] = NULL;
				2850	*level = i + 1;
				2851	}
				2852	}
				2853	return 1;
				2854	}
				2855
				2856	/*
				2857	* drop the reference count on the tree rooted at 'snap'. This traverses
				2858	* the tree freeing any blocks that have a ref count of zero after being
				2859	* decremented.
				2860	*/
				2861	static int walk_log_tree(struct btrfs_trans_handle *trans,
				2862	struct btrfs_root log, struct walk_control wc)
				2863	{
				2864	struct btrfs_fs_info *fs_info = log->fs_info;
				2865	int ret = 0;
				2866	int wret;
				2867	int level;
				2868	struct btrfs_path *path;
				2869	int orig_level;
				2870
				2871	path = btrfs_alloc_path();
				2872	if (!path)
				2873	return -ENOMEM;
				2874
				2875	level = btrfs_header_level(log->node);
				2876	orig_level = level;
				2877	path->nodes[level] = log->node;
				2878	extent_buffer_get(log->node);
				2879	path->slots[level] = 0;
				2880
				2881	while (1) {
				2882	wret = walk_down_log_tree(trans, log, path, &level, wc);
				2883	if (wret > 0)
				2884	break;
				2885	if (wret < 0) {
				2886	ret = wret;
				2887	goto out;
				2888	}
				2889
				2890	wret = walk_up_log_tree(trans, log, path, &level, wc);
				2891	if (wret > 0)
				2892	break;
				2893	if (wret < 0) {
				2894	ret = wret;
				2895	goto out;
				2896	}
				2897	}
				2898
				2899	/* was the root node processed? if not, catch it here */
				2900	if (path->nodes[orig_level]) {
				2901	ret = wc->process_func(log, path->nodes[orig_level], wc,
				2902	btrfs_header_generation(path->nodes[orig_level]),
				2903	orig_level);
				2904	if (ret)
				2905	goto out;
				2906	if (wc->free) {
				2907	struct extent_buffer *next;
				2908
				2909	next = path->nodes[orig_level];
				2910
				2911	if (trans) {
				2912	btrfs_tree_lock(next);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2913	btrfs_set_lock_blocking_write(next);
				2914	btrfs_clean_tree_block(next);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2915	btrfs_wait_tree_block_writeback(next);
				2916	btrfs_tree_unlock(next);
				2917	} else {
				2918	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
				2919	clear_extent_buffer_dirty(next);
				2920	}
				2921
				2922	WARN_ON(log->root_key.objectid !=
				2923	BTRFS_TREE_LOG_OBJECTID);
				2924	ret = btrfs_free_and_pin_reserved_extent(fs_info,
				2925	next->start, next->len);
				2926	if (ret)
				2927	goto out;
				2928	}
				2929	}
				2930
				2931	out:
				2932	btrfs_free_path(path);
				2933	return ret;
				2934	}
				2935
				2936	/*
				2937	* helper function to update the item for a given subvolumes log root
				2938	* in the tree of log roots
				2939	*/
				2940	static int update_log_root(struct btrfs_trans_handle *trans,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2941	struct btrfs_root *log,
				2942	struct btrfs_root_item *root_item)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2943	{
				2944	struct btrfs_fs_info *fs_info = log->fs_info;
				2945	int ret;
				2946
				2947	if (log->log_transid == 1) {
				2948	/* insert root item on the first sync */
				2949	ret = btrfs_insert_root(trans, fs_info->log_root_tree,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2950	&log->root_key, root_item);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2951	} else {
				2952	ret = btrfs_update_root(trans, fs_info->log_root_tree,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2953	&log->root_key, root_item);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2954	}
				2955	return ret;
				2956	}
				2957
				2958	static void wait_log_commit(struct btrfs_root *root, int transid)
				2959	{
				2960	DEFINE_WAIT(wait);
				2961	int index = transid % 2;
				2962
				2963	/*
				2964	* we only allow two pending log transactions at a time,
				2965	* so we know that if ours is more than 2 older than the
				2966	* current transaction, we're done
				2967	*/
				2968	for (;;) {
				2969	prepare_to_wait(&root->log_commit_wait[index],
				2970	&wait, TASK_UNINTERRUPTIBLE);
				2971
				2972	if (!(root->log_transid_committed < transid &&
				2973	atomic_read(&root->log_commit[index])))
				2974	break;
				2975
				2976	mutex_unlock(&root->log_mutex);
				2977	schedule();
				2978	mutex_lock(&root->log_mutex);
				2979	}
				2980	finish_wait(&root->log_commit_wait[index], &wait);
				2981	}
				2982
				2983	static void wait_for_writer(struct btrfs_root *root)
				2984	{
				2985	DEFINE_WAIT(wait);
				2986
				2987	for (;;) {
				2988	prepare_to_wait(&root->log_writer_wait, &wait,
				2989	TASK_UNINTERRUPTIBLE);
				2990	if (!atomic_read(&root->log_writers))
				2991	break;
				2992
				2993	mutex_unlock(&root->log_mutex);
				2994	schedule();
				2995	mutex_lock(&root->log_mutex);
				2996	}
				2997	finish_wait(&root->log_writer_wait, &wait);
				2998	}
				2999
				3000	static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
				3001	struct btrfs_log_ctx *ctx)
				3002	{
				3003	if (!ctx)
				3004	return;
				3005
				3006	mutex_lock(&root->log_mutex);
				3007	list_del_init(&ctx->list);
				3008	mutex_unlock(&root->log_mutex);
				3009	}
				3010
				3011	/*
				3012	* Invoked in log mutex context, or be sure there is no other task which
				3013	* can access the list.
				3014	*/
				3015	static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
				3016	int index, int error)
				3017	{
				3018	struct btrfs_log_ctx *ctx;
				3019	struct btrfs_log_ctx *safe;
				3020
				3021	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
				3022	list_del_init(&ctx->list);
				3023	ctx->log_ret = error;
				3024	}
				3025
				3026	INIT_LIST_HEAD(&root->log_ctxs[index]);
				3027	}
				3028
				3029	/*
				3030	* btrfs_sync_log does sends a given tree log down to the disk and
				3031	* updates the super blocks to record it. When this call is done,
				3032	* you know that any inodes previously logged are safely on disk only
				3033	* if it returns 0.
				3034	*
				3035	* Any other return value means you need to call btrfs_commit_transaction.
				3036	* Some of the edge cases for fsyncing directories that have had unlinks
				3037	* or renames done in the past mean that sometimes the only safe
				3038	* fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
				3039	* that has happened.
				3040	*/
				3041	int btrfs_sync_log(struct btrfs_trans_handle *trans,
				3042	struct btrfs_root root, struct btrfs_log_ctx ctx)
				3043	{
				3044	int index1;
				3045	int index2;
				3046	int mark;
				3047	int ret;
				3048	struct btrfs_fs_info *fs_info = root->fs_info;
				3049	struct btrfs_root *log = root->log_root;
				3050	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3051	struct btrfs_root_item new_root_item;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3052	int log_transid = 0;
				3053	struct btrfs_log_ctx root_log_ctx;
				3054	struct blk_plug plug;
				3055
				3056	mutex_lock(&root->log_mutex);
				3057	log_transid = ctx->log_transid;
				3058	if (root->log_transid_committed >= log_transid) {
				3059	mutex_unlock(&root->log_mutex);
				3060	return ctx->log_ret;
				3061	}
				3062
				3063	index1 = log_transid % 2;
				3064	if (atomic_read(&root->log_commit[index1])) {
				3065	wait_log_commit(root, log_transid);
				3066	mutex_unlock(&root->log_mutex);
				3067	return ctx->log_ret;
				3068	}
				3069	ASSERT(log_transid == root->log_transid);
				3070	atomic_set(&root->log_commit[index1], 1);
				3071
				3072	/* wait for previous tree log sync to complete */
				3073	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
				3074	wait_log_commit(root, log_transid - 1);
				3075
				3076	while (1) {
				3077	int batch = atomic_read(&root->log_batch);
				3078	/* when we're on an ssd, just kick the log commit out */
				3079	if (!btrfs_test_opt(fs_info, SSD) &&
				3080	test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
				3081	mutex_unlock(&root->log_mutex);
				3082	schedule_timeout_uninterruptible(1);
				3083	mutex_lock(&root->log_mutex);
				3084	}
				3085	wait_for_writer(root);
				3086	if (batch == atomic_read(&root->log_batch))
				3087	break;
				3088	}
				3089
				3090	/* bail out if we need to do a full commit */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3091	if (btrfs_need_log_full_commit(trans)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3092	ret = -EAGAIN;
				3093	mutex_unlock(&root->log_mutex);
				3094	goto out;
				3095	}
				3096
				3097	if (log_transid % 2 == 0)
				3098	mark = EXTENT_DIRTY;
				3099	else
				3100	mark = EXTENT_NEW;
				3101
				3102	/* we start IO on all the marked extents here, but we don't actually
				3103	* wait for them until later.
				3104	*/
				3105	blk_start_plug(&plug);
				3106	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
				3107	if (ret) {
				3108	blk_finish_plug(&plug);
				3109	btrfs_abort_transaction(trans, ret);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3110	btrfs_set_log_full_commit(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3111	mutex_unlock(&root->log_mutex);
				3112	goto out;
				3113	}
				3114
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3115	/*
				3116	* We _must_ update under the root->log_mutex in order to make sure we
				3117	* have a consistent view of the log root we are trying to commit at
				3118	* this moment.
				3119	*
				3120	* We _must_ copy this into a local copy, because we are not holding the
				3121	* log_root_tree->log_mutex yet. This is important because when we
				3122	* commit the log_root_tree we must have a consistent view of the
				3123	* log_root_tree when we update the super block to point at the
				3124	* log_root_tree bytenr. If we update the log_root_tree here we'll race
				3125	* with the commit and possibly point at the new block which we may not
				3126	* have written out.
				3127	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3128	btrfs_set_root_node(&log->root_item, log->node);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3129	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3130
				3131	root->log_transid++;
				3132	log->log_transid = root->log_transid;
				3133	root->log_start_pid = 0;
				3134	/*
				3135	* IO has been started, blocks of the log tree have WRITTEN flag set
				3136	* in their headers. new modifications of the log will be written to
				3137	* new positions. so it's safe to allow log writers to go in.
				3138	*/
				3139	mutex_unlock(&root->log_mutex);
				3140
				3141	btrfs_init_log_ctx(&root_log_ctx, NULL);
				3142
				3143	mutex_lock(&log_root_tree->log_mutex);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3144
				3145	index2 = log_root_tree->log_transid % 2;
				3146	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
				3147	root_log_ctx.log_transid = log_root_tree->log_transid;
				3148
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3149	/*
				3150	* Now we are safe to update the log_root_tree because we're under the
				3151	* log_mutex, and we're a current writer so we're holding the commit
				3152	* open until we drop the log_mutex.
				3153	*/
				3154	ret = update_log_root(trans, log, &new_root_item);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3155	if (ret) {
				3156	if (!list_empty(&root_log_ctx.list))
				3157	list_del_init(&root_log_ctx.list);
				3158
				3159	blk_finish_plug(&plug);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3160	btrfs_set_log_full_commit(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3161
				3162	if (ret != -ENOSPC) {
				3163	btrfs_abort_transaction(trans, ret);
				3164	mutex_unlock(&log_root_tree->log_mutex);
				3165	goto out;
				3166	}
				3167	btrfs_wait_tree_log_extents(log, mark);
				3168	mutex_unlock(&log_root_tree->log_mutex);
				3169	ret = -EAGAIN;
				3170	goto out;
				3171	}
				3172
				3173	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
				3174	blk_finish_plug(&plug);
				3175	list_del_init(&root_log_ctx.list);
				3176	mutex_unlock(&log_root_tree->log_mutex);
				3177	ret = root_log_ctx.log_ret;
				3178	goto out;
				3179	}
				3180
				3181	index2 = root_log_ctx.log_transid % 2;
				3182	if (atomic_read(&log_root_tree->log_commit[index2])) {
				3183	blk_finish_plug(&plug);
				3184	ret = btrfs_wait_tree_log_extents(log, mark);
				3185	wait_log_commit(log_root_tree,
				3186	root_log_ctx.log_transid);
				3187	mutex_unlock(&log_root_tree->log_mutex);
				3188	if (!ret)
				3189	ret = root_log_ctx.log_ret;
				3190	goto out;
				3191	}
				3192	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
				3193	atomic_set(&log_root_tree->log_commit[index2], 1);
				3194
				3195	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
				3196	wait_log_commit(log_root_tree,
				3197	root_log_ctx.log_transid - 1);
				3198	}
				3199
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3200	/*
				3201	* now that we've moved on to the tree of log tree roots,
				3202	* check the full commit flag again
				3203	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3204	if (btrfs_need_log_full_commit(trans)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3205	blk_finish_plug(&plug);
				3206	btrfs_wait_tree_log_extents(log, mark);
				3207	mutex_unlock(&log_root_tree->log_mutex);
				3208	ret = -EAGAIN;
				3209	goto out_wake_log_root;
				3210	}
				3211
				3212	ret = btrfs_write_marked_extents(fs_info,
				3213	&log_root_tree->dirty_log_pages,
				3214	EXTENT_DIRTY \| EXTENT_NEW);
				3215	blk_finish_plug(&plug);
				3216	if (ret) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3217	btrfs_set_log_full_commit(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3218	btrfs_abort_transaction(trans, ret);
				3219	mutex_unlock(&log_root_tree->log_mutex);
				3220	goto out_wake_log_root;
				3221	}
				3222	ret = btrfs_wait_tree_log_extents(log, mark);
				3223	if (!ret)
				3224	ret = btrfs_wait_tree_log_extents(log_root_tree,
				3225	EXTENT_NEW \| EXTENT_DIRTY);
				3226	if (ret) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3227	btrfs_set_log_full_commit(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3228	mutex_unlock(&log_root_tree->log_mutex);
				3229	goto out_wake_log_root;
				3230	}
				3231
				3232	btrfs_set_super_log_root(fs_info->super_for_commit,
				3233	log_root_tree->node->start);
				3234	btrfs_set_super_log_root_level(fs_info->super_for_commit,
				3235	btrfs_header_level(log_root_tree->node));
				3236
				3237	log_root_tree->log_transid++;
				3238	mutex_unlock(&log_root_tree->log_mutex);
				3239
				3240	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3241	* Nobody else is going to jump in and write the ctree
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3242	* super here because the log_commit atomic below is protecting
				3243	* us. We must be called with a transaction handle pinning
				3244	* the running transaction open, so a full commit can't hop
				3245	* in and cause problems either.
				3246	*/
				3247	ret = write_all_supers(fs_info, 1);
				3248	if (ret) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3249	btrfs_set_log_full_commit(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3250	btrfs_abort_transaction(trans, ret);
				3251	goto out_wake_log_root;
				3252	}
				3253
				3254	mutex_lock(&root->log_mutex);
				3255	if (root->last_log_commit < log_transid)
				3256	root->last_log_commit = log_transid;
				3257	mutex_unlock(&root->log_mutex);
				3258
				3259	out_wake_log_root:
				3260	mutex_lock(&log_root_tree->log_mutex);
				3261	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
				3262
				3263	log_root_tree->log_transid_committed++;
				3264	atomic_set(&log_root_tree->log_commit[index2], 0);
				3265	mutex_unlock(&log_root_tree->log_mutex);
				3266
				3267	/*
				3268	* The barrier before waitqueue_active (in cond_wake_up) is needed so
				3269	* all the updates above are seen by the woken threads. It might not be
				3270	* necessary, but proving that seems to be hard.
				3271	*/
				3272	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
				3273	out:
				3274	mutex_lock(&root->log_mutex);
				3275	btrfs_remove_all_log_ctxs(root, index1, ret);
				3276	root->log_transid_committed++;
				3277	atomic_set(&root->log_commit[index1], 0);
				3278	mutex_unlock(&root->log_mutex);
				3279
				3280	/*
				3281	* The barrier before waitqueue_active (in cond_wake_up) is needed so
				3282	* all the updates above are seen by the woken threads. It might not be
				3283	* necessary, but proving that seems to be hard.
				3284	*/
				3285	cond_wake_up(&root->log_commit_wait[index1]);
				3286	return ret;
				3287	}
				3288
				3289	static void free_log_tree(struct btrfs_trans_handle *trans,
				3290	struct btrfs_root *log)
				3291	{
				3292	int ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3293	struct walk_control wc = {
				3294	.free = 1,
				3295	.process_func = process_one_buffer
				3296	};
				3297
				3298	ret = walk_log_tree(trans, log, &wc);
				3299	if (ret) {
				3300	if (trans)
				3301	btrfs_abort_transaction(trans, ret);
				3302	else
				3303	btrfs_handle_fs_error(log->fs_info, ret, NULL);
				3304	}
				3305
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3306	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
				3307	EXTENT_DIRTY \| EXTENT_NEW \| EXTENT_NEED_WAIT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3308	free_extent_buffer(log->node);
				3309	kfree(log);
				3310	}
				3311
				3312	/*
				3313	* free all the extents used by the tree log. This should be called
				3314	* at commit time of the full transaction
				3315	*/
				3316	int btrfs_free_log(struct btrfs_trans_handle trans, struct btrfs_root root)
				3317	{
				3318	if (root->log_root) {
				3319	free_log_tree(trans, root->log_root);
				3320	root->log_root = NULL;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3321	clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3322	}
				3323	return 0;
				3324	}
				3325
				3326	int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
				3327	struct btrfs_fs_info *fs_info)
				3328	{
				3329	if (fs_info->log_root_tree) {
				3330	free_log_tree(trans, fs_info->log_root_tree);
				3331	fs_info->log_root_tree = NULL;
				3332	}
				3333	return 0;
				3334	}
				3335
				3336	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3337	* Check if an inode was logged in the current transaction. We can't always rely
				3338	* on an inode's logged_trans value, because it's an in-memory only field and
				3339	* therefore not persisted. This means that its value is lost if the inode gets
				3340	* evicted and loaded again from disk (in which case it has a value of 0, and
				3341	* certainly it is smaller then any possible transaction ID), when that happens
				3342	* the full_sync flag is set in the inode's runtime flags, so on that case we
				3343	* assume eviction happened and ignore the logged_trans value, assuming the
				3344	* worst case, that the inode was logged before in the current transaction.
				3345	*/
				3346	static bool inode_logged(struct btrfs_trans_handle *trans,
				3347	struct btrfs_inode *inode)
				3348	{
				3349	if (inode->logged_trans == trans->transid)
				3350	return true;
				3351
				3352	if (inode->last_trans == trans->transid &&
				3353	test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
				3354	!test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
				3355	return true;
				3356
				3357	return false;
				3358	}
				3359
				3360	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3361	* If both a file and directory are logged, and unlinks or renames are
				3362	* mixed in, we have a few interesting corners:
				3363	*
				3364	* create file X in dir Y
				3365	* link file X to X.link in dir Y
				3366	* fsync file X
				3367	* unlink file X but leave X.link
				3368	* fsync dir Y
				3369	*
				3370	* After a crash we would expect only X.link to exist. But file X
				3371	* didn't get fsync'd again so the log has back refs for X and X.link.
				3372	*
				3373	* We solve this by removing directory entries and inode backrefs from the
				3374	* log when a file that was logged in the current transaction is
				3375	* unlinked. Any later fsync will include the updated log entries, and
				3376	* we'll be able to reconstruct the proper directory items from backrefs.
				3377	*
				3378	* This optimizations allows us to avoid relogging the entire inode
				3379	* or the entire directory.
				3380	*/
				3381	int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				3382	struct btrfs_root *root,
				3383	const char *name, int name_len,
				3384	struct btrfs_inode *dir, u64 index)
				3385	{
				3386	struct btrfs_root *log;
				3387	struct btrfs_dir_item *di;
				3388	struct btrfs_path *path;
				3389	int ret;
				3390	int err = 0;
				3391	int bytes_del = 0;
				3392	u64 dir_ino = btrfs_ino(dir);
				3393
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3394	if (!inode_logged(trans, dir))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3395	return 0;
				3396
				3397	ret = join_running_log_trans(root);
				3398	if (ret)
				3399	return 0;
				3400
				3401	mutex_lock(&dir->log_mutex);
				3402
				3403	log = root->log_root;
				3404	path = btrfs_alloc_path();
				3405	if (!path) {
				3406	err = -ENOMEM;
				3407	goto out_unlock;
				3408	}
				3409
				3410	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
				3411	name, name_len, -1);
				3412	if (IS_ERR(di)) {
				3413	err = PTR_ERR(di);
				3414	goto fail;
				3415	}
				3416	if (di) {
				3417	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				3418	bytes_del += name_len;
				3419	if (ret) {
				3420	err = ret;
				3421	goto fail;
				3422	}
				3423	}
				3424	btrfs_release_path(path);
				3425	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
				3426	index, name, name_len, -1);
				3427	if (IS_ERR(di)) {
				3428	err = PTR_ERR(di);
				3429	goto fail;
				3430	}
				3431	if (di) {
				3432	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				3433	bytes_del += name_len;
				3434	if (ret) {
				3435	err = ret;
				3436	goto fail;
				3437	}
				3438	}
				3439
				3440	/* update the directory size in the log to reflect the names
				3441	* we have removed
				3442	*/
				3443	if (bytes_del) {
				3444	struct btrfs_key key;
				3445
				3446	key.objectid = dir_ino;
				3447	key.offset = 0;
				3448	key.type = BTRFS_INODE_ITEM_KEY;
				3449	btrfs_release_path(path);
				3450
				3451	ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
				3452	if (ret < 0) {
				3453	err = ret;
				3454	goto fail;
				3455	}
				3456	if (ret == 0) {
				3457	struct btrfs_inode_item *item;
				3458	u64 i_size;
				3459
				3460	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3461	struct btrfs_inode_item);
				3462	i_size = btrfs_inode_size(path->nodes[0], item);
				3463	if (i_size > bytes_del)
				3464	i_size -= bytes_del;
				3465	else
				3466	i_size = 0;
				3467	btrfs_set_inode_size(path->nodes[0], item, i_size);
				3468	btrfs_mark_buffer_dirty(path->nodes[0]);
				3469	} else
				3470	ret = 0;
				3471	btrfs_release_path(path);
				3472	}
				3473	fail:
				3474	btrfs_free_path(path);
				3475	out_unlock:
				3476	mutex_unlock(&dir->log_mutex);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3477	if (err == -ENOSPC) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3478	btrfs_set_log_full_commit(trans);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3479	err = 0;
				3480	} else if (err < 0 && err != -ENOENT) {
				3481	/* ENOENT can be returned if the entry hasn't been fsynced yet */
				3482	btrfs_abort_transaction(trans, err);
				3483	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3484
				3485	btrfs_end_log_trans(root);
				3486
				3487	return err;
				3488	}
				3489
				3490	/* see comments for btrfs_del_dir_entries_in_log */
				3491	int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
				3492	struct btrfs_root *root,
				3493	const char *name, int name_len,
				3494	struct btrfs_inode *inode, u64 dirid)
				3495	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3496	struct btrfs_root *log;
				3497	u64 index;
				3498	int ret;
				3499
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3500	if (!inode_logged(trans, inode))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3501	return 0;
				3502
				3503	ret = join_running_log_trans(root);
				3504	if (ret)
				3505	return 0;
				3506	log = root->log_root;
				3507	mutex_lock(&inode->log_mutex);
				3508
				3509	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
				3510	dirid, &index);
				3511	mutex_unlock(&inode->log_mutex);
				3512	if (ret == -ENOSPC) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3513	btrfs_set_log_full_commit(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3514	ret = 0;
				3515	} else if (ret < 0 && ret != -ENOENT)
				3516	btrfs_abort_transaction(trans, ret);
				3517	btrfs_end_log_trans(root);
				3518
				3519	return ret;
				3520	}
				3521
				3522	/*
				3523	* creates a range item in the log for 'dirid'. first_offset and
				3524	* last_offset tell us which parts of the key space the log should
				3525	* be considered authoritative for.
				3526	*/
				3527	static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
				3528	struct btrfs_root *log,
				3529	struct btrfs_path *path,
				3530	int key_type, u64 dirid,
				3531	u64 first_offset, u64 last_offset)
				3532	{
				3533	int ret;
				3534	struct btrfs_key key;
				3535	struct btrfs_dir_log_item *item;
				3536
				3537	key.objectid = dirid;
				3538	key.offset = first_offset;
				3539	if (key_type == BTRFS_DIR_ITEM_KEY)
				3540	key.type = BTRFS_DIR_LOG_ITEM_KEY;
				3541	else
				3542	key.type = BTRFS_DIR_LOG_INDEX_KEY;
				3543	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
				3544	if (ret)
				3545	return ret;
				3546
				3547	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3548	struct btrfs_dir_log_item);
				3549	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
				3550	btrfs_mark_buffer_dirty(path->nodes[0]);
				3551	btrfs_release_path(path);
				3552	return 0;
				3553	}
				3554
				3555	/*
				3556	* log all the items included in the current transaction for a given
				3557	* directory. This also creates the range items in the log tree required
				3558	* to replay anything deleted before the fsync
				3559	*/
				3560	static noinline int log_dir_items(struct btrfs_trans_handle *trans,
				3561	struct btrfs_root root, struct btrfs_inode inode,
				3562	struct btrfs_path *path,
				3563	struct btrfs_path *dst_path, int key_type,
				3564	struct btrfs_log_ctx *ctx,
				3565	u64 min_offset, u64 *last_offset_ret)
				3566	{
				3567	struct btrfs_key min_key;
				3568	struct btrfs_root *log = root->log_root;
				3569	struct extent_buffer *src;
				3570	int err = 0;
				3571	int ret;
				3572	int i;
				3573	int nritems;
				3574	u64 first_offset = min_offset;
				3575	u64 last_offset = (u64)-1;
				3576	u64 ino = btrfs_ino(inode);
				3577
				3578	log = root->log_root;
				3579
				3580	min_key.objectid = ino;
				3581	min_key.type = key_type;
				3582	min_key.offset = min_offset;
				3583
				3584	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
				3585
				3586	/*
				3587	* we didn't find anything from this transaction, see if there
				3588	* is anything at all
				3589	*/
				3590	if (ret != 0 \|\| min_key.objectid != ino \|\| min_key.type != key_type) {
				3591	min_key.objectid = ino;
				3592	min_key.type = key_type;
				3593	min_key.offset = (u64)-1;
				3594	btrfs_release_path(path);
				3595	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				3596	if (ret < 0) {
				3597	btrfs_release_path(path);
				3598	return ret;
				3599	}
				3600	ret = btrfs_previous_item(root, path, ino, key_type);
				3601
				3602	/* if ret == 0 there are items for this type,
				3603	* create a range to tell us the last key of this type.
				3604	* otherwise, there are no items in this directory after
				3605	* *min_offset, and we create a range to indicate that.
				3606	*/
				3607	if (ret == 0) {
				3608	struct btrfs_key tmp;
				3609	btrfs_item_key_to_cpu(path->nodes[0], &tmp,
				3610	path->slots[0]);
				3611	if (key_type == tmp.type)
				3612	first_offset = max(min_offset, tmp.offset) + 1;
				3613	}
				3614	goto done;
				3615	}
				3616
				3617	/* go backward to find any previous key */
				3618	ret = btrfs_previous_item(root, path, ino, key_type);
				3619	if (ret == 0) {
				3620	struct btrfs_key tmp;
				3621	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				3622	if (key_type == tmp.type) {
				3623	first_offset = tmp.offset;
				3624	ret = overwrite_item(trans, log, dst_path,
				3625	path->nodes[0], path->slots[0],
				3626	&tmp);
				3627	if (ret) {
				3628	err = ret;
				3629	goto done;
				3630	}
				3631	}
				3632	}
				3633	btrfs_release_path(path);
				3634
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3635	/*
				3636	* Find the first key from this transaction again. See the note for
				3637	* log_new_dir_dentries, if we're logging a directory recursively we
				3638	* won't be holding its i_mutex, which means we can modify the directory
				3639	* while we're logging it. If we remove an entry between our first
				3640	* search and this search we'll not find the key again and can just
				3641	* bail.
				3642	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3643	search:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3644	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3645	if (ret != 0)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3646	goto done;
				3647
				3648	/*
				3649	* we have a block from this transaction, log every item in it
				3650	* from our directory
				3651	*/
				3652	while (1) {
				3653	struct btrfs_key tmp;
				3654	src = path->nodes[0];
				3655	nritems = btrfs_header_nritems(src);
				3656	for (i = path->slots[0]; i < nritems; i++) {
				3657	struct btrfs_dir_item *di;
				3658
				3659	btrfs_item_key_to_cpu(src, &min_key, i);
				3660
				3661	if (min_key.objectid != ino \|\| min_key.type != key_type)
				3662	goto done;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3663
				3664	if (need_resched()) {
				3665	btrfs_release_path(path);
				3666	cond_resched();
				3667	goto search;
				3668	}
				3669
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3670	ret = overwrite_item(trans, log, dst_path, src, i,
				3671	&min_key);
				3672	if (ret) {
				3673	err = ret;
				3674	goto done;
				3675	}
				3676
				3677	/*
				3678	* We must make sure that when we log a directory entry,
				3679	* the corresponding inode, after log replay, has a
				3680	* matching link count. For example:
				3681	*
				3682	* touch foo
				3683	* mkdir mydir
				3684	* sync
				3685	* ln foo mydir/bar
				3686	* xfs_io -c "fsync" mydir
				3687	* <crash>
				3688	* <mount fs and log replay>
				3689	*
				3690	* Would result in a fsync log that when replayed, our
				3691	* file inode would have a link count of 1, but we get
				3692	* two directory entries pointing to the same inode.
				3693	* After removing one of the names, it would not be
				3694	* possible to remove the other name, which resulted
				3695	* always in stale file handle errors, and would not
				3696	* be possible to rmdir the parent directory, since
				3697	* its i_size could never decrement to the value
				3698	* BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
				3699	*/
				3700	di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
				3701	btrfs_dir_item_key_to_cpu(src, di, &tmp);
				3702	if (ctx &&
				3703	(btrfs_dir_transid(src, di) == trans->transid \|\|
				3704	btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
				3705	tmp.type != BTRFS_ROOT_ITEM_KEY)
				3706	ctx->log_new_dentries = true;
				3707	}
				3708	path->slots[0] = nritems;
				3709
				3710	/*
				3711	* look ahead to the next item and see if it is also
				3712	* from this directory and from this transaction
				3713	*/
				3714	ret = btrfs_next_leaf(root, path);
				3715	if (ret) {
				3716	if (ret == 1)
				3717	last_offset = (u64)-1;
				3718	else
				3719	err = ret;
				3720	goto done;
				3721	}
				3722	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				3723	if (tmp.objectid != ino \|\| tmp.type != key_type) {
				3724	last_offset = (u64)-1;
				3725	goto done;
				3726	}
				3727	if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
				3728	ret = overwrite_item(trans, log, dst_path,
				3729	path->nodes[0], path->slots[0],
				3730	&tmp);
				3731	if (ret)
				3732	err = ret;
				3733	else
				3734	last_offset = tmp.offset;
				3735	goto done;
				3736	}
				3737	}
				3738	done:
				3739	btrfs_release_path(path);
				3740	btrfs_release_path(dst_path);
				3741
				3742	if (err == 0) {
				3743	*last_offset_ret = last_offset;
				3744	/*
				3745	* insert the log range keys to indicate where the log
				3746	* is valid
				3747	*/
				3748	ret = insert_dir_log_key(trans, log, path, key_type,
				3749	ino, first_offset, last_offset);
				3750	if (ret)
				3751	err = ret;
				3752	}
				3753	return err;
				3754	}
				3755
				3756	/*
				3757	* logging directories is very similar to logging inodes, We find all the items
				3758	* from the current transaction and write them to the log.
				3759	*
				3760	* The recovery code scans the directory in the subvolume, and if it finds a
				3761	* key in the range logged that is not present in the log tree, then it means
				3762	* that dir entry was unlinked during the transaction.
				3763	*
				3764	* In order for that scan to work, we must include one key smaller than
				3765	* the smallest logged by this transaction and one key larger than the largest
				3766	* key logged by this transaction.
				3767	*/
				3768	static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
				3769	struct btrfs_root root, struct btrfs_inode inode,
				3770	struct btrfs_path *path,
				3771	struct btrfs_path *dst_path,
				3772	struct btrfs_log_ctx *ctx)
				3773	{
				3774	u64 min_key;
				3775	u64 max_key;
				3776	int ret;
				3777	int key_type = BTRFS_DIR_ITEM_KEY;
				3778
				3779	again:
				3780	min_key = 0;
				3781	max_key = 0;
				3782	while (1) {
				3783	ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
				3784	ctx, min_key, &max_key);
				3785	if (ret)
				3786	return ret;
				3787	if (max_key == (u64)-1)
				3788	break;
				3789	min_key = max_key + 1;
				3790	}
				3791
				3792	if (key_type == BTRFS_DIR_ITEM_KEY) {
				3793	key_type = BTRFS_DIR_INDEX_KEY;
				3794	goto again;
				3795	}
				3796	return 0;
				3797	}
				3798
				3799	/*
				3800	* a helper function to drop items from the log before we relog an
				3801	* inode. max_key_type indicates the highest item type to remove.
				3802	* This cannot be run for file data extents because it does not
				3803	* free the extents they point to.
				3804	*/
				3805	static int drop_objectid_items(struct btrfs_trans_handle *trans,
				3806	struct btrfs_root *log,
				3807	struct btrfs_path *path,
				3808	u64 objectid, int max_key_type)
				3809	{
				3810	int ret;
				3811	struct btrfs_key key;
				3812	struct btrfs_key found_key;
				3813	int start_slot;
				3814
				3815	key.objectid = objectid;
				3816	key.type = max_key_type;
				3817	key.offset = (u64)-1;
				3818
				3819	while (1) {
				3820	ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
				3821	BUG_ON(ret == 0); /* Logic error */
				3822	if (ret < 0)
				3823	break;
				3824
				3825	if (path->slots[0] == 0)
				3826	break;
				3827
				3828	path->slots[0]--;
				3829	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				3830	path->slots[0]);
				3831
				3832	if (found_key.objectid != objectid)
				3833	break;
				3834
				3835	found_key.offset = 0;
				3836	found_key.type = 0;
				3837	ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
				3838	&start_slot);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3839	if (ret < 0)
				3840	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3841
				3842	ret = btrfs_del_items(trans, log, path, start_slot,
				3843	path->slots[0] - start_slot + 1);
				3844	/*
				3845	* If start slot isn't 0 then we don't need to re-search, we've
				3846	* found the last guy with the objectid in this tree.
				3847	*/
				3848	if (ret \|\| start_slot != 0)
				3849	break;
				3850	btrfs_release_path(path);
				3851	}
				3852	btrfs_release_path(path);
				3853	if (ret > 0)
				3854	ret = 0;
				3855	return ret;
				3856	}
				3857
				3858	static void fill_inode_item(struct btrfs_trans_handle *trans,
				3859	struct extent_buffer *leaf,
				3860	struct btrfs_inode_item *item,
				3861	struct inode *inode, int log_inode_only,
				3862	u64 logged_isize)
				3863	{
				3864	struct btrfs_map_token token;
				3865
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3866	btrfs_init_map_token(&token, leaf);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3867
				3868	if (log_inode_only) {
				3869	/* set the generation to zero so the recover code
				3870	* can tell the difference between an logging
				3871	* just to say 'this inode exists' and a logging
				3872	* to say 'update this inode with these values'
				3873	*/
				3874	btrfs_set_token_inode_generation(leaf, item, 0, &token);
				3875	btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
				3876	} else {
				3877	btrfs_set_token_inode_generation(leaf, item,
				3878	BTRFS_I(inode)->generation,
				3879	&token);
				3880	btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
				3881	}
				3882
				3883	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
				3884	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
				3885	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
				3886	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
				3887
				3888	btrfs_set_token_timespec_sec(leaf, &item->atime,
				3889	inode->i_atime.tv_sec, &token);
				3890	btrfs_set_token_timespec_nsec(leaf, &item->atime,
				3891	inode->i_atime.tv_nsec, &token);
				3892
				3893	btrfs_set_token_timespec_sec(leaf, &item->mtime,
				3894	inode->i_mtime.tv_sec, &token);
				3895	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
				3896	inode->i_mtime.tv_nsec, &token);
				3897
				3898	btrfs_set_token_timespec_sec(leaf, &item->ctime,
				3899	inode->i_ctime.tv_sec, &token);
				3900	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
				3901	inode->i_ctime.tv_nsec, &token);
				3902
				3903	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				3904	&token);
				3905
				3906	btrfs_set_token_inode_sequence(leaf, item,
				3907	inode_peek_iversion(inode), &token);
				3908	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
				3909	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
				3910	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
				3911	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
				3912	}
				3913
				3914	static int log_inode_item(struct btrfs_trans_handle *trans,
				3915	struct btrfs_root log, struct btrfs_path path,
				3916	struct btrfs_inode *inode)
				3917	{
				3918	struct btrfs_inode_item *inode_item;
				3919	int ret;
				3920
				3921	ret = btrfs_insert_empty_item(trans, log, path,
				3922	&inode->location, sizeof(*inode_item));
				3923	if (ret && ret != -EEXIST)
				3924	return ret;
				3925	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3926	struct btrfs_inode_item);
				3927	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
				3928	0, 0);
				3929	btrfs_release_path(path);
				3930	return 0;
				3931	}
				3932
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3933	static int log_csums(struct btrfs_trans_handle *trans,
				3934	struct btrfs_root *log_root,
				3935	struct btrfs_ordered_sum *sums)
				3936	{
				3937	int ret;
				3938
				3939	/*
				3940	* Due to extent cloning, we might have logged a csum item that covers a
				3941	* subrange of a cloned extent, and later we can end up logging a csum
				3942	* item for a larger subrange of the same extent or the entire range.
				3943	* This would leave csum items in the log tree that cover the same range
				3944	* and break the searches for checksums in the log tree, resulting in
				3945	* some checksums missing in the fs/subvolume tree. So just delete (or
				3946	* trim and adjust) any existing csum items in the log for this range.
				3947	*/
				3948	ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
				3949	if (ret)
				3950	return ret;
				3951
				3952	return btrfs_csum_file_blocks(trans, log_root, sums);
				3953	}
				3954
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3955	static noinline int copy_items(struct btrfs_trans_handle *trans,
				3956	struct btrfs_inode *inode,
				3957	struct btrfs_path *dst_path,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3958	struct btrfs_path *src_path,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3959	int start_slot, int nr, int inode_only,
				3960	u64 logged_isize)
				3961	{
				3962	struct btrfs_fs_info *fs_info = trans->fs_info;
				3963	unsigned long src_offset;
				3964	unsigned long dst_offset;
				3965	struct btrfs_root *log = inode->root->log_root;
				3966	struct btrfs_file_extent_item *extent;
				3967	struct btrfs_inode_item *inode_item;
				3968	struct extent_buffer *src = src_path->nodes[0];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3969	int ret;
				3970	struct btrfs_key *ins_keys;
				3971	u32 *ins_sizes;
				3972	char *ins_data;
				3973	int i;
				3974	struct list_head ordered_sums;
				3975	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3976
				3977	INIT_LIST_HEAD(&ordered_sums);
				3978
				3979	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
				3980	nr * sizeof(u32), GFP_NOFS);
				3981	if (!ins_data)
				3982	return -ENOMEM;
				3983
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3984	ins_sizes = (u32 *)ins_data;
				3985	ins_keys = (struct btrfs_key )(ins_data + nr sizeof(u32));
				3986
				3987	for (i = 0; i < nr; i++) {
				3988	ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
				3989	btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
				3990	}
				3991	ret = btrfs_insert_empty_items(trans, log, dst_path,
				3992	ins_keys, ins_sizes, nr);
				3993	if (ret) {
				3994	kfree(ins_data);
				3995	return ret;
				3996	}
				3997
				3998	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
				3999	dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
				4000	dst_path->slots[0]);
				4001
				4002	src_offset = btrfs_item_ptr_offset(src, start_slot + i);
				4003
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4004	if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
				4005	inode_item = btrfs_item_ptr(dst_path->nodes[0],
				4006	dst_path->slots[0],
				4007	struct btrfs_inode_item);
				4008	fill_inode_item(trans, dst_path->nodes[0], inode_item,
				4009	&inode->vfs_inode,
				4010	inode_only == LOG_INODE_EXISTS,
				4011	logged_isize);
				4012	} else {
				4013	copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
				4014	src_offset, ins_sizes[i]);
				4015	}
				4016
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4017	/* take a reference on file data extents so that truncates
				4018	* or deletes of this inode don't have to relog the inode
				4019	* again
				4020	*/
				4021	if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
				4022	!skip_csum) {
				4023	int found_type;
				4024	extent = btrfs_item_ptr(src, start_slot + i,
				4025	struct btrfs_file_extent_item);
				4026
				4027	if (btrfs_file_extent_generation(src, extent) < trans->transid)
				4028	continue;
				4029
				4030	found_type = btrfs_file_extent_type(src, extent);
				4031	if (found_type == BTRFS_FILE_EXTENT_REG) {
				4032	u64 ds, dl, cs, cl;
				4033	ds = btrfs_file_extent_disk_bytenr(src,
				4034	extent);
				4035	/* ds == 0 is a hole */
				4036	if (ds == 0)
				4037	continue;
				4038
				4039	dl = btrfs_file_extent_disk_num_bytes(src,
				4040	extent);
				4041	cs = btrfs_file_extent_offset(src, extent);
				4042	cl = btrfs_file_extent_num_bytes(src,
				4043	extent);
				4044	if (btrfs_file_extent_compression(src,
				4045	extent)) {
				4046	cs = 0;
				4047	cl = dl;
				4048	}
				4049
				4050	ret = btrfs_lookup_csums_range(
				4051	fs_info->csum_root,
				4052	ds + cs, ds + cs + cl - 1,
				4053	&ordered_sums, 0);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4054	if (ret)
				4055	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4056	}
				4057	}
				4058	}
				4059
				4060	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
				4061	btrfs_release_path(dst_path);
				4062	kfree(ins_data);
				4063
				4064	/*
				4065	* we have to do this after the loop above to avoid changing the
				4066	* log tree while trying to change the log tree.
				4067	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4068	while (!list_empty(&ordered_sums)) {
				4069	struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
				4070	struct btrfs_ordered_sum,
				4071	list);
				4072	if (!ret)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4073	ret = log_csums(trans, log, sums);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4074	list_del(&sums->list);
				4075	kfree(sums);
				4076	}
				4077
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4078	return ret;
				4079	}
				4080
				4081	static int extent_cmp(void priv, struct list_head a, struct list_head *b)
				4082	{
				4083	struct extent_map em1, em2;
				4084
				4085	em1 = list_entry(a, struct extent_map, list);
				4086	em2 = list_entry(b, struct extent_map, list);
				4087
				4088	if (em1->start < em2->start)
				4089	return -1;
				4090	else if (em1->start > em2->start)
				4091	return 1;
				4092	return 0;
				4093	}
				4094
				4095	static int log_extent_csums(struct btrfs_trans_handle *trans,
				4096	struct btrfs_inode *inode,
				4097	struct btrfs_root *log_root,
				4098	const struct extent_map *em)
				4099	{
				4100	u64 csum_offset;
				4101	u64 csum_len;
				4102	LIST_HEAD(ordered_sums);
				4103	int ret = 0;
				4104
				4105	if (inode->flags & BTRFS_INODE_NODATASUM \|\|
				4106	test_bit(EXTENT_FLAG_PREALLOC, &em->flags) \|\|
				4107	em->block_start == EXTENT_MAP_HOLE)
				4108	return 0;
				4109
				4110	/* If we're compressed we have to save the entire range of csums. */
				4111	if (em->compress_type) {
				4112	csum_offset = 0;
				4113	csum_len = max(em->block_len, em->orig_block_len);
				4114	} else {
				4115	csum_offset = em->mod_start - em->start;
				4116	csum_len = em->mod_len;
				4117	}
				4118
				4119	/* block start is already adjusted for the file extent offset. */
				4120	ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
				4121	em->block_start + csum_offset,
				4122	em->block_start + csum_offset +
				4123	csum_len - 1, &ordered_sums, 0);
				4124	if (ret)
				4125	return ret;
				4126
				4127	while (!list_empty(&ordered_sums)) {
				4128	struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
				4129	struct btrfs_ordered_sum,
				4130	list);
				4131	if (!ret)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4132	ret = log_csums(trans, log_root, sums);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4133	list_del(&sums->list);
				4134	kfree(sums);
				4135	}
				4136
				4137	return ret;
				4138	}
				4139
				4140	static int log_one_extent(struct btrfs_trans_handle *trans,
				4141	struct btrfs_inode inode, struct btrfs_root root,
				4142	const struct extent_map *em,
				4143	struct btrfs_path *path,
				4144	struct btrfs_log_ctx *ctx)
				4145	{
				4146	struct btrfs_root *log = root->log_root;
				4147	struct btrfs_file_extent_item *fi;
				4148	struct extent_buffer *leaf;
				4149	struct btrfs_map_token token;
				4150	struct btrfs_key key;
				4151	u64 extent_offset = em->start - em->orig_start;
				4152	u64 block_len;
				4153	int ret;
				4154	int extent_inserted = 0;
				4155
				4156	ret = log_extent_csums(trans, inode, log, em);
				4157	if (ret)
				4158	return ret;
				4159
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4160	ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
				4161	em->start + em->len, NULL, 0, 1,
				4162	sizeof(*fi), &extent_inserted);
				4163	if (ret)
				4164	return ret;
				4165
				4166	if (!extent_inserted) {
				4167	key.objectid = btrfs_ino(inode);
				4168	key.type = BTRFS_EXTENT_DATA_KEY;
				4169	key.offset = em->start;
				4170
				4171	ret = btrfs_insert_empty_item(trans, log, path, &key,
				4172	sizeof(*fi));
				4173	if (ret)
				4174	return ret;
				4175	}
				4176	leaf = path->nodes[0];
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4177	btrfs_init_map_token(&token, leaf);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4178	fi = btrfs_item_ptr(leaf, path->slots[0],
				4179	struct btrfs_file_extent_item);
				4180
				4181	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
				4182	&token);
				4183	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
				4184	btrfs_set_token_file_extent_type(leaf, fi,
				4185	BTRFS_FILE_EXTENT_PREALLOC,
				4186	&token);
				4187	else
				4188	btrfs_set_token_file_extent_type(leaf, fi,
				4189	BTRFS_FILE_EXTENT_REG,
				4190	&token);
				4191
				4192	block_len = max(em->block_len, em->orig_block_len);
				4193	if (em->compress_type != BTRFS_COMPRESS_NONE) {
				4194	btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
				4195	em->block_start,
				4196	&token);
				4197	btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
				4198	&token);
				4199	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				4200	btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
				4201	em->block_start -
				4202	extent_offset, &token);
				4203	btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
				4204	&token);
				4205	} else {
				4206	btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
				4207	btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
				4208	&token);
				4209	}
				4210
				4211	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
				4212	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
				4213	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
				4214	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
				4215	&token);
				4216	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
				4217	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
				4218	btrfs_mark_buffer_dirty(leaf);
				4219
				4220	btrfs_release_path(path);
				4221
				4222	return ret;
				4223	}
				4224
				4225	/*
				4226	* Log all prealloc extents beyond the inode's i_size to make sure we do not
				4227	* lose them after doing a fast fsync and replaying the log. We scan the
				4228	* subvolume's root instead of iterating the inode's extent map tree because
				4229	* otherwise we can log incorrect extent items based on extent map conversion.
				4230	* That can happen due to the fact that extent maps are merged when they
				4231	* are not in the extent map tree's list of modified extents.
				4232	*/
				4233	static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
				4234	struct btrfs_inode *inode,
				4235	struct btrfs_path *path)
				4236	{
				4237	struct btrfs_root *root = inode->root;
				4238	struct btrfs_key key;
				4239	const u64 i_size = i_size_read(&inode->vfs_inode);
				4240	const u64 ino = btrfs_ino(inode);
				4241	struct btrfs_path *dst_path = NULL;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4242	bool dropped_extents = false;
				4243	u64 truncate_offset = i_size;
				4244	struct extent_buffer *leaf;
				4245	int slot;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4246	int ins_nr = 0;
				4247	int start_slot;
				4248	int ret;
				4249
				4250	if (!(inode->flags & BTRFS_INODE_PREALLOC))
				4251	return 0;
				4252
				4253	key.objectid = ino;
				4254	key.type = BTRFS_EXTENT_DATA_KEY;
				4255	key.offset = i_size;
				4256	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4257	if (ret < 0)
				4258	goto out;
				4259
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4260	/*
				4261	* We must check if there is a prealloc extent that starts before the
				4262	* i_size and crosses the i_size boundary. This is to ensure later we
				4263	* truncate down to the end of that extent and not to the i_size, as
				4264	* otherwise we end up losing part of the prealloc extent after a log
				4265	* replay and with an implicit hole if there is another prealloc extent
				4266	* that starts at an offset beyond i_size.
				4267	*/
				4268	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
				4269	if (ret < 0)
				4270	goto out;
				4271
				4272	if (ret == 0) {
				4273	struct btrfs_file_extent_item *ei;
				4274
				4275	leaf = path->nodes[0];
				4276	slot = path->slots[0];
				4277	ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
				4278
				4279	if (btrfs_file_extent_type(leaf, ei) ==
				4280	BTRFS_FILE_EXTENT_PREALLOC) {
				4281	u64 extent_end;
				4282
				4283	btrfs_item_key_to_cpu(leaf, &key, slot);
				4284	extent_end = key.offset +
				4285	btrfs_file_extent_num_bytes(leaf, ei);
				4286
				4287	if (extent_end > i_size)
				4288	truncate_offset = extent_end;
				4289	}
				4290	} else {
				4291	ret = 0;
				4292	}
				4293
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4294	while (true) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4295	leaf = path->nodes[0];
				4296	slot = path->slots[0];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4297
				4298	if (slot >= btrfs_header_nritems(leaf)) {
				4299	if (ins_nr > 0) {
				4300	ret = copy_items(trans, inode, dst_path, path,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4301	start_slot, ins_nr, 1, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4302	if (ret < 0)
				4303	goto out;
				4304	ins_nr = 0;
				4305	}
				4306	ret = btrfs_next_leaf(root, path);
				4307	if (ret < 0)
				4308	goto out;
				4309	if (ret > 0) {
				4310	ret = 0;
				4311	break;
				4312	}
				4313	continue;
				4314	}
				4315
				4316	btrfs_item_key_to_cpu(leaf, &key, slot);
				4317	if (key.objectid > ino)
				4318	break;
				4319	if (WARN_ON_ONCE(key.objectid < ino) \|\|
				4320	key.type < BTRFS_EXTENT_DATA_KEY \|\|
				4321	key.offset < i_size) {
				4322	path->slots[0]++;
				4323	continue;
				4324	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4325	if (!dropped_extents) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4326	/*
				4327	* Avoid logging extent items logged in past fsync calls
				4328	* and leading to duplicate keys in the log tree.
				4329	*/
				4330	do {
				4331	ret = btrfs_truncate_inode_items(trans,
				4332	root->log_root,
				4333	&inode->vfs_inode,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4334	truncate_offset,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4335	BTRFS_EXTENT_DATA_KEY);
				4336	} while (ret == -EAGAIN);
				4337	if (ret)
				4338	goto out;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4339	dropped_extents = true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4340	}
				4341	if (ins_nr == 0)
				4342	start_slot = slot;
				4343	ins_nr++;
				4344	path->slots[0]++;
				4345	if (!dst_path) {
				4346	dst_path = btrfs_alloc_path();
				4347	if (!dst_path) {
				4348	ret = -ENOMEM;
				4349	goto out;
				4350	}
				4351	}
				4352	}
				4353	if (ins_nr > 0) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4354	ret = copy_items(trans, inode, dst_path, path,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4355	start_slot, ins_nr, 1, 0);
				4356	if (ret > 0)
				4357	ret = 0;
				4358	}
				4359	out:
				4360	btrfs_release_path(path);
				4361	btrfs_free_path(dst_path);
				4362	return ret;
				4363	}
				4364
				4365	static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				4366	struct btrfs_root *root,
				4367	struct btrfs_inode *inode,
				4368	struct btrfs_path *path,
				4369	struct btrfs_log_ctx *ctx,
				4370	const u64 start,
				4371	const u64 end)
				4372	{
				4373	struct extent_map em, n;
				4374	struct list_head extents;
				4375	struct extent_map_tree *tree = &inode->extent_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4376	u64 test_gen;
				4377	int ret = 0;
				4378	int num = 0;
				4379
				4380	INIT_LIST_HEAD(&extents);
				4381
				4382	write_lock(&tree->lock);
				4383	test_gen = root->fs_info->last_trans_committed;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4384
				4385	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
				4386	/*
				4387	* Skip extents outside our logging range. It's important to do
				4388	* it for correctness because if we don't ignore them, we may
				4389	* log them before their ordered extent completes, and therefore
				4390	* we could log them without logging their respective checksums
				4391	* (the checksum items are added to the csum tree at the very
				4392	* end of btrfs_finish_ordered_io()). Also leave such extents
				4393	* outside of our range in the list, since we may have another
				4394	* ranged fsync in the near future that needs them. If an extent
				4395	* outside our range corresponds to a hole, log it to avoid
				4396	* leaving gaps between extents (fsck will complain when we are
				4397	* not using the NO_HOLES feature).
				4398	*/
				4399	if ((em->start > end \|\| em->start + em->len <= start) &&
				4400	em->block_start != EXTENT_MAP_HOLE)
				4401	continue;
				4402
				4403	list_del_init(&em->list);
				4404	/*
				4405	* Just an arbitrary number, this can be really CPU intensive
				4406	* once we start getting a lot of extents, and really once we
				4407	* have a bunch of extents we just want to commit since it will
				4408	* be faster.
				4409	*/
				4410	if (++num > 32768) {
				4411	list_del_init(&tree->modified_extents);
				4412	ret = -EFBIG;
				4413	goto process;
				4414	}
				4415
				4416	if (em->generation <= test_gen)
				4417	continue;
				4418
				4419	/* We log prealloc extents beyond eof later. */
				4420	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
				4421	em->start >= i_size_read(&inode->vfs_inode))
				4422	continue;
				4423
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4424	/* Need a ref to keep it from getting evicted from cache */
				4425	refcount_inc(&em->refs);
				4426	set_bit(EXTENT_FLAG_LOGGING, &em->flags);
				4427	list_add_tail(&em->list, &extents);
				4428	num++;
				4429	}
				4430
				4431	list_sort(NULL, &extents, extent_cmp);
				4432	process:
				4433	while (!list_empty(&extents)) {
				4434	em = list_entry(extents.next, struct extent_map, list);
				4435
				4436	list_del_init(&em->list);
				4437
				4438	/*
				4439	* If we had an error we just need to delete everybody from our
				4440	* private list.
				4441	*/
				4442	if (ret) {
				4443	clear_em_logging(tree, em);
				4444	free_extent_map(em);
				4445	continue;
				4446	}
				4447
				4448	write_unlock(&tree->lock);
				4449
				4450	ret = log_one_extent(trans, inode, root, em, path, ctx);
				4451	write_lock(&tree->lock);
				4452	clear_em_logging(tree, em);
				4453	free_extent_map(em);
				4454	}
				4455	WARN_ON(!list_empty(&extents));
				4456	write_unlock(&tree->lock);
				4457
				4458	btrfs_release_path(path);
				4459	if (!ret)
				4460	ret = btrfs_log_prealloc_extents(trans, inode, path);
				4461
				4462	return ret;
				4463	}
				4464
				4465	static int logged_inode_size(struct btrfs_root log, struct btrfs_inode inode,
				4466	struct btrfs_path path, u64 size_ret)
				4467	{
				4468	struct btrfs_key key;
				4469	int ret;
				4470
				4471	key.objectid = btrfs_ino(inode);
				4472	key.type = BTRFS_INODE_ITEM_KEY;
				4473	key.offset = 0;
				4474
				4475	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
				4476	if (ret < 0) {
				4477	return ret;
				4478	} else if (ret > 0) {
				4479	*size_ret = 0;
				4480	} else {
				4481	struct btrfs_inode_item *item;
				4482
				4483	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				4484	struct btrfs_inode_item);
				4485	*size_ret = btrfs_inode_size(path->nodes[0], item);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4486	/*
				4487	* If the in-memory inode's i_size is smaller then the inode
				4488	* size stored in the btree, return the inode's i_size, so
				4489	* that we get a correct inode size after replaying the log
				4490	* when before a power failure we had a shrinking truncate
				4491	* followed by addition of a new name (rename / new hard link).
				4492	* Otherwise return the inode size from the btree, to avoid
				4493	* data loss when replaying a log due to previously doing a
				4494	* write that expands the inode's size and logging a new name
				4495	* immediately after.
				4496	*/
				4497	if (*size_ret > inode->vfs_inode.i_size)
				4498	*size_ret = inode->vfs_inode.i_size;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4499	}
				4500
				4501	btrfs_release_path(path);
				4502	return 0;
				4503	}
				4504
				4505	/*
				4506	* At the moment we always log all xattrs. This is to figure out at log replay
				4507	* time which xattrs must have their deletion replayed. If a xattr is missing
				4508	* in the log tree and exists in the fs/subvol tree, we delete it. This is
				4509	* because if a xattr is deleted, the inode is fsynced and a power failure
				4510	* happens, causing the log to be replayed the next time the fs is mounted,
				4511	* we want the xattr to not exist anymore (same behaviour as other filesystems
				4512	* with a journal, ext3/4, xfs, f2fs, etc).
				4513	*/
				4514	static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
				4515	struct btrfs_root *root,
				4516	struct btrfs_inode *inode,
				4517	struct btrfs_path *path,
				4518	struct btrfs_path *dst_path)
				4519	{
				4520	int ret;
				4521	struct btrfs_key key;
				4522	const u64 ino = btrfs_ino(inode);
				4523	int ins_nr = 0;
				4524	int start_slot = 0;
				4525
				4526	key.objectid = ino;
				4527	key.type = BTRFS_XATTR_ITEM_KEY;
				4528	key.offset = 0;
				4529
				4530	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4531	if (ret < 0)
				4532	return ret;
				4533
				4534	while (true) {
				4535	int slot = path->slots[0];
				4536	struct extent_buffer *leaf = path->nodes[0];
				4537	int nritems = btrfs_header_nritems(leaf);
				4538
				4539	if (slot >= nritems) {
				4540	if (ins_nr > 0) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4541	ret = copy_items(trans, inode, dst_path, path,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4542	start_slot, ins_nr, 1, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4543	if (ret < 0)
				4544	return ret;
				4545	ins_nr = 0;
				4546	}
				4547	ret = btrfs_next_leaf(root, path);
				4548	if (ret < 0)
				4549	return ret;
				4550	else if (ret > 0)
				4551	break;
				4552	continue;
				4553	}
				4554
				4555	btrfs_item_key_to_cpu(leaf, &key, slot);
				4556	if (key.objectid != ino \|\| key.type != BTRFS_XATTR_ITEM_KEY)
				4557	break;
				4558
				4559	if (ins_nr == 0)
				4560	start_slot = slot;
				4561	ins_nr++;
				4562	path->slots[0]++;
				4563	cond_resched();
				4564	}
				4565	if (ins_nr > 0) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4566	ret = copy_items(trans, inode, dst_path, path,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4567	start_slot, ins_nr, 1, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4568	if (ret < 0)
				4569	return ret;
				4570	}
				4571
				4572	return 0;
				4573	}
				4574
				4575	/*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4576	* When using the NO_HOLES feature if we punched a hole that causes the
				4577	* deletion of entire leafs or all the extent items of the first leaf (the one
				4578	* that contains the inode item and references) we may end up not processing
				4579	* any extents, because there are no leafs with a generation matching the
				4580	* current transaction that have extent items for our inode. So we need to find
				4581	* if any holes exist and then log them. We also need to log holes after any
				4582	* truncate operation that changes the inode's size.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4583	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4584	static int btrfs_log_holes(struct btrfs_trans_handle *trans,
				4585	struct btrfs_root *root,
				4586	struct btrfs_inode *inode,
				4587	struct btrfs_path *path)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4588	{
				4589	struct btrfs_fs_info *fs_info = root->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4590	struct btrfs_key key;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4591	const u64 ino = btrfs_ino(inode);
				4592	const u64 i_size = i_size_read(&inode->vfs_inode);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4593	u64 prev_extent_end = 0;
				4594	int ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4595
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4596	if (!btrfs_fs_incompat(fs_info, NO_HOLES) \|\| i_size == 0)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4597	return 0;
				4598
				4599	key.objectid = ino;
				4600	key.type = BTRFS_EXTENT_DATA_KEY;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4601	key.offset = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4602
				4603	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4604	if (ret < 0)
				4605	return ret;
				4606
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4607	while (true) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4608	struct btrfs_file_extent_item *extent;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4609	struct extent_buffer *leaf = path->nodes[0];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4610	u64 len;
				4611
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4612	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
				4613	ret = btrfs_next_leaf(root, path);
				4614	if (ret < 0)
				4615	return ret;
				4616	if (ret > 0) {
				4617	ret = 0;
				4618	break;
				4619	}
				4620	leaf = path->nodes[0];
				4621	}
				4622
				4623	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				4624	if (key.objectid != ino \|\| key.type != BTRFS_EXTENT_DATA_KEY)
				4625	break;
				4626
				4627	/* We have a hole, log it. */
				4628	if (prev_extent_end < key.offset) {
				4629	const u64 hole_len = key.offset - prev_extent_end;
				4630
				4631	/*
				4632	* Release the path to avoid deadlocks with other code
				4633	* paths that search the root while holding locks on
				4634	* leafs from the log root.
				4635	*/
				4636	btrfs_release_path(path);
				4637	ret = btrfs_insert_file_extent(trans, root->log_root,
				4638	ino, prev_extent_end, 0,
				4639	0, hole_len, 0, hole_len,
				4640	0, 0, 0);
				4641	if (ret < 0)
				4642	return ret;
				4643
				4644	/*
				4645	* Search for the same key again in the root. Since it's
				4646	* an extent item and we are holding the inode lock, the
				4647	* key must still exist. If it doesn't just emit warning
				4648	* and return an error to fall back to a transaction
				4649	* commit.
				4650	*/
				4651	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4652	if (ret < 0)
				4653	return ret;
				4654	if (WARN_ON(ret > 0))
				4655	return -ENOENT;
				4656	leaf = path->nodes[0];
				4657	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4658
				4659	extent = btrfs_item_ptr(leaf, path->slots[0],
				4660	struct btrfs_file_extent_item);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4661	if (btrfs_file_extent_type(leaf, extent) ==
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4662	BTRFS_FILE_EXTENT_INLINE) {
				4663	len = btrfs_file_extent_ram_bytes(leaf, extent);
				4664	prev_extent_end = ALIGN(key.offset + len,
				4665	fs_info->sectorsize);
				4666	} else {
				4667	len = btrfs_file_extent_num_bytes(leaf, extent);
				4668	prev_extent_end = key.offset + len;
				4669	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4670
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4671	path->slots[0]++;
				4672	cond_resched();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4673	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4674
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4675	if (prev_extent_end < i_size) {
				4676	u64 hole_len;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4677
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4678	btrfs_release_path(path);
				4679	hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
				4680	ret = btrfs_insert_file_extent(trans, root->log_root,
				4681	ino, prev_extent_end, 0, 0,
				4682	hole_len, 0, hole_len,
				4683	0, 0, 0);
				4684	if (ret < 0)
				4685	return ret;
				4686	}
				4687
				4688	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4689	}
				4690
				4691	/*
				4692	* When we are logging a new inode X, check if it doesn't have a reference that
				4693	* matches the reference from some other inode Y created in a past transaction
				4694	* and that was renamed in the current transaction. If we don't do this, then at
				4695	* log replay time we can lose inode Y (and all its files if it's a directory):
				4696	*
				4697	* mkdir /mnt/x
				4698	* echo "hello world" > /mnt/x/foobar
				4699	* sync
				4700	* mv /mnt/x /mnt/y
				4701	* mkdir /mnt/x # or touch /mnt/x
				4702	* xfs_io -c fsync /mnt/x
				4703	* <power fail>
				4704	* mount fs, trigger log replay
				4705	*
				4706	* After the log replay procedure, we would lose the first directory and all its
				4707	* files (file foobar).
				4708	* For the case where inode Y is not a directory we simply end up losing it:
				4709	*
				4710	* echo "123" > /mnt/foo
				4711	* sync
				4712	* mv /mnt/foo /mnt/bar
				4713	* echo "abc" > /mnt/foo
				4714	* xfs_io -c fsync /mnt/foo
				4715	* <power fail>
				4716	*
				4717	* We also need this for cases where a snapshot entry is replaced by some other
				4718	* entry (file or directory) otherwise we end up with an unreplayable log due to
				4719	* attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
				4720	* if it were a regular entry:
				4721	*
				4722	* mkdir /mnt/x
				4723	* btrfs subvolume snapshot /mnt /mnt/x/snap
				4724	* btrfs subvolume delete /mnt/x/snap
				4725	* rmdir /mnt/x
				4726	* mkdir /mnt/x
				4727	* fsync /mnt/x or fsync some new file inside it
				4728	* <power fail>
				4729	*
				4730	* The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
				4731	* the same transaction.
				4732	*/
				4733	static int btrfs_check_ref_name_override(struct extent_buffer *eb,
				4734	const int slot,
				4735	const struct btrfs_key *key,
				4736	struct btrfs_inode *inode,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4737	u64 other_ino, u64 other_parent)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4738	{
				4739	int ret;
				4740	struct btrfs_path *search_path;
				4741	char *name = NULL;
				4742	u32 name_len = 0;
				4743	u32 item_size = btrfs_item_size_nr(eb, slot);
				4744	u32 cur_offset = 0;
				4745	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
				4746
				4747	search_path = btrfs_alloc_path();
				4748	if (!search_path)
				4749	return -ENOMEM;
				4750	search_path->search_commit_root = 1;
				4751	search_path->skip_locking = 1;
				4752
				4753	while (cur_offset < item_size) {
				4754	u64 parent;
				4755	u32 this_name_len;
				4756	u32 this_len;
				4757	unsigned long name_ptr;
				4758	struct btrfs_dir_item *di;
				4759
				4760	if (key->type == BTRFS_INODE_REF_KEY) {
				4761	struct btrfs_inode_ref *iref;
				4762
				4763	iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
				4764	parent = key->offset;
				4765	this_name_len = btrfs_inode_ref_name_len(eb, iref);
				4766	name_ptr = (unsigned long)(iref + 1);
				4767	this_len = sizeof(*iref) + this_name_len;
				4768	} else {
				4769	struct btrfs_inode_extref *extref;
				4770
				4771	extref = (struct btrfs_inode_extref *)(ptr +
				4772	cur_offset);
				4773	parent = btrfs_inode_extref_parent(eb, extref);
				4774	this_name_len = btrfs_inode_extref_name_len(eb, extref);
				4775	name_ptr = (unsigned long)&extref->name;
				4776	this_len = sizeof(*extref) + this_name_len;
				4777	}
				4778
				4779	if (this_name_len > name_len) {
				4780	char *new_name;
				4781
				4782	new_name = krealloc(name, this_name_len, GFP_NOFS);
				4783	if (!new_name) {
				4784	ret = -ENOMEM;
				4785	goto out;
				4786	}
				4787	name_len = this_name_len;
				4788	name = new_name;
				4789	}
				4790
				4791	read_extent_buffer(eb, name, name_ptr, this_name_len);
				4792	di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
				4793	parent, name, this_name_len, 0);
				4794	if (di && !IS_ERR(di)) {
				4795	struct btrfs_key di_key;
				4796
				4797	btrfs_dir_item_key_to_cpu(search_path->nodes[0],
				4798	di, &di_key);
				4799	if (di_key.type == BTRFS_INODE_ITEM_KEY) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4800	if (di_key.objectid != key->objectid) {
				4801	ret = 1;
				4802	*other_ino = di_key.objectid;
				4803	*other_parent = parent;
				4804	} else {
				4805	ret = 0;
				4806	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4807	} else {
				4808	ret = -EAGAIN;
				4809	}
				4810	goto out;
				4811	} else if (IS_ERR(di)) {
				4812	ret = PTR_ERR(di);
				4813	goto out;
				4814	}
				4815	btrfs_release_path(search_path);
				4816
				4817	cur_offset += this_len;
				4818	}
				4819	ret = 0;
				4820	out:
				4821	btrfs_free_path(search_path);
				4822	kfree(name);
				4823	return ret;
				4824	}
				4825
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4826	struct btrfs_ino_list {
				4827	u64 ino;
				4828	u64 parent;
				4829	struct list_head list;
				4830	};
				4831
				4832	static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
				4833	struct btrfs_root *root,
				4834	struct btrfs_path *path,
				4835	struct btrfs_log_ctx *ctx,
				4836	u64 ino, u64 parent)
				4837	{
				4838	struct btrfs_ino_list *ino_elem;
				4839	LIST_HEAD(inode_list);
				4840	int ret = 0;
				4841
				4842	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
				4843	if (!ino_elem)
				4844	return -ENOMEM;
				4845	ino_elem->ino = ino;
				4846	ino_elem->parent = parent;
				4847	list_add_tail(&ino_elem->list, &inode_list);
				4848
				4849	while (!list_empty(&inode_list)) {
				4850	struct btrfs_fs_info *fs_info = root->fs_info;
				4851	struct btrfs_key key;
				4852	struct inode *inode;
				4853
				4854	ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
				4855	list);
				4856	ino = ino_elem->ino;
				4857	parent = ino_elem->parent;
				4858	list_del(&ino_elem->list);
				4859	kfree(ino_elem);
				4860	if (ret)
				4861	continue;
				4862
				4863	btrfs_release_path(path);
				4864
				4865	key.objectid = ino;
				4866	key.type = BTRFS_INODE_ITEM_KEY;
				4867	key.offset = 0;
				4868	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
				4869	/*
				4870	* If the other inode that had a conflicting dir entry was
				4871	* deleted in the current transaction, we need to log its parent
				4872	* directory.
				4873	*/
				4874	if (IS_ERR(inode)) {
				4875	ret = PTR_ERR(inode);
				4876	if (ret == -ENOENT) {
				4877	key.objectid = parent;
				4878	inode = btrfs_iget(fs_info->sb, &key, root,
				4879	NULL);
				4880	if (IS_ERR(inode)) {
				4881	ret = PTR_ERR(inode);
				4882	} else {
				4883	ret = btrfs_log_inode(trans, root,
				4884	BTRFS_I(inode),
				4885	LOG_OTHER_INODE_ALL,
				4886	0, LLONG_MAX, ctx);
				4887	btrfs_add_delayed_iput(inode);
				4888	}
				4889	}
				4890	continue;
				4891	}
				4892	/*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4893	* If the inode was already logged skip it - otherwise we can
				4894	* hit an infinite loop. Example:
				4895	*
				4896	* From the commit root (previous transaction) we have the
				4897	* following inodes:
				4898	*
				4899	* inode 257 a directory
				4900	* inode 258 with references "zz" and "zz_link" on inode 257
				4901	* inode 259 with reference "a" on inode 257
				4902	*
				4903	* And in the current (uncommitted) transaction we have:
				4904	*
				4905	* inode 257 a directory, unchanged
				4906	* inode 258 with references "a" and "a2" on inode 257
				4907	* inode 259 with reference "zz_link" on inode 257
				4908	* inode 261 with reference "zz" on inode 257
				4909	*
				4910	* When logging inode 261 the following infinite loop could
				4911	* happen if we don't skip already logged inodes:
				4912	*
				4913	* - we detect inode 258 as a conflicting inode, with inode 261
				4914	* on reference "zz", and log it;
				4915	*
				4916	* - we detect inode 259 as a conflicting inode, with inode 258
				4917	* on reference "a", and log it;
				4918	*
				4919	* - we detect inode 258 as a conflicting inode, with inode 259
				4920	* on reference "zz_link", and log it - again! After this we
				4921	* repeat the above steps forever.
				4922	*/
				4923	spin_lock(&BTRFS_I(inode)->lock);
				4924	/*
				4925	* Check the inode's logged_trans only instead of
				4926	* btrfs_inode_in_log(). This is because the last_log_commit of
				4927	* the inode is not updated when we only log that it exists and
				4928	* it has the full sync bit set (see btrfs_log_inode()).
				4929	*/
				4930	if (BTRFS_I(inode)->logged_trans == trans->transid) {
				4931	spin_unlock(&BTRFS_I(inode)->lock);
				4932	btrfs_add_delayed_iput(inode);
				4933	continue;
				4934	}
				4935	spin_unlock(&BTRFS_I(inode)->lock);
				4936	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4937	* We are safe logging the other inode without acquiring its
				4938	* lock as long as we log with the LOG_INODE_EXISTS mode. We
				4939	* are safe against concurrent renames of the other inode as
				4940	* well because during a rename we pin the log and update the
				4941	* log with the new name before we unpin it.
				4942	*/
				4943	ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
				4944	LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
				4945	if (ret) {
				4946	btrfs_add_delayed_iput(inode);
				4947	continue;
				4948	}
				4949
				4950	key.objectid = ino;
				4951	key.type = BTRFS_INODE_REF_KEY;
				4952	key.offset = 0;
				4953	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4954	if (ret < 0) {
				4955	btrfs_add_delayed_iput(inode);
				4956	continue;
				4957	}
				4958
				4959	while (true) {
				4960	struct extent_buffer *leaf = path->nodes[0];
				4961	int slot = path->slots[0];
				4962	u64 other_ino = 0;
				4963	u64 other_parent = 0;
				4964
				4965	if (slot >= btrfs_header_nritems(leaf)) {
				4966	ret = btrfs_next_leaf(root, path);
				4967	if (ret < 0) {
				4968	break;
				4969	} else if (ret > 0) {
				4970	ret = 0;
				4971	break;
				4972	}
				4973	continue;
				4974	}
				4975
				4976	btrfs_item_key_to_cpu(leaf, &key, slot);
				4977	if (key.objectid != ino \|\|
				4978	(key.type != BTRFS_INODE_REF_KEY &&
				4979	key.type != BTRFS_INODE_EXTREF_KEY)) {
				4980	ret = 0;
				4981	break;
				4982	}
				4983
				4984	ret = btrfs_check_ref_name_override(leaf, slot, &key,
				4985	BTRFS_I(inode), &other_ino,
				4986	&other_parent);
				4987	if (ret < 0)
				4988	break;
				4989	if (ret > 0) {
				4990	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
				4991	if (!ino_elem) {
				4992	ret = -ENOMEM;
				4993	break;
				4994	}
				4995	ino_elem->ino = other_ino;
				4996	ino_elem->parent = other_parent;
				4997	list_add_tail(&ino_elem->list, &inode_list);
				4998	ret = 0;
				4999	}
				5000	path->slots[0]++;
				5001	}
				5002	btrfs_add_delayed_iput(inode);
				5003	}
				5004
				5005	return ret;
				5006	}
				5007
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5008	static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
				5009	struct btrfs_inode *inode,
				5010	struct btrfs_key *min_key,
				5011	const struct btrfs_key *max_key,
				5012	struct btrfs_path *path,
				5013	struct btrfs_path *dst_path,
				5014	const u64 logged_isize,
				5015	const bool recursive_logging,
				5016	const int inode_only,
				5017	struct btrfs_log_ctx *ctx,
				5018	bool *need_log_inode_item)
				5019	{
				5020	struct btrfs_root *root = inode->root;
				5021	int ins_start_slot = 0;
				5022	int ins_nr = 0;
				5023	int ret;
				5024
				5025	while (1) {
				5026	ret = btrfs_search_forward(root, min_key, path, trans->transid);
				5027	if (ret < 0)
				5028	return ret;
				5029	if (ret > 0) {
				5030	ret = 0;
				5031	break;
				5032	}
				5033	again:
				5034	/* Note, ins_nr might be > 0 here, cleanup outside the loop */
				5035	if (min_key->objectid != max_key->objectid)
				5036	break;
				5037	if (min_key->type > max_key->type)
				5038	break;
				5039
				5040	if (min_key->type == BTRFS_INODE_ITEM_KEY)
				5041	*need_log_inode_item = false;
				5042
				5043	if ((min_key->type == BTRFS_INODE_REF_KEY \|\|
				5044	min_key->type == BTRFS_INODE_EXTREF_KEY) &&
				5045	inode->generation == trans->transid &&
				5046	!recursive_logging) {
				5047	u64 other_ino = 0;
				5048	u64 other_parent = 0;
				5049
				5050	ret = btrfs_check_ref_name_override(path->nodes[0],
				5051	path->slots[0], min_key, inode,
				5052	&other_ino, &other_parent);
				5053	if (ret < 0) {
				5054	return ret;
				5055	} else if (ret > 0 && ctx &&
				5056	other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
				5057	if (ins_nr > 0) {
				5058	ins_nr++;
				5059	} else {
				5060	ins_nr = 1;
				5061	ins_start_slot = path->slots[0];
				5062	}
				5063	ret = copy_items(trans, inode, dst_path, path,
				5064	ins_start_slot, ins_nr,
				5065	inode_only, logged_isize);
				5066	if (ret < 0)
				5067	return ret;
				5068	ins_nr = 0;
				5069
				5070	ret = log_conflicting_inodes(trans, root, path,
				5071	ctx, other_ino, other_parent);
				5072	if (ret)
				5073	return ret;
				5074	btrfs_release_path(path);
				5075	goto next_key;
				5076	}
				5077	}
				5078
				5079	/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
				5080	if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
				5081	if (ins_nr == 0)
				5082	goto next_slot;
				5083	ret = copy_items(trans, inode, dst_path, path,
				5084	ins_start_slot,
				5085	ins_nr, inode_only, logged_isize);
				5086	if (ret < 0)
				5087	return ret;
				5088	ins_nr = 0;
				5089	goto next_slot;
				5090	}
				5091
				5092	if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
				5093	ins_nr++;
				5094	goto next_slot;
				5095	} else if (!ins_nr) {
				5096	ins_start_slot = path->slots[0];
				5097	ins_nr = 1;
				5098	goto next_slot;
				5099	}
				5100
				5101	ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
				5102	ins_nr, inode_only, logged_isize);
				5103	if (ret < 0)
				5104	return ret;
				5105	ins_nr = 1;
				5106	ins_start_slot = path->slots[0];
				5107	next_slot:
				5108	path->slots[0]++;
				5109	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
				5110	btrfs_item_key_to_cpu(path->nodes[0], min_key,
				5111	path->slots[0]);
				5112	goto again;
				5113	}
				5114	if (ins_nr) {
				5115	ret = copy_items(trans, inode, dst_path, path,
				5116	ins_start_slot, ins_nr, inode_only,
				5117	logged_isize);
				5118	if (ret < 0)
				5119	return ret;
				5120	ins_nr = 0;
				5121	}
				5122	btrfs_release_path(path);
				5123	next_key:
				5124	if (min_key->offset < (u64)-1) {
				5125	min_key->offset++;
				5126	} else if (min_key->type < max_key->type) {
				5127	min_key->type++;
				5128	min_key->offset = 0;
				5129	} else {
				5130	break;
				5131	}
				5132	}
				5133	if (ins_nr)
				5134	ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
				5135	ins_nr, inode_only, logged_isize);
				5136
				5137	return ret;
				5138	}
				5139
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5140	/* log a single inode in the tree log.
				5141	* At least one parent directory for this inode must exist in the tree
				5142	* or be logged already.
				5143	*
				5144	* Any items from this inode changed by the current transaction are copied
				5145	* to the log tree. An extra reference is taken on any extents in this
				5146	* file, allowing us to avoid a whole pile of corner cases around logging
				5147	* blocks that have been removed from the tree.
				5148	*
				5149	* See LOG_INODE_ALL and related defines for a description of what inode_only
				5150	* does.
				5151	*
				5152	* This handles both files and directories.
				5153	*/
				5154	static int btrfs_log_inode(struct btrfs_trans_handle *trans,
				5155	struct btrfs_root root, struct btrfs_inode inode,
				5156	int inode_only,
				5157	const loff_t start,
				5158	const loff_t end,
				5159	struct btrfs_log_ctx *ctx)
				5160	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5161	struct btrfs_path *path;
				5162	struct btrfs_path *dst_path;
				5163	struct btrfs_key min_key;
				5164	struct btrfs_key max_key;
				5165	struct btrfs_root *log = root->log_root;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5166	int err = 0;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5167	int ret = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5168	bool fast_search = false;
				5169	u64 ino = btrfs_ino(inode);
				5170	struct extent_map_tree *em_tree = &inode->extent_tree;
				5171	u64 logged_isize = 0;
				5172	bool need_log_inode_item = true;
				5173	bool xattrs_logged = false;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5174	bool recursive_logging = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5175
				5176	path = btrfs_alloc_path();
				5177	if (!path)
				5178	return -ENOMEM;
				5179	dst_path = btrfs_alloc_path();
				5180	if (!dst_path) {
				5181	btrfs_free_path(path);
				5182	return -ENOMEM;
				5183	}
				5184
				5185	min_key.objectid = ino;
				5186	min_key.type = BTRFS_INODE_ITEM_KEY;
				5187	min_key.offset = 0;
				5188
				5189	max_key.objectid = ino;
				5190
				5191
				5192	/* today the code can only do partial logging of directories */
				5193	if (S_ISDIR(inode->vfs_inode.i_mode) \|\|
				5194	(!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				5195	&inode->runtime_flags) &&
				5196	inode_only >= LOG_INODE_EXISTS))
				5197	max_key.type = BTRFS_XATTR_ITEM_KEY;
				5198	else
				5199	max_key.type = (u8)-1;
				5200	max_key.offset = (u64)-1;
				5201
				5202	/*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5203	* Only run delayed items if we are a directory. We want to make sure
				5204	* all directory indexes hit the fs/subvolume tree so we can find them
				5205	* and figure out which index ranges have to be logged.
				5206	*
				5207	* Otherwise commit the delayed inode only if the full sync flag is set,
				5208	* as we want to make sure an up to date version is in the subvolume
				5209	* tree so copy_inode_items_to_log() / copy_items() can find it and copy
				5210	* it to the log tree. For a non full sync, we always log the inode item
				5211	* based on the in-memory struct btrfs_inode which is always up to date.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5212	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5213	if (S_ISDIR(inode->vfs_inode.i_mode))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5214	ret = btrfs_commit_inode_delayed_items(trans, inode);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5215	else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5216	ret = btrfs_commit_inode_delayed_inode(inode);
				5217
				5218	if (ret) {
				5219	btrfs_free_path(path);
				5220	btrfs_free_path(dst_path);
				5221	return ret;
				5222	}
				5223
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5224	if (inode_only == LOG_OTHER_INODE \|\| inode_only == LOG_OTHER_INODE_ALL) {
				5225	recursive_logging = true;
				5226	if (inode_only == LOG_OTHER_INODE)
				5227	inode_only = LOG_INODE_EXISTS;
				5228	else
				5229	inode_only = LOG_INODE_ALL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5230	mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
				5231	} else {
				5232	mutex_lock(&inode->log_mutex);
				5233	}
				5234
				5235	/*
				5236	* a brute force approach to making sure we get the most uptodate
				5237	* copies of everything.
				5238	*/
				5239	if (S_ISDIR(inode->vfs_inode.i_mode)) {
				5240	int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
				5241
				5242	if (inode_only == LOG_INODE_EXISTS)
				5243	max_key_type = BTRFS_XATTR_ITEM_KEY;
				5244	ret = drop_objectid_items(trans, log, path, ino, max_key_type);
				5245	} else {
				5246	if (inode_only == LOG_INODE_EXISTS) {
				5247	/*
				5248	* Make sure the new inode item we write to the log has
				5249	* the same isize as the current one (if it exists).
				5250	* This is necessary to prevent data loss after log
				5251	* replay, and also to prevent doing a wrong expanding
				5252	* truncate - for e.g. create file, write 4K into offset
				5253	* 0, fsync, write 4K into offset 4096, add hard link,
				5254	* fsync some other file (to sync log), power fail - if
				5255	* we use the inode's current i_size, after log replay
				5256	* we get a 8Kb file, with the last 4Kb extent as a hole
				5257	* (zeroes), as if an expanding truncate happened,
				5258	* instead of getting a file of 4Kb only.
				5259	*/
				5260	err = logged_inode_size(log, inode, path, &logged_isize);
				5261	if (err)
				5262	goto out_unlock;
				5263	}
				5264	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				5265	&inode->runtime_flags)) {
				5266	if (inode_only == LOG_INODE_EXISTS) {
				5267	max_key.type = BTRFS_XATTR_ITEM_KEY;
				5268	ret = drop_objectid_items(trans, log, path, ino,
				5269	max_key.type);
				5270	} else {
				5271	clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				5272	&inode->runtime_flags);
				5273	clear_bit(BTRFS_INODE_COPY_EVERYTHING,
				5274	&inode->runtime_flags);
				5275	while(1) {
				5276	ret = btrfs_truncate_inode_items(trans,
				5277	log, &inode->vfs_inode, 0, 0);
				5278	if (ret != -EAGAIN)
				5279	break;
				5280	}
				5281	}
				5282	} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
				5283	&inode->runtime_flags) \|\|
				5284	inode_only == LOG_INODE_EXISTS) {
				5285	if (inode_only == LOG_INODE_ALL)
				5286	fast_search = true;
				5287	max_key.type = BTRFS_XATTR_ITEM_KEY;
				5288	ret = drop_objectid_items(trans, log, path, ino,
				5289	max_key.type);
				5290	} else {
				5291	if (inode_only == LOG_INODE_ALL)
				5292	fast_search = true;
				5293	goto log_extents;
				5294	}
				5295
				5296	}
				5297	if (ret) {
				5298	err = ret;
				5299	goto out_unlock;
				5300	}
				5301
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5302	err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
				5303	path, dst_path, logged_isize,
				5304	recursive_logging, inode_only, ctx,
				5305	&need_log_inode_item);
				5306	if (err)
				5307	goto out_unlock;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5308
				5309	btrfs_release_path(path);
				5310	btrfs_release_path(dst_path);
				5311	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
				5312	if (err)
				5313	goto out_unlock;
				5314	xattrs_logged = true;
				5315	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
				5316	btrfs_release_path(path);
				5317	btrfs_release_path(dst_path);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5318	err = btrfs_log_holes(trans, root, inode, path);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5319	if (err)
				5320	goto out_unlock;
				5321	}
				5322	log_extents:
				5323	btrfs_release_path(path);
				5324	btrfs_release_path(dst_path);
				5325	if (need_log_inode_item) {
				5326	err = log_inode_item(trans, log, dst_path, inode);
				5327	if (!err && !xattrs_logged) {
				5328	err = btrfs_log_all_xattrs(trans, root, inode, path,
				5329	dst_path);
				5330	btrfs_release_path(path);
				5331	}
				5332	if (err)
				5333	goto out_unlock;
				5334	}
				5335	if (fast_search) {
				5336	ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
				5337	ctx, start, end);
				5338	if (ret) {
				5339	err = ret;
				5340	goto out_unlock;
				5341	}
				5342	} else if (inode_only == LOG_INODE_ALL) {
				5343	struct extent_map em, n;
				5344
				5345	write_lock(&em_tree->lock);
				5346	/*
				5347	* We can't just remove every em if we're called for a ranged
				5348	* fsync - that is, one that doesn't cover the whole possible
				5349	* file range (0 to LLONG_MAX). This is because we can have
				5350	* em's that fall outside the range we're logging and therefore
				5351	* their ordered operations haven't completed yet
				5352	* (btrfs_finish_ordered_io() not invoked yet). This means we
				5353	* didn't get their respective file extent item in the fs/subvol
				5354	* tree yet, and need to let the next fast fsync (one which
				5355	* consults the list of modified extent maps) find the em so
				5356	* that it logs a matching file extent item and waits for the
				5357	* respective ordered operation to complete (if it's still
				5358	* running).
				5359	*
				5360	* Removing every em outside the range we're logging would make
				5361	* the next fast fsync not log their matching file extent items,
				5362	* therefore making us lose data after a log replay.
				5363	*/
				5364	list_for_each_entry_safe(em, n, &em_tree->modified_extents,
				5365	list) {
				5366	const u64 mod_end = em->mod_start + em->mod_len - 1;
				5367
				5368	if (em->mod_start >= start && mod_end <= end)
				5369	list_del_init(&em->list);
				5370	}
				5371	write_unlock(&em_tree->lock);
				5372	}
				5373
				5374	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
				5375	ret = log_directory_changes(trans, root, inode, path, dst_path,
				5376	ctx);
				5377	if (ret) {
				5378	err = ret;
				5379	goto out_unlock;
				5380	}
				5381	}
				5382
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5383	/*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5384	* If we are logging that an ancestor inode exists as part of logging a
				5385	* new name from a link or rename operation, don't mark the inode as
				5386	* logged - otherwise if an explicit fsync is made against an ancestor,
				5387	* the fsync considers the inode in the log and doesn't sync the log,
				5388	* resulting in the ancestor missing after a power failure unless the
				5389	* log was synced as part of an fsync against any other unrelated inode.
				5390	* So keep it simple for this case and just don't flag the ancestors as
				5391	* logged.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5392	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5393	if (!ctx \|\|
				5394	!(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
				5395	&inode->vfs_inode != ctx->inode)) {
				5396	spin_lock(&inode->lock);
				5397	inode->logged_trans = trans->transid;
				5398	/*
				5399	* Don't update last_log_commit if we logged that an inode exists
				5400	* after it was loaded to memory (full_sync bit set).
				5401	* This is to prevent data loss when we do a write to the inode,
				5402	* then the inode gets evicted after all delalloc was flushed,
				5403	* then we log it exists (due to a rename for example) and then
				5404	* fsync it. This last fsync would do nothing (not logging the
				5405	* extents previously written).
				5406	*/
				5407	if (inode_only != LOG_INODE_EXISTS \|\|
				5408	!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
				5409	inode->last_log_commit = inode->last_sub_trans;
				5410	spin_unlock(&inode->lock);
				5411	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5412	out_unlock:
				5413	mutex_unlock(&inode->log_mutex);
				5414
				5415	btrfs_free_path(path);
				5416	btrfs_free_path(dst_path);
				5417	return err;
				5418	}
				5419
				5420	/*
				5421	* Check if we must fallback to a transaction commit when logging an inode.
				5422	* This must be called after logging the inode and is used only in the context
				5423	* when fsyncing an inode requires the need to log some other inode - in which
				5424	* case we can't lock the i_mutex of each other inode we need to log as that
				5425	* can lead to deadlocks with concurrent fsync against other inodes (as we can
				5426	* log inodes up or down in the hierarchy) or rename operations for example. So
				5427	* we take the log_mutex of the inode after we have logged it and then check for
				5428	* its last_unlink_trans value - this is safe because any task setting
				5429	* last_unlink_trans must take the log_mutex and it must do this before it does
				5430	* the actual unlink operation, so if we do this check before a concurrent task
				5431	* sets last_unlink_trans it means we've logged a consistent version/state of
				5432	* all the inode items, otherwise we are not sure and must do a transaction
				5433	* commit (the concurrent task might have only updated last_unlink_trans before
				5434	* we logged the inode or it might have also done the unlink).
				5435	*/
				5436	static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
				5437	struct btrfs_inode *inode)
				5438	{
				5439	struct btrfs_fs_info *fs_info = inode->root->fs_info;
				5440	bool ret = false;
				5441
				5442	mutex_lock(&inode->log_mutex);
				5443	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
				5444	/*
				5445	* Make sure any commits to the log are forced to be full
				5446	* commits.
				5447	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5448	btrfs_set_log_full_commit(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5449	ret = true;
				5450	}
				5451	mutex_unlock(&inode->log_mutex);
				5452
				5453	return ret;
				5454	}
				5455
				5456	/*
				5457	* follow the dentry parent pointers up the chain and see if any
				5458	* of the directories in it require a full commit before they can
				5459	* be logged. Returns zero if nothing special needs to be done or 1 if
				5460	* a full commit is required.
				5461	*/
				5462	static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
				5463	struct btrfs_inode *inode,
				5464	struct dentry *parent,
				5465	struct super_block *sb,
				5466	u64 last_committed)
				5467	{
				5468	int ret = 0;
				5469	struct dentry *old_parent = NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5470
				5471	/*
				5472	* for regular files, if its inode is already on disk, we don't
				5473	* have to worry about the parents at all. This is because
				5474	* we can use the last_unlink_trans field to record renames
				5475	* and other fun in this file.
				5476	*/
				5477	if (S_ISREG(inode->vfs_inode.i_mode) &&
				5478	inode->generation <= last_committed &&
				5479	inode->last_unlink_trans <= last_committed)
				5480	goto out;
				5481
				5482	if (!S_ISDIR(inode->vfs_inode.i_mode)) {
				5483	if (!parent \|\| d_really_is_negative(parent) \|\| sb != parent->d_sb)
				5484	goto out;
				5485	inode = BTRFS_I(d_inode(parent));
				5486	}
				5487
				5488	while (1) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5489	if (btrfs_must_commit_transaction(trans, inode)) {
				5490	ret = 1;
				5491	break;
				5492	}
				5493
				5494	if (!parent \|\| d_really_is_negative(parent) \|\| sb != parent->d_sb)
				5495	break;
				5496
				5497	if (IS_ROOT(parent)) {
				5498	inode = BTRFS_I(d_inode(parent));
				5499	if (btrfs_must_commit_transaction(trans, inode))
				5500	ret = 1;
				5501	break;
				5502	}
				5503
				5504	parent = dget_parent(parent);
				5505	dput(old_parent);
				5506	old_parent = parent;
				5507	inode = BTRFS_I(d_inode(parent));
				5508
				5509	}
				5510	dput(old_parent);
				5511	out:
				5512	return ret;
				5513	}
				5514
				5515	struct btrfs_dir_list {
				5516	u64 ino;
				5517	struct list_head list;
				5518	};
				5519
				5520	/*
				5521	* Log the inodes of the new dentries of a directory. See log_dir_items() for
				5522	* details about the why it is needed.
				5523	* This is a recursive operation - if an existing dentry corresponds to a
				5524	* directory, that directory's new entries are logged too (same behaviour as
				5525	* ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
				5526	* the dentries point to we do not lock their i_mutex, otherwise lockdep
				5527	* complains about the following circular lock dependency / possible deadlock:
				5528	*
				5529	* CPU0 CPU1
				5530	* ---- ----
				5531	* lock(&type->i_mutex_dir_key#3/2);
				5532	* lock(sb_internal#2);
				5533	* lock(&type->i_mutex_dir_key#3/2);
				5534	* lock(&sb->s_type->i_mutex_key#14);
				5535	*
				5536	* Where sb_internal is the lock (a counter that works as a lock) acquired by
				5537	* sb_start_intwrite() in btrfs_start_transaction().
				5538	* Not locking i_mutex of the inodes is still safe because:
				5539	*
				5540	* 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
				5541	* that while logging the inode new references (names) are added or removed
				5542	* from the inode, leaving the logged inode item with a link count that does
				5543	* not match the number of logged inode reference items. This is fine because
				5544	* at log replay time we compute the real number of links and correct the
				5545	* link count in the inode item (see replay_one_buffer() and
				5546	* link_to_fixup_dir());
				5547	*
				5548	* 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
				5549	* while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
				5550	* BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
				5551	* has a size that doesn't match the sum of the lengths of all the logged
				5552	* names. This does not result in a problem because if a dir_item key is
				5553	* logged but its matching dir_index key is not logged, at log replay time we
				5554	* don't use it to replay the respective name (see replay_one_name()). On the
				5555	* other hand if only the dir_index key ends up being logged, the respective
				5556	* name is added to the fs/subvol tree with both the dir_item and dir_index
				5557	* keys created (see replay_one_name()).
				5558	* The directory's inode item with a wrong i_size is not a problem as well,
				5559	* since we don't use it at log replay time to set the i_size in the inode
				5560	* item of the fs/subvol tree (see overwrite_item()).
				5561	*/
				5562	static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
				5563	struct btrfs_root *root,
				5564	struct btrfs_inode *start_inode,
				5565	struct btrfs_log_ctx *ctx)
				5566	{
				5567	struct btrfs_fs_info *fs_info = root->fs_info;
				5568	struct btrfs_root *log = root->log_root;
				5569	struct btrfs_path *path;
				5570	LIST_HEAD(dir_list);
				5571	struct btrfs_dir_list *dir_elem;
				5572	int ret = 0;
				5573
				5574	path = btrfs_alloc_path();
				5575	if (!path)
				5576	return -ENOMEM;
				5577
				5578	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
				5579	if (!dir_elem) {
				5580	btrfs_free_path(path);
				5581	return -ENOMEM;
				5582	}
				5583	dir_elem->ino = btrfs_ino(start_inode);
				5584	list_add_tail(&dir_elem->list, &dir_list);
				5585
				5586	while (!list_empty(&dir_list)) {
				5587	struct extent_buffer *leaf;
				5588	struct btrfs_key min_key;
				5589	int nritems;
				5590	int i;
				5591
				5592	dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
				5593	list);
				5594	if (ret)
				5595	goto next_dir_inode;
				5596
				5597	min_key.objectid = dir_elem->ino;
				5598	min_key.type = BTRFS_DIR_ITEM_KEY;
				5599	min_key.offset = 0;
				5600	again:
				5601	btrfs_release_path(path);
				5602	ret = btrfs_search_forward(log, &min_key, path, trans->transid);
				5603	if (ret < 0) {
				5604	goto next_dir_inode;
				5605	} else if (ret > 0) {
				5606	ret = 0;
				5607	goto next_dir_inode;
				5608	}
				5609
				5610	process_leaf:
				5611	leaf = path->nodes[0];
				5612	nritems = btrfs_header_nritems(leaf);
				5613	for (i = path->slots[0]; i < nritems; i++) {
				5614	struct btrfs_dir_item *di;
				5615	struct btrfs_key di_key;
				5616	struct inode *di_inode;
				5617	struct btrfs_dir_list *new_dir_elem;
				5618	int log_mode = LOG_INODE_EXISTS;
				5619	int type;
				5620
				5621	btrfs_item_key_to_cpu(leaf, &min_key, i);
				5622	if (min_key.objectid != dir_elem->ino \|\|
				5623	min_key.type != BTRFS_DIR_ITEM_KEY)
				5624	goto next_dir_inode;
				5625
				5626	di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
				5627	type = btrfs_dir_type(leaf, di);
				5628	if (btrfs_dir_transid(leaf, di) < trans->transid &&
				5629	type != BTRFS_FT_DIR)
				5630	continue;
				5631	btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
				5632	if (di_key.type == BTRFS_ROOT_ITEM_KEY)
				5633	continue;
				5634
				5635	btrfs_release_path(path);
				5636	di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
				5637	if (IS_ERR(di_inode)) {
				5638	ret = PTR_ERR(di_inode);
				5639	goto next_dir_inode;
				5640	}
				5641
				5642	if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5643	btrfs_add_delayed_iput(di_inode);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5644	break;
				5645	}
				5646
				5647	ctx->log_new_dentries = false;
				5648	if (type == BTRFS_FT_DIR \|\| type == BTRFS_FT_SYMLINK)
				5649	log_mode = LOG_INODE_ALL;
				5650	ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
				5651	log_mode, 0, LLONG_MAX, ctx);
				5652	if (!ret &&
				5653	btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
				5654	ret = 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5655	btrfs_add_delayed_iput(di_inode);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5656	if (ret)
				5657	goto next_dir_inode;
				5658	if (ctx->log_new_dentries) {
				5659	new_dir_elem = kmalloc(sizeof(*new_dir_elem),
				5660	GFP_NOFS);
				5661	if (!new_dir_elem) {
				5662	ret = -ENOMEM;
				5663	goto next_dir_inode;
				5664	}
				5665	new_dir_elem->ino = di_key.objectid;
				5666	list_add_tail(&new_dir_elem->list, &dir_list);
				5667	}
				5668	break;
				5669	}
				5670	if (i == nritems) {
				5671	ret = btrfs_next_leaf(log, path);
				5672	if (ret < 0) {
				5673	goto next_dir_inode;
				5674	} else if (ret > 0) {
				5675	ret = 0;
				5676	goto next_dir_inode;
				5677	}
				5678	goto process_leaf;
				5679	}
				5680	if (min_key.offset < (u64)-1) {
				5681	min_key.offset++;
				5682	goto again;
				5683	}
				5684	next_dir_inode:
				5685	list_del(&dir_elem->list);
				5686	kfree(dir_elem);
				5687	}
				5688
				5689	btrfs_free_path(path);
				5690	return ret;
				5691	}
				5692
				5693	static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
				5694	struct btrfs_inode *inode,
				5695	struct btrfs_log_ctx *ctx)
				5696	{
				5697	struct btrfs_fs_info *fs_info = trans->fs_info;
				5698	int ret;
				5699	struct btrfs_path *path;
				5700	struct btrfs_key key;
				5701	struct btrfs_root *root = inode->root;
				5702	const u64 ino = btrfs_ino(inode);
				5703
				5704	path = btrfs_alloc_path();
				5705	if (!path)
				5706	return -ENOMEM;
				5707	path->skip_locking = 1;
				5708	path->search_commit_root = 1;
				5709
				5710	key.objectid = ino;
				5711	key.type = BTRFS_INODE_REF_KEY;
				5712	key.offset = 0;
				5713	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				5714	if (ret < 0)
				5715	goto out;
				5716
				5717	while (true) {
				5718	struct extent_buffer *leaf = path->nodes[0];
				5719	int slot = path->slots[0];
				5720	u32 cur_offset = 0;
				5721	u32 item_size;
				5722	unsigned long ptr;
				5723
				5724	if (slot >= btrfs_header_nritems(leaf)) {
				5725	ret = btrfs_next_leaf(root, path);
				5726	if (ret < 0)
				5727	goto out;
				5728	else if (ret > 0)
				5729	break;
				5730	continue;
				5731	}
				5732
				5733	btrfs_item_key_to_cpu(leaf, &key, slot);
				5734	/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
				5735	if (key.objectid != ino \|\| key.type > BTRFS_INODE_EXTREF_KEY)
				5736	break;
				5737
				5738	item_size = btrfs_item_size_nr(leaf, slot);
				5739	ptr = btrfs_item_ptr_offset(leaf, slot);
				5740	while (cur_offset < item_size) {
				5741	struct btrfs_key inode_key;
				5742	struct inode *dir_inode;
				5743
				5744	inode_key.type = BTRFS_INODE_ITEM_KEY;
				5745	inode_key.offset = 0;
				5746
				5747	if (key.type == BTRFS_INODE_EXTREF_KEY) {
				5748	struct btrfs_inode_extref *extref;
				5749
				5750	extref = (struct btrfs_inode_extref *)
				5751	(ptr + cur_offset);
				5752	inode_key.objectid = btrfs_inode_extref_parent(
				5753	leaf, extref);
				5754	cur_offset += sizeof(*extref);
				5755	cur_offset += btrfs_inode_extref_name_len(leaf,
				5756	extref);
				5757	} else {
				5758	inode_key.objectid = key.offset;
				5759	cur_offset = item_size;
				5760	}
				5761
				5762	dir_inode = btrfs_iget(fs_info->sb, &inode_key,
				5763	root, NULL);
				5764	/*
				5765	* If the parent inode was deleted, return an error to
				5766	* fallback to a transaction commit. This is to prevent
				5767	* getting an inode that was moved from one parent A to
				5768	* a parent B, got its former parent A deleted and then
				5769	* it got fsync'ed, from existing at both parents after
				5770	* a log replay (and the old parent still existing).
				5771	* Example:
				5772	*
				5773	* mkdir /mnt/A
				5774	* mkdir /mnt/B
				5775	* touch /mnt/B/bar
				5776	* sync
				5777	* mv /mnt/B/bar /mnt/A/bar
				5778	* mv -T /mnt/A /mnt/B
				5779	* fsync /mnt/B/bar
				5780	* <power fail>
				5781	*
				5782	* If we ignore the old parent B which got deleted,
				5783	* after a log replay we would have file bar linked
				5784	* at both parents and the old parent B would still
				5785	* exist.
				5786	*/
				5787	if (IS_ERR(dir_inode)) {
				5788	ret = PTR_ERR(dir_inode);
				5789	goto out;
				5790	}
				5791
				5792	if (ctx)
				5793	ctx->log_new_dentries = false;
				5794	ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
				5795	LOG_INODE_ALL, 0, LLONG_MAX, ctx);
				5796	if (!ret &&
				5797	btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
				5798	ret = 1;
				5799	if (!ret && ctx && ctx->log_new_dentries)
				5800	ret = log_new_dir_dentries(trans, root,
				5801	BTRFS_I(dir_inode), ctx);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5802	btrfs_add_delayed_iput(dir_inode);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5803	if (ret)
				5804	goto out;
				5805	}
				5806	path->slots[0]++;
				5807	}
				5808	ret = 0;
				5809	out:
				5810	btrfs_free_path(path);
				5811	return ret;
				5812	}
				5813
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5814	static int log_new_ancestors(struct btrfs_trans_handle *trans,
				5815	struct btrfs_root *root,
				5816	struct btrfs_path *path,
				5817	struct btrfs_log_ctx *ctx)
				5818	{
				5819	struct btrfs_key found_key;
				5820
				5821	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
				5822
				5823	while (true) {
				5824	struct btrfs_fs_info *fs_info = root->fs_info;
				5825	const u64 last_committed = fs_info->last_trans_committed;
				5826	struct extent_buffer *leaf = path->nodes[0];
				5827	int slot = path->slots[0];
				5828	struct btrfs_key search_key;
				5829	struct inode *inode;
				5830	int ret = 0;
				5831
				5832	btrfs_release_path(path);
				5833
				5834	search_key.objectid = found_key.offset;
				5835	search_key.type = BTRFS_INODE_ITEM_KEY;
				5836	search_key.offset = 0;
				5837	inode = btrfs_iget(fs_info->sb, &search_key, root, NULL);
				5838	if (IS_ERR(inode))
				5839	return PTR_ERR(inode);
				5840
				5841	if (BTRFS_I(inode)->generation > last_committed)
				5842	ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
				5843	LOG_INODE_EXISTS,
				5844	0, LLONG_MAX, ctx);
				5845	btrfs_add_delayed_iput(inode);
				5846	if (ret)
				5847	return ret;
				5848
				5849	if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
				5850	break;
				5851
				5852	search_key.type = BTRFS_INODE_REF_KEY;
				5853	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
				5854	if (ret < 0)
				5855	return ret;
				5856
				5857	leaf = path->nodes[0];
				5858	slot = path->slots[0];
				5859	if (slot >= btrfs_header_nritems(leaf)) {
				5860	ret = btrfs_next_leaf(root, path);
				5861	if (ret < 0)
				5862	return ret;
				5863	else if (ret > 0)
				5864	return -ENOENT;
				5865	leaf = path->nodes[0];
				5866	slot = path->slots[0];
				5867	}
				5868
				5869	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				5870	if (found_key.objectid != search_key.objectid \|\|
				5871	found_key.type != BTRFS_INODE_REF_KEY)
				5872	return -ENOENT;
				5873	}
				5874	return 0;
				5875	}
				5876
				5877	static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
				5878	struct btrfs_inode *inode,
				5879	struct dentry *parent,
				5880	struct btrfs_log_ctx *ctx)
				5881	{
				5882	struct btrfs_root *root = inode->root;
				5883	struct btrfs_fs_info *fs_info = root->fs_info;
				5884	struct dentry *old_parent = NULL;
				5885	struct super_block *sb = inode->vfs_inode.i_sb;
				5886	int ret = 0;
				5887
				5888	while (true) {
				5889	if (!parent \|\| d_really_is_negative(parent) \|\|
				5890	sb != parent->d_sb)
				5891	break;
				5892
				5893	inode = BTRFS_I(d_inode(parent));
				5894	if (root != inode->root)
				5895	break;
				5896
				5897	if (inode->generation > fs_info->last_trans_committed) {
				5898	ret = btrfs_log_inode(trans, root, inode,
				5899	LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
				5900	if (ret)
				5901	break;
				5902	}
				5903	if (IS_ROOT(parent))
				5904	break;
				5905
				5906	parent = dget_parent(parent);
				5907	dput(old_parent);
				5908	old_parent = parent;
				5909	}
				5910	dput(old_parent);
				5911
				5912	return ret;
				5913	}
				5914
				5915	static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
				5916	struct btrfs_inode *inode,
				5917	struct dentry *parent,
				5918	struct btrfs_log_ctx *ctx)
				5919	{
				5920	struct btrfs_root *root = inode->root;
				5921	const u64 ino = btrfs_ino(inode);
				5922	struct btrfs_path *path;
				5923	struct btrfs_key search_key;
				5924	int ret;
				5925
				5926	/*
				5927	* For a single hard link case, go through a fast path that does not
				5928	* need to iterate the fs/subvolume tree.
				5929	*/
				5930	if (inode->vfs_inode.i_nlink < 2)
				5931	return log_new_ancestors_fast(trans, inode, parent, ctx);
				5932
				5933	path = btrfs_alloc_path();
				5934	if (!path)
				5935	return -ENOMEM;
				5936
				5937	search_key.objectid = ino;
				5938	search_key.type = BTRFS_INODE_REF_KEY;
				5939	search_key.offset = 0;
				5940	again:
				5941	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
				5942	if (ret < 0)
				5943	goto out;
				5944	if (ret == 0)
				5945	path->slots[0]++;
				5946
				5947	while (true) {
				5948	struct extent_buffer *leaf = path->nodes[0];
				5949	int slot = path->slots[0];
				5950	struct btrfs_key found_key;
				5951
				5952	if (slot >= btrfs_header_nritems(leaf)) {
				5953	ret = btrfs_next_leaf(root, path);
				5954	if (ret < 0)
				5955	goto out;
				5956	else if (ret > 0)
				5957	break;
				5958	continue;
				5959	}
				5960
				5961	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				5962	if (found_key.objectid != ino \|\|
				5963	found_key.type > BTRFS_INODE_EXTREF_KEY)
				5964	break;
				5965
				5966	/*
				5967	* Don't deal with extended references because they are rare
				5968	* cases and too complex to deal with (we would need to keep
				5969	* track of which subitem we are processing for each item in
				5970	* this loop, etc). So just return some error to fallback to
				5971	* a transaction commit.
				5972	*/
				5973	if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
				5974	ret = -EMLINK;
				5975	goto out;
				5976	}
				5977
				5978	/*
				5979	* Logging ancestors needs to do more searches on the fs/subvol
				5980	* tree, so it releases the path as needed to avoid deadlocks.
				5981	* Keep track of the last inode ref key and resume from that key
				5982	* after logging all new ancestors for the current hard link.
				5983	*/
				5984	memcpy(&search_key, &found_key, sizeof(search_key));
				5985
				5986	ret = log_new_ancestors(trans, root, path, ctx);
				5987	if (ret)
				5988	goto out;
				5989	btrfs_release_path(path);
				5990	goto again;
				5991	}
				5992	ret = 0;
				5993	out:
				5994	btrfs_free_path(path);
				5995	return ret;
				5996	}
				5997
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5998	/*
				5999	* helper function around btrfs_log_inode to make sure newly created
				6000	* parent directories also end up in the log. A minimal inode and backref
				6001	* only logging is done of any parent directories that are older than
				6002	* the last committed transaction
				6003	*/
				6004	static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				6005	struct btrfs_inode *inode,
				6006	struct dentry *parent,
				6007	const loff_t start,
				6008	const loff_t end,
				6009	int inode_only,
				6010	struct btrfs_log_ctx *ctx)
				6011	{
				6012	struct btrfs_root *root = inode->root;
				6013	struct btrfs_fs_info *fs_info = root->fs_info;
				6014	struct super_block *sb;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6015	int ret = 0;
				6016	u64 last_committed = fs_info->last_trans_committed;
				6017	bool log_dentries = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6018
				6019	sb = inode->vfs_inode.i_sb;
				6020
				6021	if (btrfs_test_opt(fs_info, NOTREELOG)) {
				6022	ret = 1;
				6023	goto end_no_trans;
				6024	}
				6025
				6026	/*
				6027	* The prev transaction commit doesn't complete, we need do
				6028	* full commit by ourselves.
				6029	*/
				6030	if (fs_info->last_trans_log_full_commit >
				6031	fs_info->last_trans_committed) {
				6032	ret = 1;
				6033	goto end_no_trans;
				6034	}
				6035
				6036	if (btrfs_root_refs(&root->root_item) == 0) {
				6037	ret = 1;
				6038	goto end_no_trans;
				6039	}
				6040
				6041	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
				6042	last_committed);
				6043	if (ret)
				6044	goto end_no_trans;
				6045
				6046	/*
				6047	* Skip already logged inodes or inodes corresponding to tmpfiles
				6048	* (since logging them is pointless, a link count of 0 means they
				6049	* will never be accessible).
				6050	*/
				6051	if (btrfs_inode_in_log(inode, trans->transid) \|\|
				6052	inode->vfs_inode.i_nlink == 0) {
				6053	ret = BTRFS_NO_LOG_SYNC;
				6054	goto end_no_trans;
				6055	}
				6056
				6057	ret = start_log_trans(trans, root, ctx);
				6058	if (ret)
				6059	goto end_no_trans;
				6060
				6061	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
				6062	if (ret)
				6063	goto end_trans;
				6064
				6065	/*
				6066	* for regular files, if its inode is already on disk, we don't
				6067	* have to worry about the parents at all. This is because
				6068	* we can use the last_unlink_trans field to record renames
				6069	* and other fun in this file.
				6070	*/
				6071	if (S_ISREG(inode->vfs_inode.i_mode) &&
				6072	inode->generation <= last_committed &&
				6073	inode->last_unlink_trans <= last_committed) {
				6074	ret = 0;
				6075	goto end_trans;
				6076	}
				6077
				6078	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
				6079	log_dentries = true;
				6080
				6081	/*
				6082	* On unlink we must make sure all our current and old parent directory
				6083	* inodes are fully logged. This is to prevent leaving dangling
				6084	* directory index entries in directories that were our parents but are
				6085	* not anymore. Not doing this results in old parent directory being
				6086	* impossible to delete after log replay (rmdir will always fail with
				6087	* error -ENOTEMPTY).
				6088	*
				6089	* Example 1:
				6090	*
				6091	* mkdir testdir
				6092	* touch testdir/foo
				6093	* ln testdir/foo testdir/bar
				6094	* sync
				6095	* unlink testdir/bar
				6096	* xfs_io -c fsync testdir/foo
				6097	* <power failure>
				6098	* mount fs, triggers log replay
				6099	*
				6100	* If we don't log the parent directory (testdir), after log replay the
				6101	* directory still has an entry pointing to the file inode using the bar
				6102	* name, but a matching BTRFS_INODE_[REF\|EXTREF]_KEY does not exist and
				6103	* the file inode has a link count of 1.
				6104	*
				6105	* Example 2:
				6106	*
				6107	* mkdir testdir
				6108	* touch foo
				6109	* ln foo testdir/foo2
				6110	* ln foo testdir/foo3
				6111	* sync
				6112	* unlink testdir/foo3
				6113	* xfs_io -c fsync foo
				6114	* <power failure>
				6115	* mount fs, triggers log replay
				6116	*
				6117	* Similar as the first example, after log replay the parent directory
				6118	* testdir still has an entry pointing to the inode file with name foo3
				6119	* but the file inode does not have a matching BTRFS_INODE_REF_KEY item
				6120	* and has a link count of 2.
				6121	*/
				6122	if (inode->last_unlink_trans > last_committed) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6123	ret = btrfs_log_all_parents(trans, inode, ctx);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6124	if (ret)
				6125	goto end_trans;
				6126	}
				6127
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6128	ret = log_all_new_ancestors(trans, inode, parent, ctx);
				6129	if (ret)
				6130	goto end_trans;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6131
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6132	if (log_dentries)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6133	ret = log_new_dir_dentries(trans, root, inode, ctx);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6134	else
				6135	ret = 0;
				6136	end_trans:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6137	if (ret < 0) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6138	btrfs_set_log_full_commit(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6139	ret = 1;
				6140	}
				6141
				6142	if (ret)
				6143	btrfs_remove_log_ctx(root, ctx);
				6144	btrfs_end_log_trans(root);
				6145	end_no_trans:
				6146	return ret;
				6147	}
				6148
				6149	/*
				6150	* it is not safe to log dentry if the chunk root has added new
				6151	* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
				6152	* If this returns 1, you must commit the transaction to safely get your
				6153	* data on disk.
				6154	*/
				6155	int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
				6156	struct dentry *dentry,
				6157	const loff_t start,
				6158	const loff_t end,
				6159	struct btrfs_log_ctx *ctx)
				6160	{
				6161	struct dentry *parent = dget_parent(dentry);
				6162	int ret;
				6163
				6164	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
				6165	start, end, LOG_INODE_ALL, ctx);
				6166	dput(parent);
				6167
				6168	return ret;
				6169	}
				6170
				6171	/*
				6172	* should be called during mount to recover and replay any log trees
				6173	* from the FS
				6174	*/
				6175	int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
				6176	{
				6177	int ret;
				6178	struct btrfs_path *path;
				6179	struct btrfs_trans_handle *trans;
				6180	struct btrfs_key key;
				6181	struct btrfs_key found_key;
				6182	struct btrfs_key tmp_key;
				6183	struct btrfs_root *log;
				6184	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
				6185	struct walk_control wc = {
				6186	.process_func = process_one_buffer,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6187	.stage = LOG_WALK_PIN_ONLY,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6188	};
				6189
				6190	path = btrfs_alloc_path();
				6191	if (!path)
				6192	return -ENOMEM;
				6193
				6194	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
				6195
				6196	trans = btrfs_start_transaction(fs_info->tree_root, 0);
				6197	if (IS_ERR(trans)) {
				6198	ret = PTR_ERR(trans);
				6199	goto error;
				6200	}
				6201
				6202	wc.trans = trans;
				6203	wc.pin = 1;
				6204
				6205	ret = walk_log_tree(trans, log_root_tree, &wc);
				6206	if (ret) {
				6207	btrfs_handle_fs_error(fs_info, ret,
				6208	"Failed to pin buffers while recovering log root tree.");
				6209	goto error;
				6210	}
				6211
				6212	again:
				6213	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				6214	key.offset = (u64)-1;
				6215	key.type = BTRFS_ROOT_ITEM_KEY;
				6216
				6217	while (1) {
				6218	ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
				6219
				6220	if (ret < 0) {
				6221	btrfs_handle_fs_error(fs_info, ret,
				6222	"Couldn't find tree log root.");
				6223	goto error;
				6224	}
				6225	if (ret > 0) {
				6226	if (path->slots[0] == 0)
				6227	break;
				6228	path->slots[0]--;
				6229	}
				6230	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				6231	path->slots[0]);
				6232	btrfs_release_path(path);
				6233	if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				6234	break;
				6235
				6236	log = btrfs_read_fs_root(log_root_tree, &found_key);
				6237	if (IS_ERR(log)) {
				6238	ret = PTR_ERR(log);
				6239	btrfs_handle_fs_error(fs_info, ret,
				6240	"Couldn't read tree log root.");
				6241	goto error;
				6242	}
				6243
				6244	tmp_key.objectid = found_key.offset;
				6245	tmp_key.type = BTRFS_ROOT_ITEM_KEY;
				6246	tmp_key.offset = (u64)-1;
				6247
				6248	wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
				6249	if (IS_ERR(wc.replay_dest)) {
				6250	ret = PTR_ERR(wc.replay_dest);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6251
				6252	/*
				6253	* We didn't find the subvol, likely because it was
				6254	* deleted. This is ok, simply skip this log and go to
				6255	* the next one.
				6256	*
				6257	* We need to exclude the root because we can't have
				6258	* other log replays overwriting this log as we'll read
				6259	* it back in a few more times. This will keep our
				6260	* block from being modified, and we'll just bail for
				6261	* each subsequent pass.
				6262	*/
				6263	if (ret == -ENOENT)
				6264	ret = btrfs_pin_extent_for_log_replay(fs_info,
				6265	log->node->start,
				6266	log->node->len);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6267	free_extent_buffer(log->node);
				6268	free_extent_buffer(log->commit_root);
				6269	kfree(log);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6270
				6271	if (!ret)
				6272	goto next;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6273	btrfs_handle_fs_error(fs_info, ret,
				6274	"Couldn't read target root for tree log recovery.");
				6275	goto error;
				6276	}
				6277
				6278	wc.replay_dest->log_root = log;
				6279	btrfs_record_root_in_trans(trans, wc.replay_dest);
				6280	ret = walk_log_tree(trans, log, &wc);
				6281
				6282	if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
				6283	ret = fixup_inode_link_counts(trans, wc.replay_dest,
				6284	path);
				6285	}
				6286
				6287	if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
				6288	struct btrfs_root *root = wc.replay_dest;
				6289
				6290	btrfs_release_path(path);
				6291
				6292	/*
				6293	* We have just replayed everything, and the highest
				6294	* objectid of fs roots probably has changed in case
				6295	* some inode_item's got replayed.
				6296	*
				6297	* root->objectid_mutex is not acquired as log replay
				6298	* could only happen during mount.
				6299	*/
				6300	ret = btrfs_find_highest_objectid(root,
				6301	&root->highest_objectid);
				6302	}
				6303
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6304	wc.replay_dest->log_root = NULL;
				6305	free_extent_buffer(log->node);
				6306	free_extent_buffer(log->commit_root);
				6307	kfree(log);
				6308
				6309	if (ret)
				6310	goto error;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6311	next:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6312	if (found_key.offset == 0)
				6313	break;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6314	key.offset = found_key.offset - 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6315	}
				6316	btrfs_release_path(path);
				6317
				6318	/* step one is to pin it all, step two is to replay just inodes */
				6319	if (wc.pin) {
				6320	wc.pin = 0;
				6321	wc.process_func = replay_one_buffer;
				6322	wc.stage = LOG_WALK_REPLAY_INODES;
				6323	goto again;
				6324	}
				6325	/* step three is to replay everything */
				6326	if (wc.stage < LOG_WALK_REPLAY_ALL) {
				6327	wc.stage++;
				6328	goto again;
				6329	}
				6330
				6331	btrfs_free_path(path);
				6332
				6333	/* step 4: commit the transaction, which also unpins the blocks */
				6334	ret = btrfs_commit_transaction(trans);
				6335	if (ret)
				6336	return ret;
				6337
				6338	free_extent_buffer(log_root_tree->node);
				6339	log_root_tree->log_root = NULL;
				6340	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
				6341	kfree(log_root_tree);
				6342
				6343	return 0;
				6344	error:
				6345	if (wc.trans)
				6346	btrfs_end_transaction(wc.trans);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6347	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6348	btrfs_free_path(path);
				6349	return ret;
				6350	}
				6351
				6352	/*
				6353	* there are some corner cases where we want to force a full
				6354	* commit instead of allowing a directory to be logged.
				6355	*
				6356	* They revolve around files there were unlinked from the directory, and
				6357	* this function updates the parent directory so that a full commit is
				6358	* properly done if it is fsync'd later after the unlinks are done.
				6359	*
				6360	* Must be called before the unlink operations (updates to the subvolume tree,
				6361	* inodes, etc) are done.
				6362	*/
				6363	void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
				6364	struct btrfs_inode dir, struct btrfs_inode inode,
				6365	int for_rename)
				6366	{
				6367	/*
				6368	* when we're logging a file, if it hasn't been renamed
				6369	* or unlinked, and its inode is fully committed on disk,
				6370	* we don't have to worry about walking up the directory chain
				6371	* to log its parents.
				6372	*
				6373	* So, we use the last_unlink_trans field to put this transid
				6374	* into the file. When the file is logged we check it and
				6375	* don't log the parents if the file is fully on disk.
				6376	*/
				6377	mutex_lock(&inode->log_mutex);
				6378	inode->last_unlink_trans = trans->transid;
				6379	mutex_unlock(&inode->log_mutex);
				6380
				6381	/*
				6382	* if this directory was already logged any new
				6383	* names for this file/dir will get recorded
				6384	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6385	if (dir->logged_trans == trans->transid)
				6386	return;
				6387
				6388	/*
				6389	* if the inode we're about to unlink was logged,
				6390	* the log will be properly updated for any new names
				6391	*/
				6392	if (inode->logged_trans == trans->transid)
				6393	return;
				6394
				6395	/*
				6396	* when renaming files across directories, if the directory
				6397	* there we're unlinking from gets fsync'd later on, there's
				6398	* no way to find the destination directory later and fsync it
				6399	* properly. So, we have to be conservative and force commits
				6400	* so the new name gets discovered.
				6401	*/
				6402	if (for_rename)
				6403	goto record;
				6404
				6405	/* we can safely do the unlink without any special recording */
				6406	return;
				6407
				6408	record:
				6409	mutex_lock(&dir->log_mutex);
				6410	dir->last_unlink_trans = trans->transid;
				6411	mutex_unlock(&dir->log_mutex);
				6412	}
				6413
				6414	/*
				6415	* Make sure that if someone attempts to fsync the parent directory of a deleted
				6416	* snapshot, it ends up triggering a transaction commit. This is to guarantee
				6417	* that after replaying the log tree of the parent directory's root we will not
				6418	* see the snapshot anymore and at log replay time we will not see any log tree
				6419	* corresponding to the deleted snapshot's root, which could lead to replaying
				6420	* it after replaying the log tree of the parent directory (which would replay
				6421	* the snapshot delete operation).
				6422	*
				6423	* Must be called before the actual snapshot destroy operation (updates to the
				6424	* parent root and tree of tree roots trees, etc) are done.
				6425	*/
				6426	void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				6427	struct btrfs_inode *dir)
				6428	{
				6429	mutex_lock(&dir->log_mutex);
				6430	dir->last_unlink_trans = trans->transid;
				6431	mutex_unlock(&dir->log_mutex);
				6432	}
				6433
				6434	/*
				6435	* Call this after adding a new name for a file and it will properly
				6436	* update the log to reflect the new name.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6437	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6438	void btrfs_log_new_name(struct btrfs_trans_handle *trans,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6439	struct btrfs_inode inode, struct btrfs_inode old_dir,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6440	struct dentry *parent)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6441	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6442	struct btrfs_log_ctx ctx;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6443
				6444	/*
				6445	* this will force the logging code to walk the dentry chain
				6446	* up for the file
				6447	*/
				6448	if (!S_ISDIR(inode->vfs_inode.i_mode))
				6449	inode->last_unlink_trans = trans->transid;
				6450
				6451	/*
				6452	* if this inode hasn't been logged and directory we're renaming it
				6453	* from hasn't been logged, we don't need to log it
				6454	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6455	if (!inode_logged(trans, inode) &&
				6456	(!old_dir \|\| !inode_logged(trans, old_dir)))
				6457	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6458
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6459	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
				6460	ctx.logging_new_name = true;
				6461	/*
				6462	* We don't care about the return value. If we fail to log the new name
				6463	* then we know the next attempt to sync the log will fallback to a full
				6464	* transaction commit (due to a call to btrfs_set_log_full_commit()), so
				6465	* we don't need to worry about getting a log committed that has an
				6466	* inconsistent state after a rename operation.
				6467	*/
				6468	btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
				6469	LOG_INODE_EXISTS, &ctx);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6470	}
				6471