Blame - fs/btrfs/volumes.c - hafnium/third_party/linux

blob: 8deee49a6b3fa5fe98286cd323523eeda5e455c7 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2007 Oracle. All rights reserved.
				4	*/
				5
				6	#include <linux/sched.h>
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7	#include <linux/sched/mm.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8	#include <linux/bio.h>
				9	#include <linux/slab.h>
				10	#include <linux/buffer_head.h>
				11	#include <linux/blkdev.h>
				12	#include <linux/ratelimit.h>
				13	#include <linux/kthread.h>
				14	#include <linux/raid/pq.h>
				15	#include <linux/semaphore.h>
				16	#include <linux/uuid.h>
				17	#include <linux/list_sort.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	18	#include "misc.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	19	#include "ctree.h"
				20	#include "extent_map.h"
				21	#include "disk-io.h"
				22	#include "transaction.h"
				23	#include "print-tree.h"
				24	#include "volumes.h"
				25	#include "raid56.h"
				26	#include "async-thread.h"
				27	#include "check-integrity.h"
				28	#include "rcu-string.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	29	#include "dev-replace.h"
				30	#include "sysfs.h"
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	31	#include "tree-checker.h"
				32	#include "space-info.h"
				33	#include "block-group.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	34
				35	const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
				36	[BTRFS_RAID_RAID10] = {
				37	.sub_stripes = 2,
				38	.dev_stripes = 1,
				39	.devs_max = 0, /* 0 == as many as possible */
				40	.devs_min = 4,
				41	.tolerated_failures = 1,
				42	.devs_increment = 2,
				43	.ncopies = 2,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	44	.nparity = 0,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	45	.raid_name = "raid10",
				46	.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
				47	.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
				48	},
				49	[BTRFS_RAID_RAID1] = {
				50	.sub_stripes = 1,
				51	.dev_stripes = 1,
				52	.devs_max = 2,
				53	.devs_min = 2,
				54	.tolerated_failures = 1,
				55	.devs_increment = 2,
				56	.ncopies = 2,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	57	.nparity = 0,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	58	.raid_name = "raid1",
				59	.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
				60	.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
				61	},
				62	[BTRFS_RAID_DUP] = {
				63	.sub_stripes = 1,
				64	.dev_stripes = 2,
				65	.devs_max = 1,
				66	.devs_min = 1,
				67	.tolerated_failures = 0,
				68	.devs_increment = 1,
				69	.ncopies = 2,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	70	.nparity = 0,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	71	.raid_name = "dup",
				72	.bg_flag = BTRFS_BLOCK_GROUP_DUP,
				73	.mindev_error = 0,
				74	},
				75	[BTRFS_RAID_RAID0] = {
				76	.sub_stripes = 1,
				77	.dev_stripes = 1,
				78	.devs_max = 0,
				79	.devs_min = 2,
				80	.tolerated_failures = 0,
				81	.devs_increment = 1,
				82	.ncopies = 1,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	83	.nparity = 0,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	84	.raid_name = "raid0",
				85	.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
				86	.mindev_error = 0,
				87	},
				88	[BTRFS_RAID_SINGLE] = {
				89	.sub_stripes = 1,
				90	.dev_stripes = 1,
				91	.devs_max = 1,
				92	.devs_min = 1,
				93	.tolerated_failures = 0,
				94	.devs_increment = 1,
				95	.ncopies = 1,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	96	.nparity = 0,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	97	.raid_name = "single",
				98	.bg_flag = 0,
				99	.mindev_error = 0,
				100	},
				101	[BTRFS_RAID_RAID5] = {
				102	.sub_stripes = 1,
				103	.dev_stripes = 1,
				104	.devs_max = 0,
				105	.devs_min = 2,
				106	.tolerated_failures = 1,
				107	.devs_increment = 1,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	108	.ncopies = 1,
				109	.nparity = 1,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	110	.raid_name = "raid5",
				111	.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
				112	.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
				113	},
				114	[BTRFS_RAID_RAID6] = {
				115	.sub_stripes = 1,
				116	.dev_stripes = 1,
				117	.devs_max = 0,
				118	.devs_min = 3,
				119	.tolerated_failures = 2,
				120	.devs_increment = 1,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	121	.ncopies = 1,
				122	.nparity = 2,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	123	.raid_name = "raid6",
				124	.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
				125	.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
				126	},
				127	};
				128
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	129	const char *btrfs_bg_type_to_raid_name(u64 flags)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	130	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	131	const int index = btrfs_bg_flags_to_raid_index(flags);
				132
				133	if (index >= BTRFS_NR_RAID_TYPES)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	134	return NULL;
				135
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	136	return btrfs_raid_array[index].raid_name;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	137	}
				138
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	139	/*
				140	* Fill @buf with textual description of @bg_flags, no more than @size_buf
				141	* bytes including terminating null byte.
				142	*/
				143	void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
				144	{
				145	int i;
				146	int ret;
				147	char *bp = buf;
				148	u64 flags = bg_flags;
				149	u32 size_bp = size_buf;
				150
				151	if (!flags) {
				152	strcpy(bp, "NONE");
				153	return;
				154	}
				155
				156	#define DESCRIBE_FLAG(flag, desc) \
				157	do { \
				158	if (flags & (flag)) { \
				159	ret = snprintf(bp, size_bp, "%s\|", (desc)); \
				160	if (ret < 0 \|\| ret >= size_bp) \
				161	goto out_overflow; \
				162	size_bp -= ret; \
				163	bp += ret; \
				164	flags &= ~(flag); \
				165	} \
				166	} while (0)
				167
				168	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
				169	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
				170	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
				171
				172	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
				173	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
				174	DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
				175	btrfs_raid_array[i].raid_name);
				176	#undef DESCRIBE_FLAG
				177
				178	if (flags) {
				179	ret = snprintf(bp, size_bp, "0x%llx\|", flags);
				180	size_bp -= ret;
				181	}
				182
				183	if (size_bp < size_buf)
				184	buf[size_buf - size_bp - 1] = '\0'; /* remove last \| */
				185
				186	/*
				187	* The text is trimmed, it's up to the caller to provide sufficiently
				188	* large buffer
				189	*/
				190	out_overflow:;
				191	}
				192
				193	static int init_first_rw_device(struct btrfs_trans_handle *trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	194	static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	195	static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
				196	static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
				197	static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
				198	enum btrfs_map_op op,
				199	u64 logical, u64 *length,
				200	struct btrfs_bio **bbio_ret,
				201	int mirror_num, int need_raid_map);
				202
				203	/*
				204	* Device locking
				205	* ==============
				206	*
				207	* There are several mutexes that protect manipulation of devices and low-level
				208	* structures like chunks but not block groups, extents or files
				209	*
				210	* uuid_mutex (global lock)
				211	* ------------------------
				212	* protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
				213	* the SCAN_DEV ioctl registration or from mount either implicitly (the first
				214	* device) or requested by the device= mount option
				215	*
				216	* the mutex can be very coarse and can cover long-running operations
				217	*
				218	* protects: updates to fs_devices counters like missing devices, rw devices,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	219	* seeding, structure cloning, opening/closing devices at mount/umount time
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	220	*
				221	* global::fs_devs - add, remove, updates to the global list
				222	*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	223	* does not protect: manipulation of the fs_devices::devices list in general
				224	* but in mount context it could be used to exclude list modifications by eg.
				225	* scan ioctl
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	226	*
				227	* btrfs_device::name - renames (write side), read is RCU
				228	*
				229	* fs_devices::device_list_mutex (per-fs, with RCU)
				230	* ------------------------------------------------
				231	* protects updates to fs_devices::devices, ie. adding and deleting
				232	*
				233	* simple list traversal with read-only actions can be done with RCU protection
				234	*
				235	* may be used to exclude some operations from running concurrently without any
				236	* modifications to the list (see write_all_supers)
				237	*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	238	* Is not required at mount and close times, because our device list is
				239	* protected by the uuid_mutex at that point.
				240	*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	241	* balance_mutex
				242	* -------------
				243	* protects balance structures (status, state) and context accessed from
				244	* several places (internally, ioctl)
				245	*
				246	* chunk_mutex
				247	* -----------
				248	* protects chunks, adding or removing during allocation, trim or when a new
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	249	* device is added/removed. Additionally it also protects post_commit_list of
				250	* individual devices, since they can be added to the transaction's
				251	* post_commit_list only with chunk_mutex held.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	252	*
				253	* cleaner_mutex
				254	* -------------
				255	* a big lock that is held by the cleaner thread and prevents running subvolume
				256	* cleaning together with relocation or delayed iputs
				257	*
				258	*
				259	* Lock nesting
				260	* ============
				261	*
				262	* uuid_mutex
				263	* volume_mutex
				264	* device_list_mutex
				265	* chunk_mutex
				266	* balance_mutex
				267	*
				268	*
				269	* Exclusive operations, BTRFS_FS_EXCL_OP
				270	* ======================================
				271	*
				272	* Maintains the exclusivity of the following operations that apply to the
				273	* whole filesystem and cannot run in parallel.
				274	*
				275	* - Balance (*)
				276	* - Device add
				277	* - Device remove
				278	* - Device replace (*)
				279	* - Resize
				280	*
				281	* The device operations (as above) can be in one of the following states:
				282	*
				283	* - Running state
				284	* - Paused state
				285	* - Completed state
				286	*
				287	* Only device operations marked with (*) can go into the Paused state for the
				288	* following reasons:
				289	*
				290	* - ioctl (only Balance can be Paused through ioctl)
				291	* - filesystem remounted as read-only
				292	* - filesystem unmounted and mounted as read-only
				293	* - system power-cycle and filesystem mounted as read-only
				294	* - filesystem or device errors leading to forced read-only
				295	*
				296	* BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
				297	* During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
				298	* A device operation in Paused or Running state can be canceled or resumed
				299	* either by ioctl (Balance only) or when remounted as read-write.
				300	* BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
				301	* completed.
				302	*/
				303
				304	DEFINE_MUTEX(uuid_mutex);
				305	static LIST_HEAD(fs_uuids);
				306	struct list_head *btrfs_get_fs_uuids(void)
				307	{
				308	return &fs_uuids;
				309	}
				310
				311	/*
				312	* alloc_fs_devices - allocate struct btrfs_fs_devices
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	313	* @fsid: if not NULL, copy the UUID to fs_devices::fsid
				314	* @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	315	*
				316	* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
				317	* The returned struct is not linked onto any lists and can be destroyed with
				318	* kfree() right away.
				319	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	320	static struct btrfs_fs_devices alloc_fs_devices(const u8 fsid,
				321	const u8 *metadata_fsid)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	322	{
				323	struct btrfs_fs_devices *fs_devs;
				324
				325	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
				326	if (!fs_devs)
				327	return ERR_PTR(-ENOMEM);
				328
				329	mutex_init(&fs_devs->device_list_mutex);
				330
				331	INIT_LIST_HEAD(&fs_devs->devices);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	332	INIT_LIST_HEAD(&fs_devs->alloc_list);
				333	INIT_LIST_HEAD(&fs_devs->fs_list);
				334	if (fsid)
				335	memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
				336
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	337	if (metadata_fsid)
				338	memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
				339	else if (fsid)
				340	memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
				341
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	342	return fs_devs;
				343	}
				344
				345	void btrfs_free_device(struct btrfs_device *device)
				346	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	347	WARN_ON(!list_empty(&device->post_commit_list));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	348	rcu_string_free(device->name);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	349	extent_io_tree_release(&device->alloc_state);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	350	bio_put(device->flush_bio);
				351	kfree(device);
				352	}
				353
				354	static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
				355	{
				356	struct btrfs_device *device;
				357	WARN_ON(fs_devices->opened);
				358	while (!list_empty(&fs_devices->devices)) {
				359	device = list_entry(fs_devices->devices.next,
				360	struct btrfs_device, dev_list);
				361	list_del(&device->dev_list);
				362	btrfs_free_device(device);
				363	}
				364	kfree(fs_devices);
				365	}
				366
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	367	void __exit btrfs_cleanup_fs_uuids(void)
				368	{
				369	struct btrfs_fs_devices *fs_devices;
				370
				371	while (!list_empty(&fs_uuids)) {
				372	fs_devices = list_entry(fs_uuids.next,
				373	struct btrfs_fs_devices, fs_list);
				374	list_del(&fs_devices->fs_list);
				375	free_fs_devices(fs_devices);
				376	}
				377	}
				378
				379	/*
				380	* Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
				381	* Returned struct is not linked onto any lists and must be destroyed using
				382	* btrfs_free_device.
				383	*/
				384	static struct btrfs_device *__alloc_device(void)
				385	{
				386	struct btrfs_device *dev;
				387
				388	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
				389	if (!dev)
				390	return ERR_PTR(-ENOMEM);
				391
				392	/*
				393	* Preallocate a bio that's always going to be used for flushing device
				394	* barriers and matches the device lifespan
				395	*/
				396	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
				397	if (!dev->flush_bio) {
				398	kfree(dev);
				399	return ERR_PTR(-ENOMEM);
				400	}
				401
				402	INIT_LIST_HEAD(&dev->dev_list);
				403	INIT_LIST_HEAD(&dev->dev_alloc_list);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	404	INIT_LIST_HEAD(&dev->post_commit_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	405
				406	spin_lock_init(&dev->io_lock);
				407
				408	atomic_set(&dev->reada_in_flight, 0);
				409	atomic_set(&dev->dev_stats_ccnt, 0);
				410	btrfs_device_data_ordered_init(dev);
				411	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
				412	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	413	extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	414
				415	return dev;
				416	}
				417
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	418	static noinline struct btrfs_fs_devices *find_fsid(
				419	const u8 fsid, const u8 metadata_fsid)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	420	{
				421	struct btrfs_fs_devices *fs_devices;
				422
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	423	ASSERT(fsid);
				424
				425	if (metadata_fsid) {
				426	/*
				427	* Handle scanned device having completed its fsid change but
				428	* belonging to a fs_devices that was created by first scanning
				429	* a device which didn't have its fsid/metadata_uuid changed
				430	* at all and the CHANGING_FSID_V2 flag set.
				431	*/
				432	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
				433	if (fs_devices->fsid_change &&
				434	memcmp(metadata_fsid, fs_devices->fsid,
				435	BTRFS_FSID_SIZE) == 0 &&
				436	memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
				437	BTRFS_FSID_SIZE) == 0) {
				438	return fs_devices;
				439	}
				440	}
				441	/*
				442	* Handle scanned device having completed its fsid change but
				443	* belonging to a fs_devices that was created by a device that
				444	* has an outdated pair of fsid/metadata_uuid and
				445	* CHANGING_FSID_V2 flag set.
				446	*/
				447	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
				448	if (fs_devices->fsid_change &&
				449	memcmp(fs_devices->metadata_uuid,
				450	fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
				451	memcmp(metadata_fsid, fs_devices->metadata_uuid,
				452	BTRFS_FSID_SIZE) == 0) {
				453	return fs_devices;
				454	}
				455	}
				456	}
				457
				458	/* Handle non-split brain cases */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	459	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	460	if (metadata_fsid) {
				461	if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
				462	&& memcmp(metadata_fsid, fs_devices->metadata_uuid,
				463	BTRFS_FSID_SIZE) == 0)
				464	return fs_devices;
				465	} else {
				466	if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				467	return fs_devices;
				468	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	469	}
				470	return NULL;
				471	}
				472
				473	static int
				474	btrfs_get_bdev_and_sb(const char device_path, fmode_t flags, void holder,
				475	int flush, struct block_device **bdev,
				476	struct buffer_head **bh)
				477	{
				478	int ret;
				479
				480	*bdev = blkdev_get_by_path(device_path, flags, holder);
				481
				482	if (IS_ERR(*bdev)) {
				483	ret = PTR_ERR(*bdev);
				484	goto error;
				485	}
				486
				487	if (flush)
				488	filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
				489	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
				490	if (ret) {
				491	blkdev_put(*bdev, flags);
				492	goto error;
				493	}
				494	invalidate_bdev(*bdev);
				495	bh = btrfs_read_dev_super(bdev);
				496	if (IS_ERR(*bh)) {
				497	ret = PTR_ERR(*bh);
				498	blkdev_put(*bdev, flags);
				499	goto error;
				500	}
				501
				502	return 0;
				503
				504	error:
				505	*bdev = NULL;
				506	*bh = NULL;
				507	return ret;
				508	}
				509
				510	static void requeue_list(struct btrfs_pending_bios *pending_bios,
				511	struct bio head, struct bio tail)
				512	{
				513
				514	struct bio *old_head;
				515
				516	old_head = pending_bios->head;
				517	pending_bios->head = head;
				518	if (pending_bios->tail)
				519	tail->bi_next = old_head;
				520	else
				521	pending_bios->tail = tail;
				522	}
				523
				524	/*
				525	* we try to collect pending bios for a device so we don't get a large
				526	* number of procs sending bios down to the same device. This greatly
				527	* improves the schedulers ability to collect and merge the bios.
				528	*
				529	* But, it also turns into a long list of bios to process and that is sure
				530	* to eventually make the worker thread block. The solution here is to
				531	* make some progress and then put this work struct back at the end of
				532	* the list if the block device is congested. This way, multiple devices
				533	* can make progress from a single worker thread.
				534	*/
				535	static noinline void run_scheduled_bios(struct btrfs_device *device)
				536	{
				537	struct btrfs_fs_info *fs_info = device->fs_info;
				538	struct bio *pending;
				539	struct backing_dev_info *bdi;
				540	struct btrfs_pending_bios *pending_bios;
				541	struct bio *tail;
				542	struct bio *cur;
				543	int again = 0;
				544	unsigned long num_run;
				545	unsigned long batch_run = 0;
				546	unsigned long last_waited = 0;
				547	int force_reg = 0;
				548	int sync_pending = 0;
				549	struct blk_plug plug;
				550
				551	/*
				552	* this function runs all the bios we've collected for
				553	* a particular device. We don't want to wander off to
				554	* another device without first sending all of these down.
				555	* So, setup a plug here and finish it off before we return
				556	*/
				557	blk_start_plug(&plug);
				558
				559	bdi = device->bdev->bd_bdi;
				560
				561	loop:
				562	spin_lock(&device->io_lock);
				563
				564	loop_lock:
				565	num_run = 0;
				566
				567	/* take all the bios off the list at once and process them
				568	* later on (without the lock held). But, remember the
				569	* tail and other pointers so the bios can be properly reinserted
				570	* into the list if we hit congestion
				571	*/
				572	if (!force_reg && device->pending_sync_bios.head) {
				573	pending_bios = &device->pending_sync_bios;
				574	force_reg = 1;
				575	} else {
				576	pending_bios = &device->pending_bios;
				577	force_reg = 0;
				578	}
				579
				580	pending = pending_bios->head;
				581	tail = pending_bios->tail;
				582	WARN_ON(pending && !tail);
				583
				584	/*
				585	* if pending was null this time around, no bios need processing
				586	* at all and we can stop. Otherwise it'll loop back up again
				587	* and do an additional check so no bios are missed.
				588	*
				589	* device->running_pending is used to synchronize with the
				590	* schedule_bio code.
				591	*/
				592	if (device->pending_sync_bios.head == NULL &&
				593	device->pending_bios.head == NULL) {
				594	again = 0;
				595	device->running_pending = 0;
				596	} else {
				597	again = 1;
				598	device->running_pending = 1;
				599	}
				600
				601	pending_bios->head = NULL;
				602	pending_bios->tail = NULL;
				603
				604	spin_unlock(&device->io_lock);
				605
				606	while (pending) {
				607
				608	rmb();
				609	/* we want to work on both lists, but do more bios on the
				610	* sync list than the regular list
				611	*/
				612	if ((num_run > 32 &&
				613	pending_bios != &device->pending_sync_bios &&
				614	device->pending_sync_bios.head) \|\|
				615	(num_run > 64 && pending_bios == &device->pending_sync_bios &&
				616	device->pending_bios.head)) {
				617	spin_lock(&device->io_lock);
				618	requeue_list(pending_bios, pending, tail);
				619	goto loop_lock;
				620	}
				621
				622	cur = pending;
				623	pending = pending->bi_next;
				624	cur->bi_next = NULL;
				625
				626	BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
				627
				628	/*
				629	* if we're doing the sync list, record that our
				630	* plug has some sync requests on it
				631	*
				632	* If we're doing the regular list and there are
				633	* sync requests sitting around, unplug before
				634	* we add more
				635	*/
				636	if (pending_bios == &device->pending_sync_bios) {
				637	sync_pending = 1;
				638	} else if (sync_pending) {
				639	blk_finish_plug(&plug);
				640	blk_start_plug(&plug);
				641	sync_pending = 0;
				642	}
				643
				644	btrfsic_submit_bio(cur);
				645	num_run++;
				646	batch_run++;
				647
				648	cond_resched();
				649
				650	/*
				651	* we made progress, there is more work to do and the bdi
				652	* is now congested. Back off and let other work structs
				653	* run instead
				654	*/
				655	if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
				656	fs_info->fs_devices->open_devices > 1) {
				657	struct io_context *ioc;
				658
				659	ioc = current->io_context;
				660
				661	/*
				662	* the main goal here is that we don't want to
				663	* block if we're going to be able to submit
				664	* more requests without blocking.
				665	*
				666	* This code does two great things, it pokes into
				667	* the elevator code from a filesystem _and_
				668	* it makes assumptions about how batching works.
				669	*/
				670	if (ioc && ioc->nr_batch_requests > 0 &&
				671	time_before(jiffies, ioc->last_waited + HZ/50UL) &&
				672	(last_waited == 0 \|\|
				673	ioc->last_waited == last_waited)) {
				674	/*
				675	* we want to go through our batch of
				676	* requests and stop. So, we copy out
				677	* the ioc->last_waited time and test
				678	* against it before looping
				679	*/
				680	last_waited = ioc->last_waited;
				681	cond_resched();
				682	continue;
				683	}
				684	spin_lock(&device->io_lock);
				685	requeue_list(pending_bios, pending, tail);
				686	device->running_pending = 1;
				687
				688	spin_unlock(&device->io_lock);
				689	btrfs_queue_work(fs_info->submit_workers,
				690	&device->work);
				691	goto done;
				692	}
				693	}
				694
				695	cond_resched();
				696	if (again)
				697	goto loop;
				698
				699	spin_lock(&device->io_lock);
				700	if (device->pending_bios.head \|\| device->pending_sync_bios.head)
				701	goto loop_lock;
				702	spin_unlock(&device->io_lock);
				703
				704	done:
				705	blk_finish_plug(&plug);
				706	}
				707
				708	static void pending_bios_fn(struct btrfs_work *work)
				709	{
				710	struct btrfs_device *device;
				711
				712	device = container_of(work, struct btrfs_device, work);
				713	run_scheduled_bios(device);
				714	}
				715
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	716	static bool device_path_matched(const char path, struct btrfs_device device)
				717	{
				718	int found;
				719
				720	rcu_read_lock();
				721	found = strcmp(rcu_str_deref(device->name), path);
				722	rcu_read_unlock();
				723
				724	return found == 0;
				725	}
				726
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	727	/*
				728	* Search and remove all stale (devices which are not mounted) devices.
				729	* When both inputs are NULL, it will search and release all stale devices.
				730	* path: Optional. When provided will it release all unmounted devices
				731	* matching this path only.
				732	* skip_dev: Optional. Will skip this device when searching for the stale
				733	* devices.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	734	* Return: 0 for success or if @path is NULL.
				735	* -EBUSY if @path is a mounted device.
				736	* -ENOENT if @path does not match any device in the list.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	737	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	738	static int btrfs_free_stale_devices(const char *path,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	739	struct btrfs_device *skip_device)
				740	{
				741	struct btrfs_fs_devices fs_devices, tmp_fs_devices;
				742	struct btrfs_device device, tmp_device;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	743	int ret = 0;
				744
				745	if (path)
				746	ret = -ENOENT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	747
				748	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	749
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	750	mutex_lock(&fs_devices->device_list_mutex);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	751	list_for_each_entry_safe(device, tmp_device,
				752	&fs_devices->devices, dev_list) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	753	if (skip_device && skip_device == device)
				754	continue;
				755	if (path && !device->name)
				756	continue;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	757	if (path && !device_path_matched(path, device))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	758	continue;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	759	if (fs_devices->opened) {
				760	/* for an already deleted device return 0 */
				761	if (path && ret != 0)
				762	ret = -EBUSY;
				763	break;
				764	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	765
				766	/* delete the stale device */
				767	fs_devices->num_devices--;
				768	list_del(&device->dev_list);
				769	btrfs_free_device(device);
				770
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	771	ret = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	772	if (fs_devices->num_devices == 0)
				773	break;
				774	}
				775	mutex_unlock(&fs_devices->device_list_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	776
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	777	if (fs_devices->num_devices == 0) {
				778	btrfs_sysfs_remove_fsid(fs_devices);
				779	list_del(&fs_devices->fs_list);
				780	free_fs_devices(fs_devices);
				781	}
				782	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	783
				784	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	785	}
				786
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	787	/*
				788	* This is only used on mount, and we are protected from competing things
				789	* messing with our fs_devices by the uuid_mutex, thus we do not need the
				790	* fs_devices->device_list_mutex here.
				791	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	792	static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
				793	struct btrfs_device *device, fmode_t flags,
				794	void *holder)
				795	{
				796	struct request_queue *q;
				797	struct block_device *bdev;
				798	struct buffer_head *bh;
				799	struct btrfs_super_block *disk_super;
				800	u64 devid;
				801	int ret;
				802
				803	if (device->bdev)
				804	return -EINVAL;
				805	if (!device->name)
				806	return -EINVAL;
				807
				808	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				809	&bdev, &bh);
				810	if (ret)
				811	return ret;
				812
				813	disk_super = (struct btrfs_super_block *)bh->b_data;
				814	devid = btrfs_stack_device_id(&disk_super->dev_item);
				815	if (devid != device->devid)
				816	goto error_brelse;
				817
				818	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
				819	goto error_brelse;
				820
				821	device->generation = btrfs_super_generation(disk_super);
				822
				823	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	824	if (btrfs_super_incompat_flags(disk_super) &
				825	BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
				826	pr_err(
				827	"BTRFS: Invalid seeding and uuid-changed device detected\n");
				828	goto error_brelse;
				829	}
				830
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	831	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
				832	fs_devices->seeding = 1;
				833	} else {
				834	if (bdev_read_only(bdev))
				835	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
				836	else
				837	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
				838	}
				839
				840	q = bdev_get_queue(bdev);
				841	if (!blk_queue_nonrot(q))
				842	fs_devices->rotating = 1;
				843
				844	device->bdev = bdev;
				845	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
				846	device->mode = flags;
				847
				848	fs_devices->open_devices++;
				849	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
				850	device->devid != BTRFS_DEV_REPLACE_DEVID) {
				851	fs_devices->rw_devices++;
				852	list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
				853	}
				854	brelse(bh);
				855
				856	return 0;
				857
				858	error_brelse:
				859	brelse(bh);
				860	blkdev_put(bdev, flags);
				861
				862	return -EINVAL;
				863	}
				864
				865	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	866	* Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
				867	* being created with a disk that has already completed its fsid change.
				868	*/
				869	static struct btrfs_fs_devices *find_fsid_inprogress(
				870	struct btrfs_super_block *disk_super)
				871	{
				872	struct btrfs_fs_devices *fs_devices;
				873
				874	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
				875	if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
				876	BTRFS_FSID_SIZE) != 0 &&
				877	memcmp(fs_devices->metadata_uuid, disk_super->fsid,
				878	BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
				879	return fs_devices;
				880	}
				881	}
				882
				883	return NULL;
				884	}
				885
				886
				887	static struct btrfs_fs_devices *find_fsid_changed(
				888	struct btrfs_super_block *disk_super)
				889	{
				890	struct btrfs_fs_devices *fs_devices;
				891
				892	/*
				893	* Handles the case where scanned device is part of an fs that had
				894	* multiple successful changes of FSID but curently device didn't
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	895	* observe it. Meaning our fsid will be different than theirs. We need
				896	* to handle two subcases :
				897	* 1 - The fs still continues to have different METADATA/FSID uuids.
				898	* 2 - The fs is switched back to its original FSID (METADATA/FSID
				899	* are equal).
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	900	*/
				901	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	902	/* Changed UUIDs */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	903	if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
				904	BTRFS_FSID_SIZE) != 0 &&
				905	memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
				906	BTRFS_FSID_SIZE) == 0 &&
				907	memcmp(fs_devices->fsid, disk_super->fsid,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	908	BTRFS_FSID_SIZE) != 0)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	909	return fs_devices;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	910
				911	/* Unchanged UUIDs */
				912	if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
				913	BTRFS_FSID_SIZE) == 0 &&
				914	memcmp(fs_devices->fsid, disk_super->metadata_uuid,
				915	BTRFS_FSID_SIZE) == 0)
				916	return fs_devices;
				917	}
				918
				919	return NULL;
				920	}
				921
				922	static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				923	struct btrfs_super_block *disk_super)
				924	{
				925	struct btrfs_fs_devices *fs_devices;
				926
				927	/*
				928	* Handle the case where the scanned device is part of an fs whose last
				929	* metadata UUID change reverted it to the original FSID. At the same
				930	* time * fs_devices was first created by another constitutent device
				931	* which didn't fully observe the operation. This results in an
				932	* btrfs_fs_devices created with metadata/fsid different AND
				933	* btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
				934	* fs_devices equal to the FSID of the disk.
				935	*/
				936	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
				937	if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
				938	BTRFS_FSID_SIZE) != 0 &&
				939	memcmp(fs_devices->metadata_uuid, disk_super->fsid,
				940	BTRFS_FSID_SIZE) == 0 &&
				941	fs_devices->fsid_change)
				942	return fs_devices;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	943	}
				944
				945	return NULL;
				946	}
				947	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	948	* Add new device to list of registered devices
				949	*
				950	* Returns:
				951	* device pointer which was just added or updated when successful
				952	* error pointer when failed
				953	*/
				954	static noinline struct btrfs_device device_list_add(const char path,
				955	struct btrfs_super_block *disk_super,
				956	bool *new_device_added)
				957	{
				958	struct btrfs_device *device;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	959	struct btrfs_fs_devices *fs_devices = NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	960	struct rcu_string *name;
				961	u64 found_transid = btrfs_super_generation(disk_super);
				962	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	963	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
				964	BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
				965	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
				966	BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	967
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	968	if (fsid_change_in_progress) {
				969	if (!has_metadata_uuid) {
				970	/*
				971	* When we have an image which has CHANGING_FSID_V2 set
				972	* it might belong to either a filesystem which has
				973	* disks with completed fsid change or it might belong
				974	* to fs with no UUID changes in effect, handle both.
				975	*/
				976	fs_devices = find_fsid_inprogress(disk_super);
				977	if (!fs_devices)
				978	fs_devices = find_fsid(disk_super->fsid, NULL);
				979	} else {
				980	fs_devices = find_fsid_changed(disk_super);
				981	}
				982	} else if (has_metadata_uuid) {
				983	fs_devices = find_fsid(disk_super->fsid,
				984	disk_super->metadata_uuid);
				985	} else {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	986	fs_devices = find_fsid_reverted_metadata(disk_super);
				987	if (!fs_devices)
				988	fs_devices = find_fsid(disk_super->fsid, NULL);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	989	}
				990
				991
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	992	if (!fs_devices) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	993	if (has_metadata_uuid)
				994	fs_devices = alloc_fs_devices(disk_super->fsid,
				995	disk_super->metadata_uuid);
				996	else
				997	fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
				998
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	999	if (IS_ERR(fs_devices))
				1000	return ERR_CAST(fs_devices);
				1001
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1002	fs_devices->fsid_change = fsid_change_in_progress;
				1003
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1004	mutex_lock(&fs_devices->device_list_mutex);
				1005	list_add(&fs_devices->fs_list, &fs_uuids);
				1006
				1007	device = NULL;
				1008	} else {
				1009	mutex_lock(&fs_devices->device_list_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1010	device = btrfs_find_device(fs_devices, devid,
				1011	disk_super->dev_item.uuid, NULL, false);
				1012
				1013	/*
				1014	* If this disk has been pulled into an fs devices created by
				1015	* a device which had the CHANGING_FSID_V2 flag then replace the
				1016	* metadata_uuid/fsid values of the fs_devices.
				1017	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1018	if (fs_devices->fsid_change &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1019	found_transid > fs_devices->latest_generation) {
				1020	memcpy(fs_devices->fsid, disk_super->fsid,
				1021	BTRFS_FSID_SIZE);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1022
				1023	if (has_metadata_uuid)
				1024	memcpy(fs_devices->metadata_uuid,
				1025	disk_super->metadata_uuid,
				1026	BTRFS_FSID_SIZE);
				1027	else
				1028	memcpy(fs_devices->metadata_uuid,
				1029	disk_super->fsid, BTRFS_FSID_SIZE);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1030
				1031	fs_devices->fsid_change = false;
				1032	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1033	}
				1034
				1035	if (!device) {
				1036	if (fs_devices->opened) {
				1037	mutex_unlock(&fs_devices->device_list_mutex);
				1038	return ERR_PTR(-EBUSY);
				1039	}
				1040
				1041	device = btrfs_alloc_device(NULL, &devid,
				1042	disk_super->dev_item.uuid);
				1043	if (IS_ERR(device)) {
				1044	mutex_unlock(&fs_devices->device_list_mutex);
				1045	/* we can safely leave the fs_devices entry around */
				1046	return device;
				1047	}
				1048
				1049	name = rcu_string_strdup(path, GFP_NOFS);
				1050	if (!name) {
				1051	btrfs_free_device(device);
				1052	mutex_unlock(&fs_devices->device_list_mutex);
				1053	return ERR_PTR(-ENOMEM);
				1054	}
				1055	rcu_assign_pointer(device->name, name);
				1056
				1057	list_add_rcu(&device->dev_list, &fs_devices->devices);
				1058	fs_devices->num_devices++;
				1059
				1060	device->fs_devices = fs_devices;
				1061	*new_device_added = true;
				1062
				1063	if (disk_super->label[0])
				1064	pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				1065	disk_super->label, devid, found_transid, path);
				1066	else
				1067	pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				1068	disk_super->fsid, devid, found_transid, path);
				1069
				1070	} else if (!device->name \|\| strcmp(device->name->str, path)) {
				1071	/*
				1072	* When FS is already mounted.
				1073	* 1. If you are here and if the device->name is NULL that
				1074	* means this device was missing at time of FS mount.
				1075	* 2. If you are here and if the device->name is different
				1076	* from 'path' that means either
				1077	* a. The same device disappeared and reappeared with
				1078	* different name. or
				1079	* b. The missing-disk-which-was-replaced, has
				1080	* reappeared now.
				1081	*
				1082	* We must allow 1 and 2a above. But 2b would be a spurious
				1083	* and unintentional.
				1084	*
				1085	* Further in case of 1 and 2a above, the disk at 'path'
				1086	* would have missed some transaction when it was away and
				1087	* in case of 2a the stale bdev has to be updated as well.
				1088	* 2b must not be allowed at all time.
				1089	*/
				1090
				1091	/*
				1092	* For now, we do allow update to btrfs_fs_device through the
				1093	* btrfs dev scan cli after FS has been mounted. We're still
				1094	* tracking a problem where systems fail mount by subvolume id
				1095	* when we reject replacement on a mounted FS.
				1096	*/
				1097	if (!fs_devices->opened && found_transid < device->generation) {
				1098	/*
				1099	* That is if the FS is _not_ mounted and if you
				1100	* are here, that means there is more than one
				1101	* disk with same uuid and devid.We keep the one
				1102	* with larger generation number or the last-in if
				1103	* generation are equal.
				1104	*/
				1105	mutex_unlock(&fs_devices->device_list_mutex);
				1106	return ERR_PTR(-EEXIST);
				1107	}
				1108
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1109	/*
				1110	* We are going to replace the device path for a given devid,
				1111	* make sure it's the same device if the device is mounted
				1112	*/
				1113	if (device->bdev) {
				1114	struct block_device *path_bdev;
				1115
				1116	path_bdev = lookup_bdev(path);
				1117	if (IS_ERR(path_bdev)) {
				1118	mutex_unlock(&fs_devices->device_list_mutex);
				1119	return ERR_CAST(path_bdev);
				1120	}
				1121
				1122	if (device->bdev != path_bdev) {
				1123	bdput(path_bdev);
				1124	mutex_unlock(&fs_devices->device_list_mutex);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1125	/*
				1126	* device->fs_info may not be reliable here, so
				1127	* pass in a NULL instead. This avoids a
				1128	* possible use-after-free when the fs_info and
				1129	* fs_info->sb are already torn down.
				1130	*/
				1131	btrfs_warn_in_rcu(NULL,
				1132	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
				1133	path, devid, found_transid,
				1134	current->comm,
				1135	task_pid_nr(current));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1136	return ERR_PTR(-EEXIST);
				1137	}
				1138	bdput(path_bdev);
				1139	btrfs_info_in_rcu(device->fs_info,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1140	"devid %llu device path %s changed to %s scanned by %s (%d)",
				1141	devid, rcu_str_deref(device->name),
				1142	path, current->comm,
				1143	task_pid_nr(current));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1144	}
				1145
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1146	name = rcu_string_strdup(path, GFP_NOFS);
				1147	if (!name) {
				1148	mutex_unlock(&fs_devices->device_list_mutex);
				1149	return ERR_PTR(-ENOMEM);
				1150	}
				1151	rcu_string_free(device->name);
				1152	rcu_assign_pointer(device->name, name);
				1153	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
				1154	fs_devices->missing_devices--;
				1155	clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
				1156	}
				1157	}
				1158
				1159	/*
				1160	* Unmount does not free the btrfs_device struct but would zero
				1161	* generation along with most of the other members. So just update
				1162	* it back. We need it to pick the disk with largest generation
				1163	* (as above).
				1164	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1165	if (!fs_devices->opened) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1166	device->generation = found_transid;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1167	fs_devices->latest_generation = max_t(u64, found_transid,
				1168	fs_devices->latest_generation);
				1169	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1170
				1171	fs_devices->total_devices = btrfs_super_num_devices(disk_super);
				1172
				1173	mutex_unlock(&fs_devices->device_list_mutex);
				1174	return device;
				1175	}
				1176
				1177	static struct btrfs_fs_devices clone_fs_devices(struct btrfs_fs_devices orig)
				1178	{
				1179	struct btrfs_fs_devices *fs_devices;
				1180	struct btrfs_device *device;
				1181	struct btrfs_device *orig_dev;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1182	int ret = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1183
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1184	fs_devices = alloc_fs_devices(orig->fsid, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1185	if (IS_ERR(fs_devices))
				1186	return fs_devices;
				1187
				1188	mutex_lock(&orig->device_list_mutex);
				1189	fs_devices->total_devices = orig->total_devices;
				1190
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1191	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
				1192	struct rcu_string *name;
				1193
				1194	device = btrfs_alloc_device(NULL, &orig_dev->devid,
				1195	orig_dev->uuid);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1196	if (IS_ERR(device)) {
				1197	ret = PTR_ERR(device);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1198	goto error;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1199	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1200
				1201	/*
				1202	* This is ok to do without rcu read locked because we hold the
				1203	* uuid mutex so nothing we touch in here is going to disappear.
				1204	*/
				1205	if (orig_dev->name) {
				1206	name = rcu_string_strdup(orig_dev->name->str,
				1207	GFP_KERNEL);
				1208	if (!name) {
				1209	btrfs_free_device(device);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1210	ret = -ENOMEM;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1211	goto error;
				1212	}
				1213	rcu_assign_pointer(device->name, name);
				1214	}
				1215
				1216	list_add(&device->dev_list, &fs_devices->devices);
				1217	device->fs_devices = fs_devices;
				1218	fs_devices->num_devices++;
				1219	}
				1220	mutex_unlock(&orig->device_list_mutex);
				1221	return fs_devices;
				1222	error:
				1223	mutex_unlock(&orig->device_list_mutex);
				1224	free_fs_devices(fs_devices);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1225	return ERR_PTR(ret);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1226	}
				1227
				1228	/*
				1229	* After we have read the system tree and know devids belonging to
				1230	* this filesystem, remove the device which does not belong there.
				1231	*/
				1232	void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
				1233	{
				1234	struct btrfs_device device, next;
				1235	struct btrfs_device *latest_dev = NULL;
				1236
				1237	mutex_lock(&uuid_mutex);
				1238	again:
				1239	/* This is the initialized path, it is safe to release the devices. */
				1240	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
				1241	if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				1242	&device->dev_state)) {
				1243	if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				1244	&device->dev_state) &&
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1245	!test_bit(BTRFS_DEV_STATE_MISSING,
				1246	&device->dev_state) &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1247	(!latest_dev \|\|
				1248	device->generation > latest_dev->generation)) {
				1249	latest_dev = device;
				1250	}
				1251	continue;
				1252	}
				1253
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1254	/*
				1255	* We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
				1256	* in btrfs_init_dev_replace() so just continue.
				1257	*/
				1258	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
				1259	continue;
				1260
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1261	if (device->bdev) {
				1262	blkdev_put(device->bdev, device->mode);
				1263	device->bdev = NULL;
				1264	fs_devices->open_devices--;
				1265	}
				1266	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				1267	list_del_init(&device->dev_alloc_list);
				1268	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1269	fs_devices->rw_devices--;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1270	}
				1271	list_del_init(&device->dev_list);
				1272	fs_devices->num_devices--;
				1273	btrfs_free_device(device);
				1274	}
				1275
				1276	if (fs_devices->seed) {
				1277	fs_devices = fs_devices->seed;
				1278	goto again;
				1279	}
				1280
				1281	fs_devices->latest_bdev = latest_dev->bdev;
				1282
				1283	mutex_unlock(&uuid_mutex);
				1284	}
				1285
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1286	static void btrfs_close_bdev(struct btrfs_device *device)
				1287	{
				1288	if (!device->bdev)
				1289	return;
				1290
				1291	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				1292	sync_blockdev(device->bdev);
				1293	invalidate_bdev(device->bdev);
				1294	}
				1295
				1296	blkdev_put(device->bdev, device->mode);
				1297	}
				1298
				1299	static void btrfs_close_one_device(struct btrfs_device *device)
				1300	{
				1301	struct btrfs_fs_devices *fs_devices = device->fs_devices;
				1302	struct btrfs_device *new_device;
				1303	struct rcu_string *name;
				1304
				1305	if (device->bdev)
				1306	fs_devices->open_devices--;
				1307
				1308	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
				1309	device->devid != BTRFS_DEV_REPLACE_DEVID) {
				1310	list_del_init(&device->dev_alloc_list);
				1311	fs_devices->rw_devices--;
				1312	}
				1313
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1314	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
				1315	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
				1316
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1317	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
				1318	fs_devices->missing_devices--;
				1319
				1320	btrfs_close_bdev(device);
				1321
				1322	new_device = btrfs_alloc_device(NULL, &device->devid,
				1323	device->uuid);
				1324	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
				1325
				1326	/* Safe because we are under uuid_mutex */
				1327	if (device->name) {
				1328	name = rcu_string_strdup(device->name->str, GFP_NOFS);
				1329	BUG_ON(!name); /* -ENOMEM */
				1330	rcu_assign_pointer(new_device->name, name);
				1331	}
				1332
				1333	list_replace_rcu(&device->dev_list, &new_device->dev_list);
				1334	new_device->fs_devices = device->fs_devices;
				1335
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1336	synchronize_rcu();
				1337	btrfs_free_device(device);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1338	}
				1339
				1340	static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
				1341	{
				1342	struct btrfs_device device, tmp;
				1343
				1344	if (--fs_devices->opened > 0)
				1345	return 0;
				1346
				1347	mutex_lock(&fs_devices->device_list_mutex);
				1348	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
				1349	btrfs_close_one_device(device);
				1350	}
				1351	mutex_unlock(&fs_devices->device_list_mutex);
				1352
				1353	WARN_ON(fs_devices->open_devices);
				1354	WARN_ON(fs_devices->rw_devices);
				1355	fs_devices->opened = 0;
				1356	fs_devices->seeding = 0;
				1357
				1358	return 0;
				1359	}
				1360
				1361	int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
				1362	{
				1363	struct btrfs_fs_devices *seed_devices = NULL;
				1364	int ret;
				1365
				1366	mutex_lock(&uuid_mutex);
				1367	ret = close_fs_devices(fs_devices);
				1368	if (!fs_devices->opened) {
				1369	seed_devices = fs_devices->seed;
				1370	fs_devices->seed = NULL;
				1371	}
				1372	mutex_unlock(&uuid_mutex);
				1373
				1374	while (seed_devices) {
				1375	fs_devices = seed_devices;
				1376	seed_devices = fs_devices->seed;
				1377	close_fs_devices(fs_devices);
				1378	free_fs_devices(fs_devices);
				1379	}
				1380	return ret;
				1381	}
				1382
				1383	static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				1384	fmode_t flags, void *holder)
				1385	{
				1386	struct btrfs_device *device;
				1387	struct btrfs_device *latest_dev = NULL;
				1388	int ret = 0;
				1389
				1390	flags \|= FMODE_EXCL;
				1391
				1392	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				1393	/* Just open everything we can; ignore failures here */
				1394	if (btrfs_open_one_device(fs_devices, device, flags, holder))
				1395	continue;
				1396
				1397	if (!latest_dev \|\|
				1398	device->generation > latest_dev->generation)
				1399	latest_dev = device;
				1400	}
				1401	if (fs_devices->open_devices == 0) {
				1402	ret = -EINVAL;
				1403	goto out;
				1404	}
				1405	fs_devices->opened = 1;
				1406	fs_devices->latest_bdev = latest_dev->bdev;
				1407	fs_devices->total_rw_bytes = 0;
				1408	out:
				1409	return ret;
				1410	}
				1411
				1412	static int devid_cmp(void priv, struct list_head a, struct list_head *b)
				1413	{
				1414	struct btrfs_device dev1, dev2;
				1415
				1416	dev1 = list_entry(a, struct btrfs_device, dev_list);
				1417	dev2 = list_entry(b, struct btrfs_device, dev_list);
				1418
				1419	if (dev1->devid < dev2->devid)
				1420	return -1;
				1421	else if (dev1->devid > dev2->devid)
				1422	return 1;
				1423	return 0;
				1424	}
				1425
				1426	int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				1427	fmode_t flags, void *holder)
				1428	{
				1429	int ret;
				1430
				1431	lockdep_assert_held(&uuid_mutex);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1432	/*
				1433	* The device_list_mutex cannot be taken here in case opening the
				1434	* underlying device takes further locks like bd_mutex.
				1435	*
				1436	* We also don't need the lock here as this is called during mount and
				1437	* exclusion is provided by uuid_mutex
				1438	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1439
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1440	if (fs_devices->opened) {
				1441	fs_devices->opened++;
				1442	ret = 0;
				1443	} else {
				1444	list_sort(NULL, &fs_devices->devices, devid_cmp);
				1445	ret = open_fs_devices(fs_devices, flags, holder);
				1446	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1447
				1448	return ret;
				1449	}
				1450
				1451	static void btrfs_release_disk_super(struct page *page)
				1452	{
				1453	kunmap(page);
				1454	put_page(page);
				1455	}
				1456
				1457	static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				1458	struct page **page,
				1459	struct btrfs_super_block **disk_super)
				1460	{
				1461	void *p;
				1462	pgoff_t index;
				1463
				1464	/* make sure our super fits in the device */
				1465	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
				1466	return 1;
				1467
				1468	/* make sure our super fits in the page */
				1469	if (sizeof(**disk_super) > PAGE_SIZE)
				1470	return 1;
				1471
				1472	/* make sure our super doesn't straddle pages on disk */
				1473	index = bytenr >> PAGE_SHIFT;
				1474	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
				1475	return 1;
				1476
				1477	/* pull in the page with our super */
				1478	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				1479	index, GFP_KERNEL);
				1480
				1481	if (IS_ERR_OR_NULL(*page))
				1482	return 1;
				1483
				1484	p = kmap(*page);
				1485
				1486	/* align our pointer to the offset of the super block */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1487	*disk_super = p + offset_in_page(bytenr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1488
				1489	if (btrfs_super_bytenr(*disk_super) != bytenr \|\|
				1490	btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
				1491	btrfs_release_disk_super(*page);
				1492	return 1;
				1493	}
				1494
				1495	if ((*disk_super)->label[0] &&
				1496	(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
				1497	(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
				1498
				1499	return 0;
				1500	}
				1501
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1502	int btrfs_forget_devices(const char *path)
				1503	{
				1504	int ret;
				1505
				1506	mutex_lock(&uuid_mutex);
				1507	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
				1508	mutex_unlock(&uuid_mutex);
				1509
				1510	return ret;
				1511	}
				1512
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1513	/*
				1514	* Look for a btrfs signature on a device. This may be called out of the mount path
				1515	* and we are not allowed to call set_blocksize during the scan. The superblock
				1516	* is read via pagecache
				1517	*/
				1518	struct btrfs_device btrfs_scan_one_device(const char path, fmode_t flags,
				1519	void *holder)
				1520	{
				1521	struct btrfs_super_block *disk_super;
				1522	bool new_device_added = false;
				1523	struct btrfs_device *device = NULL;
				1524	struct block_device *bdev;
				1525	struct page *page;
				1526	u64 bytenr;
				1527
				1528	lockdep_assert_held(&uuid_mutex);
				1529
				1530	/*
				1531	* we would like to check all the supers, but that would make
				1532	* a btrfs mount succeed after a mkfs from a different FS.
				1533	* So, we need to add a special mount option to scan for
				1534	* later supers, using BTRFS_SUPER_MIRROR_MAX instead
				1535	*/
				1536	bytenr = btrfs_sb_offset(0);
				1537	flags \|= FMODE_EXCL;
				1538
				1539	bdev = blkdev_get_by_path(path, flags, holder);
				1540	if (IS_ERR(bdev))
				1541	return ERR_CAST(bdev);
				1542
				1543	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
				1544	device = ERR_PTR(-EINVAL);
				1545	goto error_bdev_put;
				1546	}
				1547
				1548	device = device_list_add(path, disk_super, &new_device_added);
				1549	if (!IS_ERR(device)) {
				1550	if (new_device_added)
				1551	btrfs_free_stale_devices(path, device);
				1552	}
				1553
				1554	btrfs_release_disk_super(page);
				1555
				1556	error_bdev_put:
				1557	blkdev_put(bdev, flags);
				1558
				1559	return device;
				1560	}
				1561
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1562	/*
				1563	* Try to find a chunk that intersects [start, start + len] range and when one
				1564	* such is found, record the end of it in *start
				1565	*/
				1566	static bool contains_pending_extent(struct btrfs_device device, u64 start,
				1567	u64 len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1568	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1569	u64 physical_start, physical_end;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1570
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1571	lockdep_assert_held(&device->fs_info->chunk_mutex);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1572
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1573	if (!find_first_extent_bit(&device->alloc_state, *start,
				1574	&physical_start, &physical_end,
				1575	CHUNK_ALLOCATED, NULL)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1576
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1577	if (in_range(physical_start, *start, len) \|\|
				1578	in_range(*start, physical_start,
				1579	physical_end - physical_start)) {
				1580	*start = physical_end + 1;
				1581	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1582	}
				1583	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1584	return false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1585	}
				1586
				1587
				1588	/*
				1589	* find_free_dev_extent_start - find free space in the specified device
				1590	* @device: the device which we search the free space in
				1591	* @num_bytes: the size of the free space that we need
				1592	* @search_start: the position from which to begin the search
				1593	* @start: store the start of the free space.
				1594	* @len: the size of the free space. that we find, or the size
				1595	* of the max free space if we don't find suitable free space
				1596	*
				1597	* this uses a pretty simple search, the expectation is that it is
				1598	* called very infrequently and that a given device has a small number
				1599	* of extents
				1600	*
				1601	* @start is used to store the start of the free space if we find. But if we
				1602	* don't find suitable free space, it will be used to store the start position
				1603	* of the max free space.
				1604	*
				1605	* @len is used to store the size of the free space that we find.
				1606	* But if we don't find suitable free space, it is used to store the size of
				1607	* the max free space.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1608	*
				1609	* NOTE: This function will search commit root of device tree, and does extra
				1610	* check to ensure dev extents are not double allocated.
				1611	* This makes the function safe to allocate dev extents but may not report
				1612	* correct usable device space, as device extent freed in current transaction
				1613	* is not reported as avaiable.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1614	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1615	static int find_free_dev_extent_start(struct btrfs_device *device,
				1616	u64 num_bytes, u64 search_start, u64 *start,
				1617	u64 *len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1618	{
				1619	struct btrfs_fs_info *fs_info = device->fs_info;
				1620	struct btrfs_root *root = fs_info->dev_root;
				1621	struct btrfs_key key;
				1622	struct btrfs_dev_extent *dev_extent;
				1623	struct btrfs_path *path;
				1624	u64 hole_size;
				1625	u64 max_hole_start;
				1626	u64 max_hole_size;
				1627	u64 extent_end;
				1628	u64 search_end = device->total_bytes;
				1629	int ret;
				1630	int slot;
				1631	struct extent_buffer *l;
				1632
				1633	/*
				1634	* We don't want to overwrite the superblock on the drive nor any area
				1635	* used by the boot loader (grub for example), so we make sure to start
				1636	* at an offset of at least 1MB.
				1637	*/
				1638	search_start = max_t(u64, search_start, SZ_1M);
				1639
				1640	path = btrfs_alloc_path();
				1641	if (!path)
				1642	return -ENOMEM;
				1643
				1644	max_hole_start = search_start;
				1645	max_hole_size = 0;
				1646
				1647	again:
				1648	if (search_start >= search_end \|\|
				1649	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				1650	ret = -ENOSPC;
				1651	goto out;
				1652	}
				1653
				1654	path->reada = READA_FORWARD;
				1655	path->search_commit_root = 1;
				1656	path->skip_locking = 1;
				1657
				1658	key.objectid = device->devid;
				1659	key.offset = search_start;
				1660	key.type = BTRFS_DEV_EXTENT_KEY;
				1661
				1662	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1663	if (ret < 0)
				1664	goto out;
				1665	if (ret > 0) {
				1666	ret = btrfs_previous_item(root, path, key.objectid, key.type);
				1667	if (ret < 0)
				1668	goto out;
				1669	}
				1670
				1671	while (1) {
				1672	l = path->nodes[0];
				1673	slot = path->slots[0];
				1674	if (slot >= btrfs_header_nritems(l)) {
				1675	ret = btrfs_next_leaf(root, path);
				1676	if (ret == 0)
				1677	continue;
				1678	if (ret < 0)
				1679	goto out;
				1680
				1681	break;
				1682	}
				1683	btrfs_item_key_to_cpu(l, &key, slot);
				1684
				1685	if (key.objectid < device->devid)
				1686	goto next;
				1687
				1688	if (key.objectid > device->devid)
				1689	break;
				1690
				1691	if (key.type != BTRFS_DEV_EXTENT_KEY)
				1692	goto next;
				1693
				1694	if (key.offset > search_start) {
				1695	hole_size = key.offset - search_start;
				1696
				1697	/*
				1698	* Have to check before we set max_hole_start, otherwise
				1699	* we could end up sending back this offset anyway.
				1700	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1701	if (contains_pending_extent(device, &search_start,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1702	hole_size)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1703	if (key.offset >= search_start)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1704	hole_size = key.offset - search_start;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1705	else
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1706	hole_size = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1707	}
				1708
				1709	if (hole_size > max_hole_size) {
				1710	max_hole_start = search_start;
				1711	max_hole_size = hole_size;
				1712	}
				1713
				1714	/*
				1715	* If this free space is greater than which we need,
				1716	* it must be the max free space that we have found
				1717	* until now, so max_hole_start must point to the start
				1718	* of this free space and the length of this free space
				1719	* is stored in max_hole_size. Thus, we return
				1720	* max_hole_start and max_hole_size and go back to the
				1721	* caller.
				1722	*/
				1723	if (hole_size >= num_bytes) {
				1724	ret = 0;
				1725	goto out;
				1726	}
				1727	}
				1728
				1729	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
				1730	extent_end = key.offset + btrfs_dev_extent_length(l,
				1731	dev_extent);
				1732	if (extent_end > search_start)
				1733	search_start = extent_end;
				1734	next:
				1735	path->slots[0]++;
				1736	cond_resched();
				1737	}
				1738
				1739	/*
				1740	* At this point, search_start should be the end of
				1741	* allocated dev extents, and when shrinking the device,
				1742	* search_end may be smaller than search_start.
				1743	*/
				1744	if (search_end > search_start) {
				1745	hole_size = search_end - search_start;
				1746
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1747	if (contains_pending_extent(device, &search_start, hole_size)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1748	btrfs_release_path(path);
				1749	goto again;
				1750	}
				1751
				1752	if (hole_size > max_hole_size) {
				1753	max_hole_start = search_start;
				1754	max_hole_size = hole_size;
				1755	}
				1756	}
				1757
				1758	/* See above. */
				1759	if (max_hole_size < num_bytes)
				1760	ret = -ENOSPC;
				1761	else
				1762	ret = 0;
				1763
				1764	out:
				1765	btrfs_free_path(path);
				1766	*start = max_hole_start;
				1767	if (len)
				1768	*len = max_hole_size;
				1769	return ret;
				1770	}
				1771
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1772	int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1773	u64 start, u64 len)
				1774	{
				1775	/* FIXME use last free of some kind */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1776	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1777	}
				1778
				1779	static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				1780	struct btrfs_device *device,
				1781	u64 start, u64 *dev_extent_len)
				1782	{
				1783	struct btrfs_fs_info *fs_info = device->fs_info;
				1784	struct btrfs_root *root = fs_info->dev_root;
				1785	int ret;
				1786	struct btrfs_path *path;
				1787	struct btrfs_key key;
				1788	struct btrfs_key found_key;
				1789	struct extent_buffer *leaf = NULL;
				1790	struct btrfs_dev_extent *extent = NULL;
				1791
				1792	path = btrfs_alloc_path();
				1793	if (!path)
				1794	return -ENOMEM;
				1795
				1796	key.objectid = device->devid;
				1797	key.offset = start;
				1798	key.type = BTRFS_DEV_EXTENT_KEY;
				1799	again:
				1800	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1801	if (ret > 0) {
				1802	ret = btrfs_previous_item(root, path, key.objectid,
				1803	BTRFS_DEV_EXTENT_KEY);
				1804	if (ret)
				1805	goto out;
				1806	leaf = path->nodes[0];
				1807	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
				1808	extent = btrfs_item_ptr(leaf, path->slots[0],
				1809	struct btrfs_dev_extent);
				1810	BUG_ON(found_key.offset > start \|\| found_key.offset +
				1811	btrfs_dev_extent_length(leaf, extent) < start);
				1812	key = found_key;
				1813	btrfs_release_path(path);
				1814	goto again;
				1815	} else if (ret == 0) {
				1816	leaf = path->nodes[0];
				1817	extent = btrfs_item_ptr(leaf, path->slots[0],
				1818	struct btrfs_dev_extent);
				1819	} else {
				1820	btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
				1821	goto out;
				1822	}
				1823
				1824	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
				1825
				1826	ret = btrfs_del_item(trans, root, path);
				1827	if (ret) {
				1828	btrfs_handle_fs_error(fs_info, ret,
				1829	"Failed to remove dev extent item");
				1830	} else {
				1831	set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
				1832	}
				1833	out:
				1834	btrfs_free_path(path);
				1835	return ret;
				1836	}
				1837
				1838	static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				1839	struct btrfs_device *device,
				1840	u64 chunk_offset, u64 start, u64 num_bytes)
				1841	{
				1842	int ret;
				1843	struct btrfs_path *path;
				1844	struct btrfs_fs_info *fs_info = device->fs_info;
				1845	struct btrfs_root *root = fs_info->dev_root;
				1846	struct btrfs_dev_extent *extent;
				1847	struct extent_buffer *leaf;
				1848	struct btrfs_key key;
				1849
				1850	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
				1851	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
				1852	path = btrfs_alloc_path();
				1853	if (!path)
				1854	return -ENOMEM;
				1855
				1856	key.objectid = device->devid;
				1857	key.offset = start;
				1858	key.type = BTRFS_DEV_EXTENT_KEY;
				1859	ret = btrfs_insert_empty_item(trans, root, path, &key,
				1860	sizeof(*extent));
				1861	if (ret)
				1862	goto out;
				1863
				1864	leaf = path->nodes[0];
				1865	extent = btrfs_item_ptr(leaf, path->slots[0],
				1866	struct btrfs_dev_extent);
				1867	btrfs_set_dev_extent_chunk_tree(leaf, extent,
				1868	BTRFS_CHUNK_TREE_OBJECTID);
				1869	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
				1870	BTRFS_FIRST_CHUNK_TREE_OBJECTID);
				1871	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
				1872
				1873	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
				1874	btrfs_mark_buffer_dirty(leaf);
				1875	out:
				1876	btrfs_free_path(path);
				1877	return ret;
				1878	}
				1879
				1880	static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
				1881	{
				1882	struct extent_map_tree *em_tree;
				1883	struct extent_map *em;
				1884	struct rb_node *n;
				1885	u64 ret = 0;
				1886
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1887	em_tree = &fs_info->mapping_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1888	read_lock(&em_tree->lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1889	n = rb_last(&em_tree->map.rb_root);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1890	if (n) {
				1891	em = rb_entry(n, struct extent_map, rb_node);
				1892	ret = em->start + em->len;
				1893	}
				1894	read_unlock(&em_tree->lock);
				1895
				1896	return ret;
				1897	}
				1898
				1899	static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				1900	u64 *devid_ret)
				1901	{
				1902	int ret;
				1903	struct btrfs_key key;
				1904	struct btrfs_key found_key;
				1905	struct btrfs_path *path;
				1906
				1907	path = btrfs_alloc_path();
				1908	if (!path)
				1909	return -ENOMEM;
				1910
				1911	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				1912	key.type = BTRFS_DEV_ITEM_KEY;
				1913	key.offset = (u64)-1;
				1914
				1915	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
				1916	if (ret < 0)
				1917	goto error;
				1918
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1919	if (ret == 0) {
				1920	/* Corruption */
				1921	btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
				1922	ret = -EUCLEAN;
				1923	goto error;
				1924	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1925
				1926	ret = btrfs_previous_item(fs_info->chunk_root, path,
				1927	BTRFS_DEV_ITEMS_OBJECTID,
				1928	BTRFS_DEV_ITEM_KEY);
				1929	if (ret) {
				1930	*devid_ret = 1;
				1931	} else {
				1932	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				1933	path->slots[0]);
				1934	*devid_ret = found_key.offset + 1;
				1935	}
				1936	ret = 0;
				1937	error:
				1938	btrfs_free_path(path);
				1939	return ret;
				1940	}
				1941
				1942	/*
				1943	* the device information is stored in the chunk root
				1944	* the btrfs_device struct should be fully filled in
				1945	*/
				1946	static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
				1947	struct btrfs_device *device)
				1948	{
				1949	int ret;
				1950	struct btrfs_path *path;
				1951	struct btrfs_dev_item *dev_item;
				1952	struct extent_buffer *leaf;
				1953	struct btrfs_key key;
				1954	unsigned long ptr;
				1955
				1956	path = btrfs_alloc_path();
				1957	if (!path)
				1958	return -ENOMEM;
				1959
				1960	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				1961	key.type = BTRFS_DEV_ITEM_KEY;
				1962	key.offset = device->devid;
				1963
				1964	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				1965	&key, sizeof(*dev_item));
				1966	if (ret)
				1967	goto out;
				1968
				1969	leaf = path->nodes[0];
				1970	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
				1971
				1972	btrfs_set_device_id(leaf, dev_item, device->devid);
				1973	btrfs_set_device_generation(leaf, dev_item, 0);
				1974	btrfs_set_device_type(leaf, dev_item, device->type);
				1975	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
				1976	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
				1977	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
				1978	btrfs_set_device_total_bytes(leaf, dev_item,
				1979	btrfs_device_get_disk_total_bytes(device));
				1980	btrfs_set_device_bytes_used(leaf, dev_item,
				1981	btrfs_device_get_bytes_used(device));
				1982	btrfs_set_device_group(leaf, dev_item, 0);
				1983	btrfs_set_device_seek_speed(leaf, dev_item, 0);
				1984	btrfs_set_device_bandwidth(leaf, dev_item, 0);
				1985	btrfs_set_device_start_offset(leaf, dev_item, 0);
				1986
				1987	ptr = btrfs_device_uuid(dev_item);
				1988	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
				1989	ptr = btrfs_device_fsid(dev_item);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1990	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
				1991	ptr, BTRFS_FSID_SIZE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1992	btrfs_mark_buffer_dirty(leaf);
				1993
				1994	ret = 0;
				1995	out:
				1996	btrfs_free_path(path);
				1997	return ret;
				1998	}
				1999
				2000	/*
				2001	* Function to update ctime/mtime for a given device path.
				2002	* Mainly used for ctime/mtime based probe like libblkid.
				2003	*/
				2004	static void update_dev_time(const char *path_name)
				2005	{
				2006	struct file *filp;
				2007
				2008	filp = filp_open(path_name, O_RDWR, 0);
				2009	if (IS_ERR(filp))
				2010	return;
				2011	file_update_time(filp);
				2012	filp_close(filp, NULL);
				2013	}
				2014
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2015	static int btrfs_rm_dev_item(struct btrfs_device *device)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2016	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2017	struct btrfs_root *root = device->fs_info->chunk_root;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2018	int ret;
				2019	struct btrfs_path *path;
				2020	struct btrfs_key key;
				2021	struct btrfs_trans_handle *trans;
				2022
				2023	path = btrfs_alloc_path();
				2024	if (!path)
				2025	return -ENOMEM;
				2026
				2027	trans = btrfs_start_transaction(root, 0);
				2028	if (IS_ERR(trans)) {
				2029	btrfs_free_path(path);
				2030	return PTR_ERR(trans);
				2031	}
				2032	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				2033	key.type = BTRFS_DEV_ITEM_KEY;
				2034	key.offset = device->devid;
				2035
				2036	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				2037	if (ret) {
				2038	if (ret > 0)
				2039	ret = -ENOENT;
				2040	btrfs_abort_transaction(trans, ret);
				2041	btrfs_end_transaction(trans);
				2042	goto out;
				2043	}
				2044
				2045	ret = btrfs_del_item(trans, root, path);
				2046	if (ret) {
				2047	btrfs_abort_transaction(trans, ret);
				2048	btrfs_end_transaction(trans);
				2049	}
				2050
				2051	out:
				2052	btrfs_free_path(path);
				2053	if (!ret)
				2054	ret = btrfs_commit_transaction(trans);
				2055	return ret;
				2056	}
				2057
				2058	/*
				2059	* Verify that @num_devices satisfies the RAID profile constraints in the whole
				2060	* filesystem. It's up to the caller to adjust that number regarding eg. device
				2061	* replace.
				2062	*/
				2063	static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
				2064	u64 num_devices)
				2065	{
				2066	u64 all_avail;
				2067	unsigned seq;
				2068	int i;
				2069
				2070	do {
				2071	seq = read_seqbegin(&fs_info->profiles_lock);
				2072
				2073	all_avail = fs_info->avail_data_alloc_bits \|
				2074	fs_info->avail_system_alloc_bits \|
				2075	fs_info->avail_metadata_alloc_bits;
				2076	} while (read_seqretry(&fs_info->profiles_lock, seq));
				2077
				2078	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
				2079	if (!(all_avail & btrfs_raid_array[i].bg_flag))
				2080	continue;
				2081
				2082	if (num_devices < btrfs_raid_array[i].devs_min) {
				2083	int ret = btrfs_raid_array[i].mindev_error;
				2084
				2085	if (ret)
				2086	return ret;
				2087	}
				2088	}
				2089
				2090	return 0;
				2091	}
				2092
				2093	static struct btrfs_device * btrfs_find_next_active_device(
				2094	struct btrfs_fs_devices fs_devs, struct btrfs_device device)
				2095	{
				2096	struct btrfs_device *next_device;
				2097
				2098	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
				2099	if (next_device != device &&
				2100	!test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
				2101	&& next_device->bdev)
				2102	return next_device;
				2103	}
				2104
				2105	return NULL;
				2106	}
				2107
				2108	/*
				2109	* Helper function to check if the given device is part of s_bdev / latest_bdev
				2110	* and replace it with the provided or the next active device, in the context
				2111	* where this function called, there should be always be another device (or
				2112	* this_dev) which is active.
				2113	*/
				2114	void btrfs_assign_next_active_device(struct btrfs_device *device,
				2115	struct btrfs_device *this_dev)
				2116	{
				2117	struct btrfs_fs_info *fs_info = device->fs_info;
				2118	struct btrfs_device *next_device;
				2119
				2120	if (this_dev)
				2121	next_device = this_dev;
				2122	else
				2123	next_device = btrfs_find_next_active_device(fs_info->fs_devices,
				2124	device);
				2125	ASSERT(next_device);
				2126
				2127	if (fs_info->sb->s_bdev &&
				2128	(fs_info->sb->s_bdev == device->bdev))
				2129	fs_info->sb->s_bdev = next_device->bdev;
				2130
				2131	if (fs_info->fs_devices->latest_bdev == device->bdev)
				2132	fs_info->fs_devices->latest_bdev = next_device->bdev;
				2133	}
				2134
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2135	/*
				2136	* Return btrfs_fs_devices::num_devices excluding the device that's being
				2137	* currently replaced.
				2138	*/
				2139	static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
				2140	{
				2141	u64 num_devices = fs_info->fs_devices->num_devices;
				2142
				2143	down_read(&fs_info->dev_replace.rwsem);
				2144	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
				2145	ASSERT(num_devices > 1);
				2146	num_devices--;
				2147	}
				2148	up_read(&fs_info->dev_replace.rwsem);
				2149
				2150	return num_devices;
				2151	}
				2152
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2153	int btrfs_rm_device(struct btrfs_fs_info fs_info, const char device_path,
				2154	u64 devid)
				2155	{
				2156	struct btrfs_device *device;
				2157	struct btrfs_fs_devices *cur_devices;
				2158	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2159	u64 num_devices;
				2160	int ret = 0;
				2161
				2162	mutex_lock(&uuid_mutex);
				2163
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2164	num_devices = btrfs_num_devices(fs_info);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2165
				2166	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
				2167	if (ret)
				2168	goto out;
				2169
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2170	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
				2171
				2172	if (IS_ERR(device)) {
				2173	if (PTR_ERR(device) == -ENOENT &&
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2174	device_path && strcmp(device_path, "missing") == 0)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2175	ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
				2176	else
				2177	ret = PTR_ERR(device);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2178	goto out;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2179	}
				2180
				2181	if (btrfs_pinned_by_swapfile(fs_info, device)) {
				2182	btrfs_warn_in_rcu(fs_info,
				2183	"cannot remove device %s (devid %llu) due to active swapfile",
				2184	rcu_str_deref(device->name), device->devid);
				2185	ret = -ETXTBSY;
				2186	goto out;
				2187	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2188
				2189	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				2190	ret = BTRFS_ERROR_DEV_TGT_REPLACE;
				2191	goto out;
				2192	}
				2193
				2194	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
				2195	fs_info->fs_devices->rw_devices == 1) {
				2196	ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
				2197	goto out;
				2198	}
				2199
				2200	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				2201	mutex_lock(&fs_info->chunk_mutex);
				2202	list_del_init(&device->dev_alloc_list);
				2203	device->fs_devices->rw_devices--;
				2204	mutex_unlock(&fs_info->chunk_mutex);
				2205	}
				2206
				2207	mutex_unlock(&uuid_mutex);
				2208	ret = btrfs_shrink_device(device, 0);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2209	if (!ret)
				2210	btrfs_reada_remove_dev(device);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2211	mutex_lock(&uuid_mutex);
				2212	if (ret)
				2213	goto error_undo;
				2214
				2215	/*
				2216	* TODO: the superblock still includes this device in its num_devices
				2217	* counter although write_all_supers() is not locked out. This
				2218	* could give a filesystem state which requires a degraded mount.
				2219	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2220	ret = btrfs_rm_dev_item(device);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2221	if (ret)
				2222	goto error_undo;
				2223
				2224	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2225	btrfs_scrub_cancel_dev(device);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2226
				2227	/*
				2228	* the device list mutex makes sure that we don't change
				2229	* the device list while someone else is writing out all
				2230	* the device supers. Whoever is writing all supers, should
				2231	* lock the device list mutex before getting the number of
				2232	* devices in the super block (super_copy). Conversely,
				2233	* whoever updates the number of devices in the super block
				2234	* (super_copy) should hold the device list mutex.
				2235	*/
				2236
				2237	/*
				2238	* In normal cases the cur_devices == fs_devices. But in case
				2239	* of deleting a seed device, the cur_devices should point to
				2240	* its own fs_devices listed under the fs_devices->seed.
				2241	*/
				2242	cur_devices = device->fs_devices;
				2243	mutex_lock(&fs_devices->device_list_mutex);
				2244	list_del_rcu(&device->dev_list);
				2245
				2246	cur_devices->num_devices--;
				2247	cur_devices->total_devices--;
				2248	/* Update total_devices of the parent fs_devices if it's seed */
				2249	if (cur_devices != fs_devices)
				2250	fs_devices->total_devices--;
				2251
				2252	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
				2253	cur_devices->missing_devices--;
				2254
				2255	btrfs_assign_next_active_device(device, NULL);
				2256
				2257	if (device->bdev) {
				2258	cur_devices->open_devices--;
				2259	/* remove sysfs entry */
				2260	btrfs_sysfs_rm_device_link(fs_devices, device);
				2261	}
				2262
				2263	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
				2264	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
				2265	mutex_unlock(&fs_devices->device_list_mutex);
				2266
				2267	/*
				2268	* at this point, the device is zero sized and detached from
				2269	* the devices list. All that's left is to zero out the old
				2270	* supers and free the device.
				2271	*/
				2272	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				2273	btrfs_scratch_superblocks(device->bdev, device->name->str);
				2274
				2275	btrfs_close_bdev(device);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2276	synchronize_rcu();
				2277	btrfs_free_device(device);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2278
				2279	if (cur_devices->open_devices == 0) {
				2280	while (fs_devices) {
				2281	if (fs_devices->seed == cur_devices) {
				2282	fs_devices->seed = cur_devices->seed;
				2283	break;
				2284	}
				2285	fs_devices = fs_devices->seed;
				2286	}
				2287	cur_devices->seed = NULL;
				2288	close_fs_devices(cur_devices);
				2289	free_fs_devices(cur_devices);
				2290	}
				2291
				2292	out:
				2293	mutex_unlock(&uuid_mutex);
				2294	return ret;
				2295
				2296	error_undo:
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2297	btrfs_reada_undo_remove_dev(device);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2298	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				2299	mutex_lock(&fs_info->chunk_mutex);
				2300	list_add(&device->dev_alloc_list,
				2301	&fs_devices->alloc_list);
				2302	device->fs_devices->rw_devices++;
				2303	mutex_unlock(&fs_info->chunk_mutex);
				2304	}
				2305	goto out;
				2306	}
				2307
				2308	void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
				2309	{
				2310	struct btrfs_fs_devices *fs_devices;
				2311
				2312	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
				2313
				2314	/*
				2315	* in case of fs with no seed, srcdev->fs_devices will point
				2316	* to fs_devices of fs_info. However when the dev being replaced is
				2317	* a seed dev it will point to the seed's local fs_devices. In short
				2318	* srcdev will have its correct fs_devices in both the cases.
				2319	*/
				2320	fs_devices = srcdev->fs_devices;
				2321
				2322	list_del_rcu(&srcdev->dev_list);
				2323	list_del(&srcdev->dev_alloc_list);
				2324	fs_devices->num_devices--;
				2325	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
				2326	fs_devices->missing_devices--;
				2327
				2328	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
				2329	fs_devices->rw_devices--;
				2330
				2331	if (srcdev->bdev)
				2332	fs_devices->open_devices--;
				2333	}
				2334
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2335	void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2336	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2337	struct btrfs_fs_info *fs_info = srcdev->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2338	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
				2339
				2340	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
				2341	/* zero out the old super if it is writable */
				2342	btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
				2343	}
				2344
				2345	btrfs_close_bdev(srcdev);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2346	synchronize_rcu();
				2347	btrfs_free_device(srcdev);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2348
				2349	/* if this is no devs we rather delete the fs_devices */
				2350	if (!fs_devices->num_devices) {
				2351	struct btrfs_fs_devices *tmp_fs_devices;
				2352
				2353	/*
				2354	* On a mounted FS, num_devices can't be zero unless it's a
				2355	* seed. In case of a seed device being replaced, the replace
				2356	* target added to the sprout FS, so there will be no more
				2357	* device left under the seed FS.
				2358	*/
				2359	ASSERT(fs_devices->seeding);
				2360
				2361	tmp_fs_devices = fs_info->fs_devices;
				2362	while (tmp_fs_devices) {
				2363	if (tmp_fs_devices->seed == fs_devices) {
				2364	tmp_fs_devices->seed = fs_devices->seed;
				2365	break;
				2366	}
				2367	tmp_fs_devices = tmp_fs_devices->seed;
				2368	}
				2369	fs_devices->seed = NULL;
				2370	close_fs_devices(fs_devices);
				2371	free_fs_devices(fs_devices);
				2372	}
				2373	}
				2374
				2375	void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
				2376	{
				2377	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
				2378
				2379	WARN_ON(!tgtdev);
				2380	mutex_lock(&fs_devices->device_list_mutex);
				2381
				2382	btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
				2383
				2384	if (tgtdev->bdev)
				2385	fs_devices->open_devices--;
				2386
				2387	fs_devices->num_devices--;
				2388
				2389	btrfs_assign_next_active_device(tgtdev, NULL);
				2390
				2391	list_del_rcu(&tgtdev->dev_list);
				2392
				2393	mutex_unlock(&fs_devices->device_list_mutex);
				2394
				2395	/*
				2396	* The update_dev_time() with in btrfs_scratch_superblocks()
				2397	* may lead to a call to btrfs_show_devname() which will try
				2398	* to hold device_list_mutex. And here this device
				2399	* is already out of device list, so we don't have to hold
				2400	* the device_list_mutex lock.
				2401	*/
				2402	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
				2403
				2404	btrfs_close_bdev(tgtdev);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2405	synchronize_rcu();
				2406	btrfs_free_device(tgtdev);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2407	}
				2408
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2409	static struct btrfs_device *btrfs_find_device_by_path(
				2410	struct btrfs_fs_info fs_info, const char device_path)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2411	{
				2412	int ret = 0;
				2413	struct btrfs_super_block *disk_super;
				2414	u64 devid;
				2415	u8 *dev_uuid;
				2416	struct block_device *bdev;
				2417	struct buffer_head *bh;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2418	struct btrfs_device *device;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2419
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2420	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				2421	fs_info->bdev_holder, 0, &bdev, &bh);
				2422	if (ret)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2423	return ERR_PTR(ret);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2424	disk_super = (struct btrfs_super_block *)bh->b_data;
				2425	devid = btrfs_stack_device_id(&disk_super->dev_item);
				2426	dev_uuid = disk_super->dev_item.uuid;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2427	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
				2428	device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
				2429	disk_super->metadata_uuid, true);
				2430	else
				2431	device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
				2432	disk_super->fsid, true);
				2433
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2434	brelse(bh);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2435	if (!device)
				2436	device = ERR_PTR(-ENOENT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2437	blkdev_put(bdev, FMODE_READ);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2438	return device;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2439	}
				2440
				2441	/*
				2442	* Lookup a device given by device id, or the path if the id is 0.
				2443	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2444	struct btrfs_device *btrfs_find_device_by_devspec(
				2445	struct btrfs_fs_info *fs_info, u64 devid,
				2446	const char *device_path)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2447	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2448	struct btrfs_device *device;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2449
				2450	if (devid) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2451	device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
				2452	NULL, true);
				2453	if (!device)
				2454	return ERR_PTR(-ENOENT);
				2455	return device;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2456	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2457
				2458	if (!device_path \|\| !device_path[0])
				2459	return ERR_PTR(-EINVAL);
				2460
				2461	if (strcmp(device_path, "missing") == 0) {
				2462	/* Find first missing device */
				2463	list_for_each_entry(device, &fs_info->fs_devices->devices,
				2464	dev_list) {
				2465	if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				2466	&device->dev_state) && !device->bdev)
				2467	return device;
				2468	}
				2469	return ERR_PTR(-ENOENT);
				2470	}
				2471
				2472	return btrfs_find_device_by_path(fs_info, device_path);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2473	}
				2474
				2475	/*
				2476	* does all the dirty work required for changing file system's UUID.
				2477	*/
				2478	static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
				2479	{
				2480	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2481	struct btrfs_fs_devices *old_devices;
				2482	struct btrfs_fs_devices *seed_devices;
				2483	struct btrfs_super_block *disk_super = fs_info->super_copy;
				2484	struct btrfs_device *device;
				2485	u64 super_flags;
				2486
				2487	lockdep_assert_held(&uuid_mutex);
				2488	if (!fs_devices->seeding)
				2489	return -EINVAL;
				2490
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2491	seed_devices = alloc_fs_devices(NULL, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2492	if (IS_ERR(seed_devices))
				2493	return PTR_ERR(seed_devices);
				2494
				2495	old_devices = clone_fs_devices(fs_devices);
				2496	if (IS_ERR(old_devices)) {
				2497	kfree(seed_devices);
				2498	return PTR_ERR(old_devices);
				2499	}
				2500
				2501	list_add(&old_devices->fs_list, &fs_uuids);
				2502
				2503	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
				2504	seed_devices->opened = 1;
				2505	INIT_LIST_HEAD(&seed_devices->devices);
				2506	INIT_LIST_HEAD(&seed_devices->alloc_list);
				2507	mutex_init(&seed_devices->device_list_mutex);
				2508
				2509	mutex_lock(&fs_devices->device_list_mutex);
				2510	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
				2511	synchronize_rcu);
				2512	list_for_each_entry(device, &seed_devices->devices, dev_list)
				2513	device->fs_devices = seed_devices;
				2514
				2515	mutex_lock(&fs_info->chunk_mutex);
				2516	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
				2517	mutex_unlock(&fs_info->chunk_mutex);
				2518
				2519	fs_devices->seeding = 0;
				2520	fs_devices->num_devices = 0;
				2521	fs_devices->open_devices = 0;
				2522	fs_devices->missing_devices = 0;
				2523	fs_devices->rotating = 0;
				2524	fs_devices->seed = seed_devices;
				2525
				2526	generate_random_uuid(fs_devices->fsid);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2527	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2528	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
				2529	mutex_unlock(&fs_devices->device_list_mutex);
				2530
				2531	super_flags = btrfs_super_flags(disk_super) &
				2532	~BTRFS_SUPER_FLAG_SEEDING;
				2533	btrfs_set_super_flags(disk_super, super_flags);
				2534
				2535	return 0;
				2536	}
				2537
				2538	/*
				2539	* Store the expected generation for seed devices in device items.
				2540	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2541	static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2542	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2543	struct btrfs_fs_info *fs_info = trans->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2544	struct btrfs_root *root = fs_info->chunk_root;
				2545	struct btrfs_path *path;
				2546	struct extent_buffer *leaf;
				2547	struct btrfs_dev_item *dev_item;
				2548	struct btrfs_device *device;
				2549	struct btrfs_key key;
				2550	u8 fs_uuid[BTRFS_FSID_SIZE];
				2551	u8 dev_uuid[BTRFS_UUID_SIZE];
				2552	u64 devid;
				2553	int ret;
				2554
				2555	path = btrfs_alloc_path();
				2556	if (!path)
				2557	return -ENOMEM;
				2558
				2559	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				2560	key.offset = 0;
				2561	key.type = BTRFS_DEV_ITEM_KEY;
				2562
				2563	while (1) {
				2564	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
				2565	if (ret < 0)
				2566	goto error;
				2567
				2568	leaf = path->nodes[0];
				2569	next_slot:
				2570	if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				2571	ret = btrfs_next_leaf(root, path);
				2572	if (ret > 0)
				2573	break;
				2574	if (ret < 0)
				2575	goto error;
				2576	leaf = path->nodes[0];
				2577	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				2578	btrfs_release_path(path);
				2579	continue;
				2580	}
				2581
				2582	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				2583	if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID \|\|
				2584	key.type != BTRFS_DEV_ITEM_KEY)
				2585	break;
				2586
				2587	dev_item = btrfs_item_ptr(leaf, path->slots[0],
				2588	struct btrfs_dev_item);
				2589	devid = btrfs_device_id(leaf, dev_item);
				2590	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				2591	BTRFS_UUID_SIZE);
				2592	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				2593	BTRFS_FSID_SIZE);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2594	device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
				2595	fs_uuid, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2596	BUG_ON(!device); /* Logic error */
				2597
				2598	if (device->fs_devices->seeding) {
				2599	btrfs_set_device_generation(leaf, dev_item,
				2600	device->generation);
				2601	btrfs_mark_buffer_dirty(leaf);
				2602	}
				2603
				2604	path->slots[0]++;
				2605	goto next_slot;
				2606	}
				2607	ret = 0;
				2608	error:
				2609	btrfs_free_path(path);
				2610	return ret;
				2611	}
				2612
				2613	int btrfs_init_new_device(struct btrfs_fs_info fs_info, const char device_path)
				2614	{
				2615	struct btrfs_root *root = fs_info->dev_root;
				2616	struct request_queue *q;
				2617	struct btrfs_trans_handle *trans;
				2618	struct btrfs_device *device;
				2619	struct block_device *bdev;
				2620	struct super_block *sb = fs_info->sb;
				2621	struct rcu_string *name;
				2622	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2623	u64 orig_super_total_bytes;
				2624	u64 orig_super_num_devices;
				2625	int seeding_dev = 0;
				2626	int ret = 0;
				2627	bool unlocked = false;
				2628
				2629	if (sb_rdonly(sb) && !fs_devices->seeding)
				2630	return -EROFS;
				2631
				2632	bdev = blkdev_get_by_path(device_path, FMODE_WRITE \| FMODE_EXCL,
				2633	fs_info->bdev_holder);
				2634	if (IS_ERR(bdev))
				2635	return PTR_ERR(bdev);
				2636
				2637	if (fs_devices->seeding) {
				2638	seeding_dev = 1;
				2639	down_write(&sb->s_umount);
				2640	mutex_lock(&uuid_mutex);
				2641	}
				2642
				2643	filemap_write_and_wait(bdev->bd_inode->i_mapping);
				2644
				2645	mutex_lock(&fs_devices->device_list_mutex);
				2646	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				2647	if (device->bdev == bdev) {
				2648	ret = -EEXIST;
				2649	mutex_unlock(
				2650	&fs_devices->device_list_mutex);
				2651	goto error;
				2652	}
				2653	}
				2654	mutex_unlock(&fs_devices->device_list_mutex);
				2655
				2656	device = btrfs_alloc_device(fs_info, NULL, NULL);
				2657	if (IS_ERR(device)) {
				2658	/* we can safely leave the fs_devices entry around */
				2659	ret = PTR_ERR(device);
				2660	goto error;
				2661	}
				2662
				2663	name = rcu_string_strdup(device_path, GFP_KERNEL);
				2664	if (!name) {
				2665	ret = -ENOMEM;
				2666	goto error_free_device;
				2667	}
				2668	rcu_assign_pointer(device->name, name);
				2669
				2670	trans = btrfs_start_transaction(root, 0);
				2671	if (IS_ERR(trans)) {
				2672	ret = PTR_ERR(trans);
				2673	goto error_free_device;
				2674	}
				2675
				2676	q = bdev_get_queue(bdev);
				2677	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
				2678	device->generation = trans->transid;
				2679	device->io_width = fs_info->sectorsize;
				2680	device->io_align = fs_info->sectorsize;
				2681	device->sector_size = fs_info->sectorsize;
				2682	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
				2683	fs_info->sectorsize);
				2684	device->disk_total_bytes = device->total_bytes;
				2685	device->commit_total_bytes = device->total_bytes;
				2686	device->fs_info = fs_info;
				2687	device->bdev = bdev;
				2688	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
				2689	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
				2690	device->mode = FMODE_EXCL;
				2691	device->dev_stats_valid = 1;
				2692	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
				2693
				2694	if (seeding_dev) {
				2695	sb->s_flags &= ~SB_RDONLY;
				2696	ret = btrfs_prepare_sprout(fs_info);
				2697	if (ret) {
				2698	btrfs_abort_transaction(trans, ret);
				2699	goto error_trans;
				2700	}
				2701	}
				2702
				2703	device->fs_devices = fs_devices;
				2704
				2705	mutex_lock(&fs_devices->device_list_mutex);
				2706	mutex_lock(&fs_info->chunk_mutex);
				2707	list_add_rcu(&device->dev_list, &fs_devices->devices);
				2708	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
				2709	fs_devices->num_devices++;
				2710	fs_devices->open_devices++;
				2711	fs_devices->rw_devices++;
				2712	fs_devices->total_devices++;
				2713	fs_devices->total_rw_bytes += device->total_bytes;
				2714
				2715	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
				2716
				2717	if (!blk_queue_nonrot(q))
				2718	fs_devices->rotating = 1;
				2719
				2720	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
				2721	btrfs_set_super_total_bytes(fs_info->super_copy,
				2722	round_down(orig_super_total_bytes + device->total_bytes,
				2723	fs_info->sectorsize));
				2724
				2725	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
				2726	btrfs_set_super_num_devices(fs_info->super_copy,
				2727	orig_super_num_devices + 1);
				2728
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2729	/*
				2730	* we've got more storage, clear any full flags on the space
				2731	* infos
				2732	*/
				2733	btrfs_clear_space_info_full(fs_info);
				2734
				2735	mutex_unlock(&fs_info->chunk_mutex);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2736
				2737	/* Add sysfs device entry */
				2738	btrfs_sysfs_add_device_link(fs_devices, device);
				2739
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2740	mutex_unlock(&fs_devices->device_list_mutex);
				2741
				2742	if (seeding_dev) {
				2743	mutex_lock(&fs_info->chunk_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2744	ret = init_first_rw_device(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2745	mutex_unlock(&fs_info->chunk_mutex);
				2746	if (ret) {
				2747	btrfs_abort_transaction(trans, ret);
				2748	goto error_sysfs;
				2749	}
				2750	}
				2751
				2752	ret = btrfs_add_dev_item(trans, device);
				2753	if (ret) {
				2754	btrfs_abort_transaction(trans, ret);
				2755	goto error_sysfs;
				2756	}
				2757
				2758	if (seeding_dev) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2759	ret = btrfs_finish_sprout(trans);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2760	if (ret) {
				2761	btrfs_abort_transaction(trans, ret);
				2762	goto error_sysfs;
				2763	}
				2764
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2765	btrfs_sysfs_update_sprout_fsid(fs_devices,
				2766	fs_info->fs_devices->fsid);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2767	}
				2768
				2769	ret = btrfs_commit_transaction(trans);
				2770
				2771	if (seeding_dev) {
				2772	mutex_unlock(&uuid_mutex);
				2773	up_write(&sb->s_umount);
				2774	unlocked = true;
				2775
				2776	if (ret) /* transaction commit */
				2777	return ret;
				2778
				2779	ret = btrfs_relocate_sys_chunks(fs_info);
				2780	if (ret < 0)
				2781	btrfs_handle_fs_error(fs_info, ret,
				2782	"Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
				2783	trans = btrfs_attach_transaction(root);
				2784	if (IS_ERR(trans)) {
				2785	if (PTR_ERR(trans) == -ENOENT)
				2786	return 0;
				2787	ret = PTR_ERR(trans);
				2788	trans = NULL;
				2789	goto error_sysfs;
				2790	}
				2791	ret = btrfs_commit_transaction(trans);
				2792	}
				2793
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2794	/*
				2795	* Now that we have written a new super block to this device, check all
				2796	* other fs_devices list if device_path alienates any other scanned
				2797	* device.
				2798	* We can ignore the return value as it typically returns -EINVAL and
				2799	* only succeeds if the device was an alien.
				2800	*/
				2801	btrfs_forget_devices(device_path);
				2802
				2803	/* Update ctime/mtime for blkid or udev */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2804	update_dev_time(device_path);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2805
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2806	return ret;
				2807
				2808	error_sysfs:
				2809	btrfs_sysfs_rm_device_link(fs_devices, device);
				2810	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				2811	mutex_lock(&fs_info->chunk_mutex);
				2812	list_del_rcu(&device->dev_list);
				2813	list_del(&device->dev_alloc_list);
				2814	fs_info->fs_devices->num_devices--;
				2815	fs_info->fs_devices->open_devices--;
				2816	fs_info->fs_devices->rw_devices--;
				2817	fs_info->fs_devices->total_devices--;
				2818	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
				2819	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
				2820	btrfs_set_super_total_bytes(fs_info->super_copy,
				2821	orig_super_total_bytes);
				2822	btrfs_set_super_num_devices(fs_info->super_copy,
				2823	orig_super_num_devices);
				2824	mutex_unlock(&fs_info->chunk_mutex);
				2825	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				2826	error_trans:
				2827	if (seeding_dev)
				2828	sb->s_flags \|= SB_RDONLY;
				2829	if (trans)
				2830	btrfs_end_transaction(trans);
				2831	error_free_device:
				2832	btrfs_free_device(device);
				2833	error:
				2834	blkdev_put(bdev, FMODE_EXCL);
				2835	if (seeding_dev && !unlocked) {
				2836	mutex_unlock(&uuid_mutex);
				2837	up_write(&sb->s_umount);
				2838	}
				2839	return ret;
				2840	}
				2841
				2842	static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
				2843	struct btrfs_device *device)
				2844	{
				2845	int ret;
				2846	struct btrfs_path *path;
				2847	struct btrfs_root *root = device->fs_info->chunk_root;
				2848	struct btrfs_dev_item *dev_item;
				2849	struct extent_buffer *leaf;
				2850	struct btrfs_key key;
				2851
				2852	path = btrfs_alloc_path();
				2853	if (!path)
				2854	return -ENOMEM;
				2855
				2856	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				2857	key.type = BTRFS_DEV_ITEM_KEY;
				2858	key.offset = device->devid;
				2859
				2860	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
				2861	if (ret < 0)
				2862	goto out;
				2863
				2864	if (ret > 0) {
				2865	ret = -ENOENT;
				2866	goto out;
				2867	}
				2868
				2869	leaf = path->nodes[0];
				2870	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
				2871
				2872	btrfs_set_device_id(leaf, dev_item, device->devid);
				2873	btrfs_set_device_type(leaf, dev_item, device->type);
				2874	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
				2875	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
				2876	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
				2877	btrfs_set_device_total_bytes(leaf, dev_item,
				2878	btrfs_device_get_disk_total_bytes(device));
				2879	btrfs_set_device_bytes_used(leaf, dev_item,
				2880	btrfs_device_get_bytes_used(device));
				2881	btrfs_mark_buffer_dirty(leaf);
				2882
				2883	out:
				2884	btrfs_free_path(path);
				2885	return ret;
				2886	}
				2887
				2888	int btrfs_grow_device(struct btrfs_trans_handle *trans,
				2889	struct btrfs_device *device, u64 new_size)
				2890	{
				2891	struct btrfs_fs_info *fs_info = device->fs_info;
				2892	struct btrfs_super_block *super_copy = fs_info->super_copy;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2893	u64 old_total;
				2894	u64 diff;
				2895
				2896	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				2897	return -EACCES;
				2898
				2899	new_size = round_down(new_size, fs_info->sectorsize);
				2900
				2901	mutex_lock(&fs_info->chunk_mutex);
				2902	old_total = btrfs_super_total_bytes(super_copy);
				2903	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
				2904
				2905	if (new_size <= device->total_bytes \|\|
				2906	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				2907	mutex_unlock(&fs_info->chunk_mutex);
				2908	return -EINVAL;
				2909	}
				2910
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2911	btrfs_set_super_total_bytes(super_copy,
				2912	round_down(old_total + diff, fs_info->sectorsize));
				2913	device->fs_devices->total_rw_bytes += diff;
				2914
				2915	btrfs_device_set_total_bytes(device, new_size);
				2916	btrfs_device_set_disk_total_bytes(device, new_size);
				2917	btrfs_clear_space_info_full(device->fs_info);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2918	if (list_empty(&device->post_commit_list))
				2919	list_add_tail(&device->post_commit_list,
				2920	&trans->transaction->dev_update_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2921	mutex_unlock(&fs_info->chunk_mutex);
				2922
				2923	return btrfs_update_device(trans, device);
				2924	}
				2925
				2926	static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
				2927	{
				2928	struct btrfs_fs_info *fs_info = trans->fs_info;
				2929	struct btrfs_root *root = fs_info->chunk_root;
				2930	int ret;
				2931	struct btrfs_path *path;
				2932	struct btrfs_key key;
				2933
				2934	path = btrfs_alloc_path();
				2935	if (!path)
				2936	return -ENOMEM;
				2937
				2938	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				2939	key.offset = chunk_offset;
				2940	key.type = BTRFS_CHUNK_ITEM_KEY;
				2941
				2942	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				2943	if (ret < 0)
				2944	goto out;
				2945	else if (ret > 0) { /* Logic error or corruption */
				2946	btrfs_handle_fs_error(fs_info, -ENOENT,
				2947	"Failed lookup while freeing chunk.");
				2948	ret = -ENOENT;
				2949	goto out;
				2950	}
				2951
				2952	ret = btrfs_del_item(trans, root, path);
				2953	if (ret < 0)
				2954	btrfs_handle_fs_error(fs_info, ret,
				2955	"Failed to delete chunk item.");
				2956	out:
				2957	btrfs_free_path(path);
				2958	return ret;
				2959	}
				2960
				2961	static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
				2962	{
				2963	struct btrfs_super_block *super_copy = fs_info->super_copy;
				2964	struct btrfs_disk_key *disk_key;
				2965	struct btrfs_chunk *chunk;
				2966	u8 *ptr;
				2967	int ret = 0;
				2968	u32 num_stripes;
				2969	u32 array_size;
				2970	u32 len = 0;
				2971	u32 cur;
				2972	struct btrfs_key key;
				2973
				2974	mutex_lock(&fs_info->chunk_mutex);
				2975	array_size = btrfs_super_sys_array_size(super_copy);
				2976
				2977	ptr = super_copy->sys_chunk_array;
				2978	cur = 0;
				2979
				2980	while (cur < array_size) {
				2981	disk_key = (struct btrfs_disk_key *)ptr;
				2982	btrfs_disk_key_to_cpu(&key, disk_key);
				2983
				2984	len = sizeof(*disk_key);
				2985
				2986	if (key.type == BTRFS_CHUNK_ITEM_KEY) {
				2987	chunk = (struct btrfs_chunk *)(ptr + len);
				2988	num_stripes = btrfs_stack_chunk_num_stripes(chunk);
				2989	len += btrfs_chunk_item_size(num_stripes);
				2990	} else {
				2991	ret = -EIO;
				2992	break;
				2993	}
				2994	if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
				2995	key.offset == chunk_offset) {
				2996	memmove(ptr, ptr + len, array_size - (cur + len));
				2997	array_size -= len;
				2998	btrfs_set_super_sys_array_size(super_copy, array_size);
				2999	} else {
				3000	ptr += len;
				3001	cur += len;
				3002	}
				3003	}
				3004	mutex_unlock(&fs_info->chunk_mutex);
				3005	return ret;
				3006	}
				3007
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3008	/*
				3009	* btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
				3010	* @logical: Logical block offset in bytes.
				3011	* @length: Length of extent in bytes.
				3012	*
				3013	* Return: Chunk mapping or ERR_PTR.
				3014	*/
				3015	struct extent_map btrfs_get_chunk_map(struct btrfs_fs_info fs_info,
				3016	u64 logical, u64 length)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3017	{
				3018	struct extent_map_tree *em_tree;
				3019	struct extent_map *em;
				3020
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3021	em_tree = &fs_info->mapping_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3022	read_lock(&em_tree->lock);
				3023	em = lookup_extent_mapping(em_tree, logical, length);
				3024	read_unlock(&em_tree->lock);
				3025
				3026	if (!em) {
				3027	btrfs_crit(fs_info, "unable to find logical %llu length %llu",
				3028	logical, length);
				3029	return ERR_PTR(-EINVAL);
				3030	}
				3031
				3032	if (em->start > logical \|\| em->start + em->len < logical) {
				3033	btrfs_crit(fs_info,
				3034	"found a bad mapping, wanted %llu-%llu, found %llu-%llu",
				3035	logical, length, em->start, em->start + em->len);
				3036	free_extent_map(em);
				3037	return ERR_PTR(-EINVAL);
				3038	}
				3039
				3040	/* callers are responsible for dropping em's ref. */
				3041	return em;
				3042	}
				3043
				3044	int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
				3045	{
				3046	struct btrfs_fs_info *fs_info = trans->fs_info;
				3047	struct extent_map *em;
				3048	struct map_lookup *map;
				3049	u64 dev_extent_len = 0;
				3050	int i, ret = 0;
				3051	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				3052
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3053	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3054	if (IS_ERR(em)) {
				3055	/*
				3056	* This is a logic error, but we don't want to just rely on the
				3057	* user having built with ASSERT enabled, so if ASSERT doesn't
				3058	* do anything we still error out.
				3059	*/
				3060	ASSERT(0);
				3061	return PTR_ERR(em);
				3062	}
				3063	map = em->map_lookup;
				3064	mutex_lock(&fs_info->chunk_mutex);
				3065	check_system_chunk(trans, map->type);
				3066	mutex_unlock(&fs_info->chunk_mutex);
				3067
				3068	/*
				3069	* Take the device list mutex to prevent races with the final phase of
				3070	* a device replace operation that replaces the device object associated
				3071	* with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
				3072	*/
				3073	mutex_lock(&fs_devices->device_list_mutex);
				3074	for (i = 0; i < map->num_stripes; i++) {
				3075	struct btrfs_device *device = map->stripes[i].dev;
				3076	ret = btrfs_free_dev_extent(trans, device,
				3077	map->stripes[i].physical,
				3078	&dev_extent_len);
				3079	if (ret) {
				3080	mutex_unlock(&fs_devices->device_list_mutex);
				3081	btrfs_abort_transaction(trans, ret);
				3082	goto out;
				3083	}
				3084
				3085	if (device->bytes_used > 0) {
				3086	mutex_lock(&fs_info->chunk_mutex);
				3087	btrfs_device_set_bytes_used(device,
				3088	device->bytes_used - dev_extent_len);
				3089	atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
				3090	btrfs_clear_space_info_full(fs_info);
				3091	mutex_unlock(&fs_info->chunk_mutex);
				3092	}
				3093
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3094	ret = btrfs_update_device(trans, device);
				3095	if (ret) {
				3096	mutex_unlock(&fs_devices->device_list_mutex);
				3097	btrfs_abort_transaction(trans, ret);
				3098	goto out;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3099	}
				3100	}
				3101	mutex_unlock(&fs_devices->device_list_mutex);
				3102
				3103	ret = btrfs_free_chunk(trans, chunk_offset);
				3104	if (ret) {
				3105	btrfs_abort_transaction(trans, ret);
				3106	goto out;
				3107	}
				3108
				3109	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
				3110
				3111	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
				3112	ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
				3113	if (ret) {
				3114	btrfs_abort_transaction(trans, ret);
				3115	goto out;
				3116	}
				3117	}
				3118
				3119	ret = btrfs_remove_block_group(trans, chunk_offset, em);
				3120	if (ret) {
				3121	btrfs_abort_transaction(trans, ret);
				3122	goto out;
				3123	}
				3124
				3125	out:
				3126	/* once for us */
				3127	free_extent_map(em);
				3128	return ret;
				3129	}
				3130
				3131	static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
				3132	{
				3133	struct btrfs_root *root = fs_info->chunk_root;
				3134	struct btrfs_trans_handle *trans;
				3135	int ret;
				3136
				3137	/*
				3138	* Prevent races with automatic removal of unused block groups.
				3139	* After we relocate and before we remove the chunk with offset
				3140	* chunk_offset, automatic removal of the block group can kick in,
				3141	* resulting in a failure when calling btrfs_remove_chunk() below.
				3142	*
				3143	* Make sure to acquire this mutex before doing a tree search (dev
				3144	* or chunk trees) to find chunks. Otherwise the cleaner kthread might
				3145	* call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
				3146	* we release the path used to search the chunk/dev tree and before
				3147	* the current task acquires this mutex and calls us.
				3148	*/
				3149	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
				3150
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3151	/* step one, relocate all the extents inside this chunk */
				3152	btrfs_scrub_pause(fs_info);
				3153	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
				3154	btrfs_scrub_continue(fs_info);
				3155	if (ret)
				3156	return ret;
				3157
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3158	trans = btrfs_start_trans_remove_block_group(root->fs_info,
				3159	chunk_offset);
				3160	if (IS_ERR(trans)) {
				3161	ret = PTR_ERR(trans);
				3162	btrfs_handle_fs_error(root->fs_info, ret, NULL);
				3163	return ret;
				3164	}
				3165
				3166	/*
				3167	* step two, delete the device extents and the
				3168	* chunk tree entries
				3169	*/
				3170	ret = btrfs_remove_chunk(trans, chunk_offset);
				3171	btrfs_end_transaction(trans);
				3172	return ret;
				3173	}
				3174
				3175	static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
				3176	{
				3177	struct btrfs_root *chunk_root = fs_info->chunk_root;
				3178	struct btrfs_path *path;
				3179	struct extent_buffer *leaf;
				3180	struct btrfs_chunk *chunk;
				3181	struct btrfs_key key;
				3182	struct btrfs_key found_key;
				3183	u64 chunk_type;
				3184	bool retried = false;
				3185	int failed = 0;
				3186	int ret;
				3187
				3188	path = btrfs_alloc_path();
				3189	if (!path)
				3190	return -ENOMEM;
				3191
				3192	again:
				3193	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				3194	key.offset = (u64)-1;
				3195	key.type = BTRFS_CHUNK_ITEM_KEY;
				3196
				3197	while (1) {
				3198	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				3199	ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
				3200	if (ret < 0) {
				3201	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3202	goto error;
				3203	}
				3204	BUG_ON(ret == 0); /* Corruption */
				3205
				3206	ret = btrfs_previous_item(chunk_root, path, key.objectid,
				3207	key.type);
				3208	if (ret)
				3209	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3210	if (ret < 0)
				3211	goto error;
				3212	if (ret > 0)
				3213	break;
				3214
				3215	leaf = path->nodes[0];
				3216	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
				3217
				3218	chunk = btrfs_item_ptr(leaf, path->slots[0],
				3219	struct btrfs_chunk);
				3220	chunk_type = btrfs_chunk_type(leaf, chunk);
				3221	btrfs_release_path(path);
				3222
				3223	if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
				3224	ret = btrfs_relocate_chunk(fs_info, found_key.offset);
				3225	if (ret == -ENOSPC)
				3226	failed++;
				3227	else
				3228	BUG_ON(ret);
				3229	}
				3230	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3231
				3232	if (found_key.offset == 0)
				3233	break;
				3234	key.offset = found_key.offset - 1;
				3235	}
				3236	ret = 0;
				3237	if (failed && !retried) {
				3238	failed = 0;
				3239	retried = true;
				3240	goto again;
				3241	} else if (WARN_ON(failed && retried)) {
				3242	ret = -ENOSPC;
				3243	}
				3244	error:
				3245	btrfs_free_path(path);
				3246	return ret;
				3247	}
				3248
				3249	/*
				3250	* return 1 : allocate a data chunk successfully,
				3251	* return <0: errors during allocating a data chunk,
				3252	* return 0 : no need to allocate a data chunk.
				3253	*/
				3254	static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				3255	u64 chunk_offset)
				3256	{
				3257	struct btrfs_block_group_cache *cache;
				3258	u64 bytes_used;
				3259	u64 chunk_type;
				3260
				3261	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				3262	ASSERT(cache);
				3263	chunk_type = cache->flags;
				3264	btrfs_put_block_group(cache);
				3265
				3266	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
				3267	spin_lock(&fs_info->data_sinfo->lock);
				3268	bytes_used = fs_info->data_sinfo->bytes_used;
				3269	spin_unlock(&fs_info->data_sinfo->lock);
				3270
				3271	if (!bytes_used) {
				3272	struct btrfs_trans_handle *trans;
				3273	int ret;
				3274
				3275	trans = btrfs_join_transaction(fs_info->tree_root);
				3276	if (IS_ERR(trans))
				3277	return PTR_ERR(trans);
				3278
				3279	ret = btrfs_force_chunk_alloc(trans,
				3280	BTRFS_BLOCK_GROUP_DATA);
				3281	btrfs_end_transaction(trans);
				3282	if (ret < 0)
				3283	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3284	return 1;
				3285	}
				3286	}
				3287	return 0;
				3288	}
				3289
				3290	static int insert_balance_item(struct btrfs_fs_info *fs_info,
				3291	struct btrfs_balance_control *bctl)
				3292	{
				3293	struct btrfs_root *root = fs_info->tree_root;
				3294	struct btrfs_trans_handle *trans;
				3295	struct btrfs_balance_item *item;
				3296	struct btrfs_disk_balance_args disk_bargs;
				3297	struct btrfs_path *path;
				3298	struct extent_buffer *leaf;
				3299	struct btrfs_key key;
				3300	int ret, err;
				3301
				3302	path = btrfs_alloc_path();
				3303	if (!path)
				3304	return -ENOMEM;
				3305
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3306	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3307	if (IS_ERR(trans)) {
				3308	btrfs_free_path(path);
				3309	return PTR_ERR(trans);
				3310	}
				3311
				3312	key.objectid = BTRFS_BALANCE_OBJECTID;
				3313	key.type = BTRFS_TEMPORARY_ITEM_KEY;
				3314	key.offset = 0;
				3315
				3316	ret = btrfs_insert_empty_item(trans, root, path, &key,
				3317	sizeof(*item));
				3318	if (ret)
				3319	goto out;
				3320
				3321	leaf = path->nodes[0];
				3322	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
				3323
				3324	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
				3325
				3326	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
				3327	btrfs_set_balance_data(leaf, item, &disk_bargs);
				3328	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
				3329	btrfs_set_balance_meta(leaf, item, &disk_bargs);
				3330	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
				3331	btrfs_set_balance_sys(leaf, item, &disk_bargs);
				3332
				3333	btrfs_set_balance_flags(leaf, item, bctl->flags);
				3334
				3335	btrfs_mark_buffer_dirty(leaf);
				3336	out:
				3337	btrfs_free_path(path);
				3338	err = btrfs_commit_transaction(trans);
				3339	if (err && !ret)
				3340	ret = err;
				3341	return ret;
				3342	}
				3343
				3344	static int del_balance_item(struct btrfs_fs_info *fs_info)
				3345	{
				3346	struct btrfs_root *root = fs_info->tree_root;
				3347	struct btrfs_trans_handle *trans;
				3348	struct btrfs_path *path;
				3349	struct btrfs_key key;
				3350	int ret, err;
				3351
				3352	path = btrfs_alloc_path();
				3353	if (!path)
				3354	return -ENOMEM;
				3355
				3356	trans = btrfs_start_transaction(root, 0);
				3357	if (IS_ERR(trans)) {
				3358	btrfs_free_path(path);
				3359	return PTR_ERR(trans);
				3360	}
				3361
				3362	key.objectid = BTRFS_BALANCE_OBJECTID;
				3363	key.type = BTRFS_TEMPORARY_ITEM_KEY;
				3364	key.offset = 0;
				3365
				3366	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				3367	if (ret < 0)
				3368	goto out;
				3369	if (ret > 0) {
				3370	ret = -ENOENT;
				3371	goto out;
				3372	}
				3373
				3374	ret = btrfs_del_item(trans, root, path);
				3375	out:
				3376	btrfs_free_path(path);
				3377	err = btrfs_commit_transaction(trans);
				3378	if (err && !ret)
				3379	ret = err;
				3380	return ret;
				3381	}
				3382
				3383	/*
				3384	* This is a heuristic used to reduce the number of chunks balanced on
				3385	* resume after balance was interrupted.
				3386	*/
				3387	static void update_balance_args(struct btrfs_balance_control *bctl)
				3388	{
				3389	/*
				3390	* Turn on soft mode for chunk types that were being converted.
				3391	*/
				3392	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
				3393	bctl->data.flags \|= BTRFS_BALANCE_ARGS_SOFT;
				3394	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
				3395	bctl->sys.flags \|= BTRFS_BALANCE_ARGS_SOFT;
				3396	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
				3397	bctl->meta.flags \|= BTRFS_BALANCE_ARGS_SOFT;
				3398
				3399	/*
				3400	* Turn on usage filter if is not already used. The idea is
				3401	* that chunks that we have already balanced should be
				3402	* reasonably full. Don't do it for chunks that are being
				3403	* converted - that will keep us from relocating unconverted
				3404	* (albeit full) chunks.
				3405	*/
				3406	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3407	!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3408	!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
				3409	bctl->data.flags \|= BTRFS_BALANCE_ARGS_USAGE;
				3410	bctl->data.usage = 90;
				3411	}
				3412	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3413	!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3414	!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
				3415	bctl->sys.flags \|= BTRFS_BALANCE_ARGS_USAGE;
				3416	bctl->sys.usage = 90;
				3417	}
				3418	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3419	!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3420	!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
				3421	bctl->meta.flags \|= BTRFS_BALANCE_ARGS_USAGE;
				3422	bctl->meta.usage = 90;
				3423	}
				3424	}
				3425
				3426	/*
				3427	* Clear the balance status in fs_info and delete the balance item from disk.
				3428	*/
				3429	static void reset_balance_state(struct btrfs_fs_info *fs_info)
				3430	{
				3431	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3432	int ret;
				3433
				3434	BUG_ON(!fs_info->balance_ctl);
				3435
				3436	spin_lock(&fs_info->balance_lock);
				3437	fs_info->balance_ctl = NULL;
				3438	spin_unlock(&fs_info->balance_lock);
				3439
				3440	kfree(bctl);
				3441	ret = del_balance_item(fs_info);
				3442	if (ret)
				3443	btrfs_handle_fs_error(fs_info, ret, NULL);
				3444	}
				3445
				3446	/*
				3447	* Balance filters. Return 1 if chunk should be filtered out
				3448	* (should not be balanced).
				3449	*/
				3450	static int chunk_profiles_filter(u64 chunk_type,
				3451	struct btrfs_balance_args *bargs)
				3452	{
				3453	chunk_type = chunk_to_extended(chunk_type) &
				3454	BTRFS_EXTENDED_PROFILE_MASK;
				3455
				3456	if (bargs->profiles & chunk_type)
				3457	return 0;
				3458
				3459	return 1;
				3460	}
				3461
				3462	static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
				3463	struct btrfs_balance_args *bargs)
				3464	{
				3465	struct btrfs_block_group_cache *cache;
				3466	u64 chunk_used;
				3467	u64 user_thresh_min;
				3468	u64 user_thresh_max;
				3469	int ret = 1;
				3470
				3471	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				3472	chunk_used = btrfs_block_group_used(&cache->item);
				3473
				3474	if (bargs->usage_min == 0)
				3475	user_thresh_min = 0;
				3476	else
				3477	user_thresh_min = div_factor_fine(cache->key.offset,
				3478	bargs->usage_min);
				3479
				3480	if (bargs->usage_max == 0)
				3481	user_thresh_max = 1;
				3482	else if (bargs->usage_max > 100)
				3483	user_thresh_max = cache->key.offset;
				3484	else
				3485	user_thresh_max = div_factor_fine(cache->key.offset,
				3486	bargs->usage_max);
				3487
				3488	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
				3489	ret = 0;
				3490
				3491	btrfs_put_block_group(cache);
				3492	return ret;
				3493	}
				3494
				3495	static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
				3496	u64 chunk_offset, struct btrfs_balance_args *bargs)
				3497	{
				3498	struct btrfs_block_group_cache *cache;
				3499	u64 chunk_used, user_thresh;
				3500	int ret = 1;
				3501
				3502	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				3503	chunk_used = btrfs_block_group_used(&cache->item);
				3504
				3505	if (bargs->usage_min == 0)
				3506	user_thresh = 1;
				3507	else if (bargs->usage > 100)
				3508	user_thresh = cache->key.offset;
				3509	else
				3510	user_thresh = div_factor_fine(cache->key.offset,
				3511	bargs->usage);
				3512
				3513	if (chunk_used < user_thresh)
				3514	ret = 0;
				3515
				3516	btrfs_put_block_group(cache);
				3517	return ret;
				3518	}
				3519
				3520	static int chunk_devid_filter(struct extent_buffer *leaf,
				3521	struct btrfs_chunk *chunk,
				3522	struct btrfs_balance_args *bargs)
				3523	{
				3524	struct btrfs_stripe *stripe;
				3525	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				3526	int i;
				3527
				3528	for (i = 0; i < num_stripes; i++) {
				3529	stripe = btrfs_stripe_nr(chunk, i);
				3530	if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
				3531	return 0;
				3532	}
				3533
				3534	return 1;
				3535	}
				3536
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3537	static u64 calc_data_stripes(u64 type, int num_stripes)
				3538	{
				3539	const int index = btrfs_bg_flags_to_raid_index(type);
				3540	const int ncopies = btrfs_raid_array[index].ncopies;
				3541	const int nparity = btrfs_raid_array[index].nparity;
				3542
				3543	if (nparity)
				3544	return num_stripes - nparity;
				3545	else
				3546	return num_stripes / ncopies;
				3547	}
				3548
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3549	/* [pstart, pend) */
				3550	static int chunk_drange_filter(struct extent_buffer *leaf,
				3551	struct btrfs_chunk *chunk,
				3552	struct btrfs_balance_args *bargs)
				3553	{
				3554	struct btrfs_stripe *stripe;
				3555	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				3556	u64 stripe_offset;
				3557	u64 stripe_length;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3558	u64 type;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3559	int factor;
				3560	int i;
				3561
				3562	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
				3563	return 0;
				3564
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3565	type = btrfs_chunk_type(leaf, chunk);
				3566	factor = calc_data_stripes(type, num_stripes);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3567
				3568	for (i = 0; i < num_stripes; i++) {
				3569	stripe = btrfs_stripe_nr(chunk, i);
				3570	if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
				3571	continue;
				3572
				3573	stripe_offset = btrfs_stripe_offset(leaf, stripe);
				3574	stripe_length = btrfs_chunk_length(leaf, chunk);
				3575	stripe_length = div_u64(stripe_length, factor);
				3576
				3577	if (stripe_offset < bargs->pend &&
				3578	stripe_offset + stripe_length > bargs->pstart)
				3579	return 0;
				3580	}
				3581
				3582	return 1;
				3583	}
				3584
				3585	/* [vstart, vend) */
				3586	static int chunk_vrange_filter(struct extent_buffer *leaf,
				3587	struct btrfs_chunk *chunk,
				3588	u64 chunk_offset,
				3589	struct btrfs_balance_args *bargs)
				3590	{
				3591	if (chunk_offset < bargs->vend &&
				3592	chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
				3593	/* at least part of the chunk is inside this vrange */
				3594	return 0;
				3595
				3596	return 1;
				3597	}
				3598
				3599	static int chunk_stripes_range_filter(struct extent_buffer *leaf,
				3600	struct btrfs_chunk *chunk,
				3601	struct btrfs_balance_args *bargs)
				3602	{
				3603	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				3604
				3605	if (bargs->stripes_min <= num_stripes
				3606	&& num_stripes <= bargs->stripes_max)
				3607	return 0;
				3608
				3609	return 1;
				3610	}
				3611
				3612	static int chunk_soft_convert_filter(u64 chunk_type,
				3613	struct btrfs_balance_args *bargs)
				3614	{
				3615	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
				3616	return 0;
				3617
				3618	chunk_type = chunk_to_extended(chunk_type) &
				3619	BTRFS_EXTENDED_PROFILE_MASK;
				3620
				3621	if (bargs->target == chunk_type)
				3622	return 1;
				3623
				3624	return 0;
				3625	}
				3626
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3627	static int should_balance_chunk(struct extent_buffer *leaf,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3628	struct btrfs_chunk *chunk, u64 chunk_offset)
				3629	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3630	struct btrfs_fs_info *fs_info = leaf->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3631	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3632	struct btrfs_balance_args *bargs = NULL;
				3633	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
				3634
				3635	/* type filter */
				3636	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
				3637	(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
				3638	return 0;
				3639	}
				3640
				3641	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				3642	bargs = &bctl->data;
				3643	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				3644	bargs = &bctl->sys;
				3645	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				3646	bargs = &bctl->meta;
				3647
				3648	/* profiles filter */
				3649	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
				3650	chunk_profiles_filter(chunk_type, bargs)) {
				3651	return 0;
				3652	}
				3653
				3654	/* usage filter */
				3655	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3656	chunk_usage_filter(fs_info, chunk_offset, bargs)) {
				3657	return 0;
				3658	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3659	chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
				3660	return 0;
				3661	}
				3662
				3663	/* devid filter */
				3664	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
				3665	chunk_devid_filter(leaf, chunk, bargs)) {
				3666	return 0;
				3667	}
				3668
				3669	/* drange filter, makes sense only with devid filter */
				3670	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
				3671	chunk_drange_filter(leaf, chunk, bargs)) {
				3672	return 0;
				3673	}
				3674
				3675	/* vrange filter */
				3676	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
				3677	chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
				3678	return 0;
				3679	}
				3680
				3681	/* stripes filter */
				3682	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
				3683	chunk_stripes_range_filter(leaf, chunk, bargs)) {
				3684	return 0;
				3685	}
				3686
				3687	/* soft profile changing mode */
				3688	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
				3689	chunk_soft_convert_filter(chunk_type, bargs)) {
				3690	return 0;
				3691	}
				3692
				3693	/*
				3694	* limited by count, must be the last filter
				3695	*/
				3696	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
				3697	if (bargs->limit == 0)
				3698	return 0;
				3699	else
				3700	bargs->limit--;
				3701	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
				3702	/*
				3703	* Same logic as the 'limit' filter; the minimum cannot be
				3704	* determined here because we do not have the global information
				3705	* about the count of all chunks that satisfy the filters.
				3706	*/
				3707	if (bargs->limit_max == 0)
				3708	return 0;
				3709	else
				3710	bargs->limit_max--;
				3711	}
				3712
				3713	return 1;
				3714	}
				3715
				3716	static int __btrfs_balance(struct btrfs_fs_info *fs_info)
				3717	{
				3718	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3719	struct btrfs_root *chunk_root = fs_info->chunk_root;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3720	u64 chunk_type;
				3721	struct btrfs_chunk *chunk;
				3722	struct btrfs_path *path = NULL;
				3723	struct btrfs_key key;
				3724	struct btrfs_key found_key;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3725	struct extent_buffer *leaf;
				3726	int slot;
				3727	int ret;
				3728	int enospc_errors = 0;
				3729	bool counting = true;
				3730	/* The single value limit and min/max limits use the same bytes in the */
				3731	u64 limit_data = bctl->data.limit;
				3732	u64 limit_meta = bctl->meta.limit;
				3733	u64 limit_sys = bctl->sys.limit;
				3734	u32 count_data = 0;
				3735	u32 count_meta = 0;
				3736	u32 count_sys = 0;
				3737	int chunk_reserved = 0;
				3738
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3739	path = btrfs_alloc_path();
				3740	if (!path) {
				3741	ret = -ENOMEM;
				3742	goto error;
				3743	}
				3744
				3745	/* zero out stat counters */
				3746	spin_lock(&fs_info->balance_lock);
				3747	memset(&bctl->stat, 0, sizeof(bctl->stat));
				3748	spin_unlock(&fs_info->balance_lock);
				3749	again:
				3750	if (!counting) {
				3751	/*
				3752	* The single value limit and min/max limits use the same bytes
				3753	* in the
				3754	*/
				3755	bctl->data.limit = limit_data;
				3756	bctl->meta.limit = limit_meta;
				3757	bctl->sys.limit = limit_sys;
				3758	}
				3759	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				3760	key.offset = (u64)-1;
				3761	key.type = BTRFS_CHUNK_ITEM_KEY;
				3762
				3763	while (1) {
				3764	if ((!counting && atomic_read(&fs_info->balance_pause_req)) \|\|
				3765	atomic_read(&fs_info->balance_cancel_req)) {
				3766	ret = -ECANCELED;
				3767	goto error;
				3768	}
				3769
				3770	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				3771	ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
				3772	if (ret < 0) {
				3773	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3774	goto error;
				3775	}
				3776
				3777	/*
				3778	* this shouldn't happen, it means the last relocate
				3779	* failed
				3780	*/
				3781	if (ret == 0)
				3782	BUG(); /* FIXME break ? */
				3783
				3784	ret = btrfs_previous_item(chunk_root, path, 0,
				3785	BTRFS_CHUNK_ITEM_KEY);
				3786	if (ret) {
				3787	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3788	ret = 0;
				3789	break;
				3790	}
				3791
				3792	leaf = path->nodes[0];
				3793	slot = path->slots[0];
				3794	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				3795
				3796	if (found_key.objectid != key.objectid) {
				3797	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3798	break;
				3799	}
				3800
				3801	chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
				3802	chunk_type = btrfs_chunk_type(leaf, chunk);
				3803
				3804	if (!counting) {
				3805	spin_lock(&fs_info->balance_lock);
				3806	bctl->stat.considered++;
				3807	spin_unlock(&fs_info->balance_lock);
				3808	}
				3809
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3810	ret = should_balance_chunk(leaf, chunk, found_key.offset);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3811
				3812	btrfs_release_path(path);
				3813	if (!ret) {
				3814	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3815	goto loop;
				3816	}
				3817
				3818	if (counting) {
				3819	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3820	spin_lock(&fs_info->balance_lock);
				3821	bctl->stat.expected++;
				3822	spin_unlock(&fs_info->balance_lock);
				3823
				3824	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				3825	count_data++;
				3826	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				3827	count_sys++;
				3828	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				3829	count_meta++;
				3830
				3831	goto loop;
				3832	}
				3833
				3834	/*
				3835	* Apply limit_min filter, no need to check if the LIMITS
				3836	* filter is used, limit_min is 0 by default
				3837	*/
				3838	if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
				3839	count_data < bctl->data.limit_min)
				3840	\|\| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
				3841	count_meta < bctl->meta.limit_min)
				3842	\|\| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
				3843	count_sys < bctl->sys.limit_min)) {
				3844	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3845	goto loop;
				3846	}
				3847
				3848	if (!chunk_reserved) {
				3849	/*
				3850	* We may be relocating the only data chunk we have,
				3851	* which could potentially end up with losing data's
				3852	* raid profile, so lets allocate an empty one in
				3853	* advance.
				3854	*/
				3855	ret = btrfs_may_alloc_data_chunk(fs_info,
				3856	found_key.offset);
				3857	if (ret < 0) {
				3858	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3859	goto error;
				3860	} else if (ret == 1) {
				3861	chunk_reserved = 1;
				3862	}
				3863	}
				3864
				3865	ret = btrfs_relocate_chunk(fs_info, found_key.offset);
				3866	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3867	if (ret == -ENOSPC) {
				3868	enospc_errors++;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3869	} else if (ret == -ETXTBSY) {
				3870	btrfs_info(fs_info,
				3871	"skipping relocation of block group %llu due to active swapfile",
				3872	found_key.offset);
				3873	ret = 0;
				3874	} else if (ret) {
				3875	goto error;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3876	} else {
				3877	spin_lock(&fs_info->balance_lock);
				3878	bctl->stat.completed++;
				3879	spin_unlock(&fs_info->balance_lock);
				3880	}
				3881	loop:
				3882	if (found_key.offset == 0)
				3883	break;
				3884	key.offset = found_key.offset - 1;
				3885	}
				3886
				3887	if (counting) {
				3888	btrfs_release_path(path);
				3889	counting = false;
				3890	goto again;
				3891	}
				3892	error:
				3893	btrfs_free_path(path);
				3894	if (enospc_errors) {
				3895	btrfs_info(fs_info, "%d enospc errors during balance",
				3896	enospc_errors);
				3897	if (!ret)
				3898	ret = -ENOSPC;
				3899	}
				3900
				3901	return ret;
				3902	}
				3903
				3904	/**
				3905	* alloc_profile_is_valid - see if a given profile is valid and reduced
				3906	* @flags: profile to validate
				3907	* @extended: if true @flags is treated as an extended profile
				3908	*/
				3909	static int alloc_profile_is_valid(u64 flags, int extended)
				3910	{
				3911	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
				3912	BTRFS_BLOCK_GROUP_PROFILE_MASK);
				3913
				3914	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
				3915
				3916	/* 1) check that all other bits are zeroed */
				3917	if (flags & ~mask)
				3918	return 0;
				3919
				3920	/* 2) see if profile is reduced */
				3921	if (flags == 0)
				3922	return !extended; /* "0" is valid for usual profiles */
				3923
				3924	/* true if exactly one bit set */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3925	/*
				3926	* Don't use is_power_of_2(unsigned long) because it won't work
				3927	* for the single profile (1ULL << 48) on 32-bit CPUs.
				3928	*/
				3929	return flags != 0 && (flags & (flags - 1)) == 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3930	}
				3931
				3932	static inline int balance_need_close(struct btrfs_fs_info *fs_info)
				3933	{
				3934	/* cancel requested \|\| normal exit path */
				3935	return atomic_read(&fs_info->balance_cancel_req) \|\|
				3936	(atomic_read(&fs_info->balance_pause_req) == 0 &&
				3937	atomic_read(&fs_info->balance_cancel_req) == 0);
				3938	}
				3939
				3940	/* Non-zero return value signifies invalidity */
				3941	static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
				3942	u64 allowed)
				3943	{
				3944	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
				3945	(!alloc_profile_is_valid(bctl_arg->target, 1) \|\|
				3946	(bctl_arg->target & ~allowed)));
				3947	}
				3948
				3949	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3950	* Fill @buf with textual description of balance filter flags @bargs, up to
				3951	* @size_buf including the terminating null. The output may be trimmed if it
				3952	* does not fit into the provided buffer.
				3953	*/
				3954	static void describe_balance_args(struct btrfs_balance_args bargs, char buf,
				3955	u32 size_buf)
				3956	{
				3957	int ret;
				3958	u32 size_bp = size_buf;
				3959	char *bp = buf;
				3960	u64 flags = bargs->flags;
				3961	char tmp_buf[128] = {'\0'};
				3962
				3963	if (!flags)
				3964	return;
				3965
				3966	#define CHECK_APPEND_NOARG(a) \
				3967	do { \
				3968	ret = snprintf(bp, size_bp, (a)); \
				3969	if (ret < 0 \|\| ret >= size_bp) \
				3970	goto out_overflow; \
				3971	size_bp -= ret; \
				3972	bp += ret; \
				3973	} while (0)
				3974
				3975	#define CHECK_APPEND_1ARG(a, v1) \
				3976	do { \
				3977	ret = snprintf(bp, size_bp, (a), (v1)); \
				3978	if (ret < 0 \|\| ret >= size_bp) \
				3979	goto out_overflow; \
				3980	size_bp -= ret; \
				3981	bp += ret; \
				3982	} while (0)
				3983
				3984	#define CHECK_APPEND_2ARG(a, v1, v2) \
				3985	do { \
				3986	ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
				3987	if (ret < 0 \|\| ret >= size_bp) \
				3988	goto out_overflow; \
				3989	size_bp -= ret; \
				3990	bp += ret; \
				3991	} while (0)
				3992
				3993	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
				3994	CHECK_APPEND_1ARG("convert=%s,",
				3995	btrfs_bg_type_to_raid_name(bargs->target));
				3996
				3997	if (flags & BTRFS_BALANCE_ARGS_SOFT)
				3998	CHECK_APPEND_NOARG("soft,");
				3999
				4000	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
				4001	btrfs_describe_block_groups(bargs->profiles, tmp_buf,
				4002	sizeof(tmp_buf));
				4003	CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
				4004	}
				4005
				4006	if (flags & BTRFS_BALANCE_ARGS_USAGE)
				4007	CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
				4008
				4009	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
				4010	CHECK_APPEND_2ARG("usage=%u..%u,",
				4011	bargs->usage_min, bargs->usage_max);
				4012
				4013	if (flags & BTRFS_BALANCE_ARGS_DEVID)
				4014	CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
				4015
				4016	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
				4017	CHECK_APPEND_2ARG("drange=%llu..%llu,",
				4018	bargs->pstart, bargs->pend);
				4019
				4020	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
				4021	CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				4022	bargs->vstart, bargs->vend);
				4023
				4024	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
				4025	CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
				4026
				4027	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
				4028	CHECK_APPEND_2ARG("limit=%u..%u,",
				4029	bargs->limit_min, bargs->limit_max);
				4030
				4031	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
				4032	CHECK_APPEND_2ARG("stripes=%u..%u,",
				4033	bargs->stripes_min, bargs->stripes_max);
				4034
				4035	#undef CHECK_APPEND_2ARG
				4036	#undef CHECK_APPEND_1ARG
				4037	#undef CHECK_APPEND_NOARG
				4038
				4039	out_overflow:
				4040
				4041	if (size_bp < size_buf)
				4042	buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
				4043	else
				4044	buf[0] = '\0';
				4045	}
				4046
				4047	static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
				4048	{
				4049	u32 size_buf = 1024;
				4050	char tmp_buf[192] = {'\0'};
				4051	char *buf;
				4052	char *bp;
				4053	u32 size_bp = size_buf;
				4054	int ret;
				4055	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				4056
				4057	buf = kzalloc(size_buf, GFP_KERNEL);
				4058	if (!buf)
				4059	return;
				4060
				4061	bp = buf;
				4062
				4063	#define CHECK_APPEND_1ARG(a, v1) \
				4064	do { \
				4065	ret = snprintf(bp, size_bp, (a), (v1)); \
				4066	if (ret < 0 \|\| ret >= size_bp) \
				4067	goto out_overflow; \
				4068	size_bp -= ret; \
				4069	bp += ret; \
				4070	} while (0)
				4071
				4072	if (bctl->flags & BTRFS_BALANCE_FORCE)
				4073	CHECK_APPEND_1ARG("%s", "-f ");
				4074
				4075	if (bctl->flags & BTRFS_BALANCE_DATA) {
				4076	describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
				4077	CHECK_APPEND_1ARG("-d%s ", tmp_buf);
				4078	}
				4079
				4080	if (bctl->flags & BTRFS_BALANCE_METADATA) {
				4081	describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
				4082	CHECK_APPEND_1ARG("-m%s ", tmp_buf);
				4083	}
				4084
				4085	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
				4086	describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
				4087	CHECK_APPEND_1ARG("-s%s ", tmp_buf);
				4088	}
				4089
				4090	#undef CHECK_APPEND_1ARG
				4091
				4092	out_overflow:
				4093
				4094	if (size_bp < size_buf)
				4095	buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
				4096	btrfs_info(fs_info, "balance: %s %s",
				4097	(bctl->flags & BTRFS_BALANCE_RESUME) ?
				4098	"resume" : "start", buf);
				4099
				4100	kfree(buf);
				4101	}
				4102
				4103	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4104	* Should be called with balance mutex held
				4105	*/
				4106	int btrfs_balance(struct btrfs_fs_info *fs_info,
				4107	struct btrfs_balance_control *bctl,
				4108	struct btrfs_ioctl_balance_args *bargs)
				4109	{
				4110	u64 meta_target, data_target;
				4111	u64 allowed;
				4112	int mixed = 0;
				4113	int ret;
				4114	u64 num_devices;
				4115	unsigned seq;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4116	bool reducing_integrity;
				4117	int i;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4118
				4119	if (btrfs_fs_closing(fs_info) \|\|
				4120	atomic_read(&fs_info->balance_pause_req) \|\|
				4121	atomic_read(&fs_info->balance_cancel_req)) {
				4122	ret = -EINVAL;
				4123	goto out;
				4124	}
				4125
				4126	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
				4127	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
				4128	mixed = 1;
				4129
				4130	/*
				4131	* In case of mixed groups both data and meta should be picked,
				4132	* and identical options should be given for both of them.
				4133	*/
				4134	allowed = BTRFS_BALANCE_DATA \| BTRFS_BALANCE_METADATA;
				4135	if (mixed && (bctl->flags & allowed)) {
				4136	if (!(bctl->flags & BTRFS_BALANCE_DATA) \|\|
				4137	!(bctl->flags & BTRFS_BALANCE_METADATA) \|\|
				4138	memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
				4139	btrfs_err(fs_info,
				4140	"balance: mixed groups data and metadata options must be the same");
				4141	ret = -EINVAL;
				4142	goto out;
				4143	}
				4144	}
				4145
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4146	/*
				4147	* rw_devices will not change at the moment, device add/delete/replace
				4148	* are excluded by EXCL_OP
				4149	*/
				4150	num_devices = fs_info->fs_devices->rw_devices;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4151
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4152	/*
				4153	* SINGLE profile on-disk has no profile bit, but in-memory we have a
				4154	* special bit for it, to make it easier to distinguish. Thus we need
				4155	* to set it manually, or balance would refuse the profile.
				4156	*/
				4157	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
				4158	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
				4159	if (num_devices >= btrfs_raid_array[i].devs_min)
				4160	allowed \|= btrfs_raid_array[i].bg_flag;
				4161
				4162	if (validate_convert_profile(&bctl->data, allowed)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4163	btrfs_err(fs_info,
				4164	"balance: invalid convert data profile %s",
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4165	btrfs_bg_type_to_raid_name(bctl->data.target));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4166	ret = -EINVAL;
				4167	goto out;
				4168	}
				4169	if (validate_convert_profile(&bctl->meta, allowed)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4170	btrfs_err(fs_info,
				4171	"balance: invalid convert metadata profile %s",
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4172	btrfs_bg_type_to_raid_name(bctl->meta.target));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4173	ret = -EINVAL;
				4174	goto out;
				4175	}
				4176	if (validate_convert_profile(&bctl->sys, allowed)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4177	btrfs_err(fs_info,
				4178	"balance: invalid convert system profile %s",
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4179	btrfs_bg_type_to_raid_name(bctl->sys.target));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4180	ret = -EINVAL;
				4181	goto out;
				4182	}
				4183
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4184	/*
				4185	* Allow to reduce metadata or system integrity only if force set for
				4186	* profiles with redundancy (copies, parity)
				4187	*/
				4188	allowed = 0;
				4189	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
				4190	if (btrfs_raid_array[i].ncopies >= 2 \|\|
				4191	btrfs_raid_array[i].tolerated_failures >= 1)
				4192	allowed \|= btrfs_raid_array[i].bg_flag;
				4193	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4194	do {
				4195	seq = read_seqbegin(&fs_info->profiles_lock);
				4196
				4197	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
				4198	(fs_info->avail_system_alloc_bits & allowed) &&
				4199	!(bctl->sys.target & allowed)) \|\|
				4200	((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
				4201	(fs_info->avail_metadata_alloc_bits & allowed) &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4202	!(bctl->meta.target & allowed)))
				4203	reducing_integrity = true;
				4204	else
				4205	reducing_integrity = false;
				4206
				4207	/* if we're not converting, the target field is uninitialized */
				4208	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
				4209	bctl->meta.target : fs_info->avail_metadata_alloc_bits;
				4210	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
				4211	bctl->data.target : fs_info->avail_data_alloc_bits;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4212	} while (read_seqretry(&fs_info->profiles_lock, seq));
				4213
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4214	if (reducing_integrity) {
				4215	if (bctl->flags & BTRFS_BALANCE_FORCE) {
				4216	btrfs_info(fs_info,
				4217	"balance: force reducing metadata integrity");
				4218	} else {
				4219	btrfs_err(fs_info,
				4220	"balance: reduces metadata integrity, use --force if you want this");
				4221	ret = -EINVAL;
				4222	goto out;
				4223	}
				4224	}
				4225
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4226	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
				4227	btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4228	btrfs_warn(fs_info,
				4229	"balance: metadata profile %s has lower redundancy than data profile %s",
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4230	btrfs_bg_type_to_raid_name(meta_target),
				4231	btrfs_bg_type_to_raid_name(data_target));
				4232	}
				4233
				4234	if (fs_info->send_in_progress) {
				4235	btrfs_warn_rl(fs_info,
				4236	"cannot run balance while send operations are in progress (%d in progress)",
				4237	fs_info->send_in_progress);
				4238	ret = -EAGAIN;
				4239	goto out;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4240	}
				4241
				4242	ret = insert_balance_item(fs_info, bctl);
				4243	if (ret && ret != -EEXIST)
				4244	goto out;
				4245
				4246	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
				4247	BUG_ON(ret == -EEXIST);
				4248	BUG_ON(fs_info->balance_ctl);
				4249	spin_lock(&fs_info->balance_lock);
				4250	fs_info->balance_ctl = bctl;
				4251	spin_unlock(&fs_info->balance_lock);
				4252	} else {
				4253	BUG_ON(ret != -EEXIST);
				4254	spin_lock(&fs_info->balance_lock);
				4255	update_balance_args(bctl);
				4256	spin_unlock(&fs_info->balance_lock);
				4257	}
				4258
				4259	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				4260	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4261	describe_balance_start_or_resume(fs_info);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4262	mutex_unlock(&fs_info->balance_mutex);
				4263
				4264	ret = __btrfs_balance(fs_info);
				4265
				4266	mutex_lock(&fs_info->balance_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4267	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
				4268	btrfs_info(fs_info, "balance: paused");
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4269	/*
				4270	* Balance can be canceled by:
				4271	*
				4272	* - Regular cancel request
				4273	* Then ret == -ECANCELED and balance_cancel_req > 0
				4274	*
				4275	* - Fatal signal to "btrfs" process
				4276	* Either the signal caught by wait_reserve_ticket() and callers
				4277	* got -EINTR, or caught by btrfs_should_cancel_balance() and
				4278	* got -ECANCELED.
				4279	* Either way, in this case balance_cancel_req = 0, and
				4280	* ret == -EINTR or ret == -ECANCELED.
				4281	*
				4282	* So here we only check the return value to catch canceled balance.
				4283	*/
				4284	else if (ret == -ECANCELED \|\| ret == -EINTR)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4285	btrfs_info(fs_info, "balance: canceled");
				4286	else
				4287	btrfs_info(fs_info, "balance: ended with status: %d", ret);
				4288
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4289	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
				4290
				4291	if (bargs) {
				4292	memset(bargs, 0, sizeof(*bargs));
				4293	btrfs_update_ioctl_balance_args(fs_info, bargs);
				4294	}
				4295
				4296	if ((ret && ret != -ECANCELED && ret != -ENOSPC) \|\|
				4297	balance_need_close(fs_info)) {
				4298	reset_balance_state(fs_info);
				4299	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				4300	}
				4301
				4302	wake_up(&fs_info->balance_wait_q);
				4303
				4304	return ret;
				4305	out:
				4306	if (bctl->flags & BTRFS_BALANCE_RESUME)
				4307	reset_balance_state(fs_info);
				4308	else
				4309	kfree(bctl);
				4310	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				4311
				4312	return ret;
				4313	}
				4314
				4315	static int balance_kthread(void *data)
				4316	{
				4317	struct btrfs_fs_info *fs_info = data;
				4318	int ret = 0;
				4319
				4320	mutex_lock(&fs_info->balance_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4321	if (fs_info->balance_ctl)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4322	ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4323	mutex_unlock(&fs_info->balance_mutex);
				4324
				4325	return ret;
				4326	}
				4327
				4328	int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
				4329	{
				4330	struct task_struct *tsk;
				4331
				4332	mutex_lock(&fs_info->balance_mutex);
				4333	if (!fs_info->balance_ctl) {
				4334	mutex_unlock(&fs_info->balance_mutex);
				4335	return 0;
				4336	}
				4337	mutex_unlock(&fs_info->balance_mutex);
				4338
				4339	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
				4340	btrfs_info(fs_info, "balance: resume skipped");
				4341	return 0;
				4342	}
				4343
				4344	/*
				4345	* A ro->rw remount sequence should continue with the paused balance
				4346	* regardless of who pauses it, system or the user as of now, so set
				4347	* the resume flag.
				4348	*/
				4349	spin_lock(&fs_info->balance_lock);
				4350	fs_info->balance_ctl->flags \|= BTRFS_BALANCE_RESUME;
				4351	spin_unlock(&fs_info->balance_lock);
				4352
				4353	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
				4354	return PTR_ERR_OR_ZERO(tsk);
				4355	}
				4356
				4357	int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
				4358	{
				4359	struct btrfs_balance_control *bctl;
				4360	struct btrfs_balance_item *item;
				4361	struct btrfs_disk_balance_args disk_bargs;
				4362	struct btrfs_path *path;
				4363	struct extent_buffer *leaf;
				4364	struct btrfs_key key;
				4365	int ret;
				4366
				4367	path = btrfs_alloc_path();
				4368	if (!path)
				4369	return -ENOMEM;
				4370
				4371	key.objectid = BTRFS_BALANCE_OBJECTID;
				4372	key.type = BTRFS_TEMPORARY_ITEM_KEY;
				4373	key.offset = 0;
				4374
				4375	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
				4376	if (ret < 0)
				4377	goto out;
				4378	if (ret > 0) { /* ret = -ENOENT; */
				4379	ret = 0;
				4380	goto out;
				4381	}
				4382
				4383	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
				4384	if (!bctl) {
				4385	ret = -ENOMEM;
				4386	goto out;
				4387	}
				4388
				4389	leaf = path->nodes[0];
				4390	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
				4391
				4392	bctl->flags = btrfs_balance_flags(leaf, item);
				4393	bctl->flags \|= BTRFS_BALANCE_RESUME;
				4394
				4395	btrfs_balance_data(leaf, item, &disk_bargs);
				4396	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
				4397	btrfs_balance_meta(leaf, item, &disk_bargs);
				4398	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
				4399	btrfs_balance_sys(leaf, item, &disk_bargs);
				4400	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
				4401
				4402	/*
				4403	* This should never happen, as the paused balance state is recovered
				4404	* during mount without any chance of other exclusive ops to collide.
				4405	*
				4406	* This gives the exclusive op status to balance and keeps in paused
				4407	* state until user intervention (cancel or umount). If the ownership
				4408	* cannot be assigned, show a message but do not fail. The balance
				4409	* is in a paused state and must have fs_info::balance_ctl properly
				4410	* set up.
				4411	*/
				4412	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
				4413	btrfs_warn(fs_info,
				4414	"balance: cannot set exclusive op status, resume manually");
				4415
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4416	btrfs_release_path(path);
				4417
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4418	mutex_lock(&fs_info->balance_mutex);
				4419	BUG_ON(fs_info->balance_ctl);
				4420	spin_lock(&fs_info->balance_lock);
				4421	fs_info->balance_ctl = bctl;
				4422	spin_unlock(&fs_info->balance_lock);
				4423	mutex_unlock(&fs_info->balance_mutex);
				4424	out:
				4425	btrfs_free_path(path);
				4426	return ret;
				4427	}
				4428
				4429	int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
				4430	{
				4431	int ret = 0;
				4432
				4433	mutex_lock(&fs_info->balance_mutex);
				4434	if (!fs_info->balance_ctl) {
				4435	mutex_unlock(&fs_info->balance_mutex);
				4436	return -ENOTCONN;
				4437	}
				4438
				4439	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
				4440	atomic_inc(&fs_info->balance_pause_req);
				4441	mutex_unlock(&fs_info->balance_mutex);
				4442
				4443	wait_event(fs_info->balance_wait_q,
				4444	!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				4445
				4446	mutex_lock(&fs_info->balance_mutex);
				4447	/* we are good with balance_ctl ripped off from under us */
				4448	BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				4449	atomic_dec(&fs_info->balance_pause_req);
				4450	} else {
				4451	ret = -ENOTCONN;
				4452	}
				4453
				4454	mutex_unlock(&fs_info->balance_mutex);
				4455	return ret;
				4456	}
				4457
				4458	int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
				4459	{
				4460	mutex_lock(&fs_info->balance_mutex);
				4461	if (!fs_info->balance_ctl) {
				4462	mutex_unlock(&fs_info->balance_mutex);
				4463	return -ENOTCONN;
				4464	}
				4465
				4466	/*
				4467	* A paused balance with the item stored on disk can be resumed at
				4468	* mount time if the mount is read-write. Otherwise it's still paused
				4469	* and we must not allow cancelling as it deletes the item.
				4470	*/
				4471	if (sb_rdonly(fs_info->sb)) {
				4472	mutex_unlock(&fs_info->balance_mutex);
				4473	return -EROFS;
				4474	}
				4475
				4476	atomic_inc(&fs_info->balance_cancel_req);
				4477	/*
				4478	* if we are running just wait and return, balance item is
				4479	* deleted in btrfs_balance in this case
				4480	*/
				4481	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
				4482	mutex_unlock(&fs_info->balance_mutex);
				4483	wait_event(fs_info->balance_wait_q,
				4484	!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				4485	mutex_lock(&fs_info->balance_mutex);
				4486	} else {
				4487	mutex_unlock(&fs_info->balance_mutex);
				4488	/*
				4489	* Lock released to allow other waiters to continue, we'll
				4490	* reexamine the status again.
				4491	*/
				4492	mutex_lock(&fs_info->balance_mutex);
				4493
				4494	if (fs_info->balance_ctl) {
				4495	reset_balance_state(fs_info);
				4496	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				4497	btrfs_info(fs_info, "balance: canceled");
				4498	}
				4499	}
				4500
				4501	BUG_ON(fs_info->balance_ctl \|\|
				4502	test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				4503	atomic_dec(&fs_info->balance_cancel_req);
				4504	mutex_unlock(&fs_info->balance_mutex);
				4505	return 0;
				4506	}
				4507
				4508	static int btrfs_uuid_scan_kthread(void *data)
				4509	{
				4510	struct btrfs_fs_info *fs_info = data;
				4511	struct btrfs_root *root = fs_info->tree_root;
				4512	struct btrfs_key key;
				4513	struct btrfs_path *path = NULL;
				4514	int ret = 0;
				4515	struct extent_buffer *eb;
				4516	int slot;
				4517	struct btrfs_root_item root_item;
				4518	u32 item_size;
				4519	struct btrfs_trans_handle *trans = NULL;
				4520
				4521	path = btrfs_alloc_path();
				4522	if (!path) {
				4523	ret = -ENOMEM;
				4524	goto out;
				4525	}
				4526
				4527	key.objectid = 0;
				4528	key.type = BTRFS_ROOT_ITEM_KEY;
				4529	key.offset = 0;
				4530
				4531	while (1) {
				4532	ret = btrfs_search_forward(root, &key, path,
				4533	BTRFS_OLDEST_GENERATION);
				4534	if (ret) {
				4535	if (ret > 0)
				4536	ret = 0;
				4537	break;
				4538	}
				4539
				4540	if (key.type != BTRFS_ROOT_ITEM_KEY \|\|
				4541	(key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
				4542	key.objectid != BTRFS_FS_TREE_OBJECTID) \|\|
				4543	key.objectid > BTRFS_LAST_FREE_OBJECTID)
				4544	goto skip;
				4545
				4546	eb = path->nodes[0];
				4547	slot = path->slots[0];
				4548	item_size = btrfs_item_size_nr(eb, slot);
				4549	if (item_size < sizeof(root_item))
				4550	goto skip;
				4551
				4552	read_extent_buffer(eb, &root_item,
				4553	btrfs_item_ptr_offset(eb, slot),
				4554	(int)sizeof(root_item));
				4555	if (btrfs_root_refs(&root_item) == 0)
				4556	goto skip;
				4557
				4558	if (!btrfs_is_empty_uuid(root_item.uuid) \|\|
				4559	!btrfs_is_empty_uuid(root_item.received_uuid)) {
				4560	if (trans)
				4561	goto update_tree;
				4562
				4563	btrfs_release_path(path);
				4564	/*
				4565	* 1 - subvol uuid item
				4566	* 1 - received_subvol uuid item
				4567	*/
				4568	trans = btrfs_start_transaction(fs_info->uuid_root, 2);
				4569	if (IS_ERR(trans)) {
				4570	ret = PTR_ERR(trans);
				4571	break;
				4572	}
				4573	continue;
				4574	} else {
				4575	goto skip;
				4576	}
				4577	update_tree:
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4578	btrfs_release_path(path);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4579	if (!btrfs_is_empty_uuid(root_item.uuid)) {
				4580	ret = btrfs_uuid_tree_add(trans, root_item.uuid,
				4581	BTRFS_UUID_KEY_SUBVOL,
				4582	key.objectid);
				4583	if (ret < 0) {
				4584	btrfs_warn(fs_info, "uuid_tree_add failed %d",
				4585	ret);
				4586	break;
				4587	}
				4588	}
				4589
				4590	if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
				4591	ret = btrfs_uuid_tree_add(trans,
				4592	root_item.received_uuid,
				4593	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				4594	key.objectid);
				4595	if (ret < 0) {
				4596	btrfs_warn(fs_info, "uuid_tree_add failed %d",
				4597	ret);
				4598	break;
				4599	}
				4600	}
				4601
				4602	skip:
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4603	btrfs_release_path(path);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4604	if (trans) {
				4605	ret = btrfs_end_transaction(trans);
				4606	trans = NULL;
				4607	if (ret)
				4608	break;
				4609	}
				4610
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4611	if (key.offset < (u64)-1) {
				4612	key.offset++;
				4613	} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
				4614	key.offset = 0;
				4615	key.type = BTRFS_ROOT_ITEM_KEY;
				4616	} else if (key.objectid < (u64)-1) {
				4617	key.offset = 0;
				4618	key.type = BTRFS_ROOT_ITEM_KEY;
				4619	key.objectid++;
				4620	} else {
				4621	break;
				4622	}
				4623	cond_resched();
				4624	}
				4625
				4626	out:
				4627	btrfs_free_path(path);
				4628	if (trans && !IS_ERR(trans))
				4629	btrfs_end_transaction(trans);
				4630	if (ret)
				4631	btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
				4632	else
				4633	set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
				4634	up(&fs_info->uuid_tree_rescan_sem);
				4635	return 0;
				4636	}
				4637
				4638	/*
				4639	* Callback for btrfs_uuid_tree_iterate().
				4640	* returns:
				4641	* 0 check succeeded, the entry is not outdated.
				4642	* < 0 if an error occurred.
				4643	* > 0 if the check failed, which means the caller shall remove the entry.
				4644	*/
				4645	static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
				4646	u8 *uuid, u8 type, u64 subid)
				4647	{
				4648	struct btrfs_key key;
				4649	int ret = 0;
				4650	struct btrfs_root *subvol_root;
				4651
				4652	if (type != BTRFS_UUID_KEY_SUBVOL &&
				4653	type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
				4654	goto out;
				4655
				4656	key.objectid = subid;
				4657	key.type = BTRFS_ROOT_ITEM_KEY;
				4658	key.offset = (u64)-1;
				4659	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
				4660	if (IS_ERR(subvol_root)) {
				4661	ret = PTR_ERR(subvol_root);
				4662	if (ret == -ENOENT)
				4663	ret = 1;
				4664	goto out;
				4665	}
				4666
				4667	switch (type) {
				4668	case BTRFS_UUID_KEY_SUBVOL:
				4669	if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
				4670	ret = 1;
				4671	break;
				4672	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
				4673	if (memcmp(uuid, subvol_root->root_item.received_uuid,
				4674	BTRFS_UUID_SIZE))
				4675	ret = 1;
				4676	break;
				4677	}
				4678
				4679	out:
				4680	return ret;
				4681	}
				4682
				4683	static int btrfs_uuid_rescan_kthread(void *data)
				4684	{
				4685	struct btrfs_fs_info fs_info = (struct btrfs_fs_info )data;
				4686	int ret;
				4687
				4688	/*
				4689	* 1st step is to iterate through the existing UUID tree and
				4690	* to delete all entries that contain outdated data.
				4691	* 2nd step is to add all missing entries to the UUID tree.
				4692	*/
				4693	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
				4694	if (ret < 0) {
				4695	btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
				4696	up(&fs_info->uuid_tree_rescan_sem);
				4697	return ret;
				4698	}
				4699	return btrfs_uuid_scan_kthread(data);
				4700	}
				4701
				4702	int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
				4703	{
				4704	struct btrfs_trans_handle *trans;
				4705	struct btrfs_root *tree_root = fs_info->tree_root;
				4706	struct btrfs_root *uuid_root;
				4707	struct task_struct *task;
				4708	int ret;
				4709
				4710	/*
				4711	* 1 - root node
				4712	* 1 - root item
				4713	*/
				4714	trans = btrfs_start_transaction(tree_root, 2);
				4715	if (IS_ERR(trans))
				4716	return PTR_ERR(trans);
				4717
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4718	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4719	if (IS_ERR(uuid_root)) {
				4720	ret = PTR_ERR(uuid_root);
				4721	btrfs_abort_transaction(trans, ret);
				4722	btrfs_end_transaction(trans);
				4723	return ret;
				4724	}
				4725
				4726	fs_info->uuid_root = uuid_root;
				4727
				4728	ret = btrfs_commit_transaction(trans);
				4729	if (ret)
				4730	return ret;
				4731
				4732	down(&fs_info->uuid_tree_rescan_sem);
				4733	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
				4734	if (IS_ERR(task)) {
				4735	/* fs_info->update_uuid_tree_gen remains 0 in all error case */
				4736	btrfs_warn(fs_info, "failed to start uuid_scan task");
				4737	up(&fs_info->uuid_tree_rescan_sem);
				4738	return PTR_ERR(task);
				4739	}
				4740
				4741	return 0;
				4742	}
				4743
				4744	int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
				4745	{
				4746	struct task_struct *task;
				4747
				4748	down(&fs_info->uuid_tree_rescan_sem);
				4749	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
				4750	if (IS_ERR(task)) {
				4751	/* fs_info->update_uuid_tree_gen remains 0 in all error case */
				4752	btrfs_warn(fs_info, "failed to start uuid_rescan task");
				4753	up(&fs_info->uuid_tree_rescan_sem);
				4754	return PTR_ERR(task);
				4755	}
				4756
				4757	return 0;
				4758	}
				4759
				4760	/*
				4761	* shrinking a device means finding all of the device extents past
				4762	* the new size, and then following the back refs to the chunks.
				4763	* The chunk relocation code actually frees the device extent
				4764	*/
				4765	int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
				4766	{
				4767	struct btrfs_fs_info *fs_info = device->fs_info;
				4768	struct btrfs_root *root = fs_info->dev_root;
				4769	struct btrfs_trans_handle *trans;
				4770	struct btrfs_dev_extent *dev_extent = NULL;
				4771	struct btrfs_path *path;
				4772	u64 length;
				4773	u64 chunk_offset;
				4774	int ret;
				4775	int slot;
				4776	int failed = 0;
				4777	bool retried = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4778	struct extent_buffer *l;
				4779	struct btrfs_key key;
				4780	struct btrfs_super_block *super_copy = fs_info->super_copy;
				4781	u64 old_total = btrfs_super_total_bytes(super_copy);
				4782	u64 old_size = btrfs_device_get_total_bytes(device);
				4783	u64 diff;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4784	u64 start;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4785
				4786	new_size = round_down(new_size, fs_info->sectorsize);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4787	start = new_size;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4788	diff = round_down(old_size - new_size, fs_info->sectorsize);
				4789
				4790	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
				4791	return -EINVAL;
				4792
				4793	path = btrfs_alloc_path();
				4794	if (!path)
				4795	return -ENOMEM;
				4796
				4797	path->reada = READA_BACK;
				4798
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4799	trans = btrfs_start_transaction(root, 0);
				4800	if (IS_ERR(trans)) {
				4801	btrfs_free_path(path);
				4802	return PTR_ERR(trans);
				4803	}
				4804
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4805	mutex_lock(&fs_info->chunk_mutex);
				4806
				4807	btrfs_device_set_total_bytes(device, new_size);
				4808	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				4809	device->fs_devices->total_rw_bytes -= diff;
				4810	atomic64_sub(diff, &fs_info->free_chunk_space);
				4811	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4812
				4813	/*
				4814	* Once the device's size has been set to the new size, ensure all
				4815	* in-memory chunks are synced to disk so that the loop below sees them
				4816	* and relocates them accordingly.
				4817	*/
				4818	if (contains_pending_extent(device, &start, diff)) {
				4819	mutex_unlock(&fs_info->chunk_mutex);
				4820	ret = btrfs_commit_transaction(trans);
				4821	if (ret)
				4822	goto done;
				4823	} else {
				4824	mutex_unlock(&fs_info->chunk_mutex);
				4825	btrfs_end_transaction(trans);
				4826	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4827
				4828	again:
				4829	key.objectid = device->devid;
				4830	key.offset = (u64)-1;
				4831	key.type = BTRFS_DEV_EXTENT_KEY;
				4832
				4833	do {
				4834	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				4835	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4836	if (ret < 0) {
				4837	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4838	goto done;
				4839	}
				4840
				4841	ret = btrfs_previous_item(root, path, 0, key.type);
				4842	if (ret)
				4843	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4844	if (ret < 0)
				4845	goto done;
				4846	if (ret) {
				4847	ret = 0;
				4848	btrfs_release_path(path);
				4849	break;
				4850	}
				4851
				4852	l = path->nodes[0];
				4853	slot = path->slots[0];
				4854	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
				4855
				4856	if (key.objectid != device->devid) {
				4857	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4858	btrfs_release_path(path);
				4859	break;
				4860	}
				4861
				4862	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
				4863	length = btrfs_dev_extent_length(l, dev_extent);
				4864
				4865	if (key.offset + length <= new_size) {
				4866	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4867	btrfs_release_path(path);
				4868	break;
				4869	}
				4870
				4871	chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
				4872	btrfs_release_path(path);
				4873
				4874	/*
				4875	* We may be relocating the only data chunk we have,
				4876	* which could potentially end up with losing data's
				4877	* raid profile, so lets allocate an empty one in
				4878	* advance.
				4879	*/
				4880	ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
				4881	if (ret < 0) {
				4882	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4883	goto done;
				4884	}
				4885
				4886	ret = btrfs_relocate_chunk(fs_info, chunk_offset);
				4887	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4888	if (ret == -ENOSPC) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4889	failed++;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4890	} else if (ret) {
				4891	if (ret == -ETXTBSY) {
				4892	btrfs_warn(fs_info,
				4893	"could not shrink block group %llu due to active swapfile",
				4894	chunk_offset);
				4895	}
				4896	goto done;
				4897	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4898	} while (key.offset-- > 0);
				4899
				4900	if (failed && !retried) {
				4901	failed = 0;
				4902	retried = true;
				4903	goto again;
				4904	} else if (failed && retried) {
				4905	ret = -ENOSPC;
				4906	goto done;
				4907	}
				4908
				4909	/* Shrinking succeeded, else we would be at "done". */
				4910	trans = btrfs_start_transaction(root, 0);
				4911	if (IS_ERR(trans)) {
				4912	ret = PTR_ERR(trans);
				4913	goto done;
				4914	}
				4915
				4916	mutex_lock(&fs_info->chunk_mutex);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4917	/* Clear all state bits beyond the shrunk device size */
				4918	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
				4919	CHUNK_STATE_MASK);
				4920
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4921	btrfs_device_set_disk_total_bytes(device, new_size);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4922	if (list_empty(&device->post_commit_list))
				4923	list_add_tail(&device->post_commit_list,
				4924	&trans->transaction->dev_update_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4925
				4926	WARN_ON(diff > old_total);
				4927	btrfs_set_super_total_bytes(super_copy,
				4928	round_down(old_total - diff, fs_info->sectorsize));
				4929	mutex_unlock(&fs_info->chunk_mutex);
				4930
				4931	/* Now btrfs_update_device() will change the on-disk size. */
				4932	ret = btrfs_update_device(trans, device);
				4933	if (ret < 0) {
				4934	btrfs_abort_transaction(trans, ret);
				4935	btrfs_end_transaction(trans);
				4936	} else {
				4937	ret = btrfs_commit_transaction(trans);
				4938	}
				4939	done:
				4940	btrfs_free_path(path);
				4941	if (ret) {
				4942	mutex_lock(&fs_info->chunk_mutex);
				4943	btrfs_device_set_total_bytes(device, old_size);
				4944	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				4945	device->fs_devices->total_rw_bytes += diff;
				4946	atomic64_add(diff, &fs_info->free_chunk_space);
				4947	mutex_unlock(&fs_info->chunk_mutex);
				4948	}
				4949	return ret;
				4950	}
				4951
				4952	static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
				4953	struct btrfs_key *key,
				4954	struct btrfs_chunk *chunk, int item_size)
				4955	{
				4956	struct btrfs_super_block *super_copy = fs_info->super_copy;
				4957	struct btrfs_disk_key disk_key;
				4958	u32 array_size;
				4959	u8 *ptr;
				4960
				4961	mutex_lock(&fs_info->chunk_mutex);
				4962	array_size = btrfs_super_sys_array_size(super_copy);
				4963	if (array_size + item_size + sizeof(disk_key)
				4964	> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
				4965	mutex_unlock(&fs_info->chunk_mutex);
				4966	return -EFBIG;
				4967	}
				4968
				4969	ptr = super_copy->sys_chunk_array + array_size;
				4970	btrfs_cpu_key_to_disk(&disk_key, key);
				4971	memcpy(ptr, &disk_key, sizeof(disk_key));
				4972	ptr += sizeof(disk_key);
				4973	memcpy(ptr, chunk, item_size);
				4974	item_size += sizeof(disk_key);
				4975	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
				4976	mutex_unlock(&fs_info->chunk_mutex);
				4977
				4978	return 0;
				4979	}
				4980
				4981	/*
				4982	* sort the devices in descending order by max_avail, total_avail
				4983	*/
				4984	static int btrfs_cmp_device_info(const void a, const void b)
				4985	{
				4986	const struct btrfs_device_info *di_a = a;
				4987	const struct btrfs_device_info *di_b = b;
				4988
				4989	if (di_a->max_avail > di_b->max_avail)
				4990	return -1;
				4991	if (di_a->max_avail < di_b->max_avail)
				4992	return 1;
				4993	if (di_a->total_avail > di_b->total_avail)
				4994	return -1;
				4995	if (di_a->total_avail < di_b->total_avail)
				4996	return 1;
				4997	return 0;
				4998	}
				4999
				5000	static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
				5001	{
				5002	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
				5003	return;
				5004
				5005	btrfs_set_fs_incompat(info, RAID56);
				5006	}
				5007
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5008	static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
				5009	u64 start, u64 type)
				5010	{
				5011	struct btrfs_fs_info *info = trans->fs_info;
				5012	struct btrfs_fs_devices *fs_devices = info->fs_devices;
				5013	struct btrfs_device *device;
				5014	struct map_lookup *map = NULL;
				5015	struct extent_map_tree *em_tree;
				5016	struct extent_map *em;
				5017	struct btrfs_device_info *devices_info = NULL;
				5018	u64 total_avail;
				5019	int num_stripes; /* total number of stripes to allocate */
				5020	int data_stripes; /* number of stripes that count for
				5021	block group size */
				5022	int sub_stripes; /* sub_stripes info for map */
				5023	int dev_stripes; /* stripes per dev */
				5024	int devs_max; /* max devs to use */
				5025	int devs_min; /* min devs needed */
				5026	int devs_increment; /* ndevs has to be a multiple of this */
				5027	int ncopies; /* how many copies to data has */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5028	int nparity; /* number of stripes worth of bytes to
				5029	store parity information */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5030	int ret;
				5031	u64 max_stripe_size;
				5032	u64 max_chunk_size;
				5033	u64 stripe_size;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5034	u64 chunk_size;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5035	int ndevs;
				5036	int i;
				5037	int j;
				5038	int index;
				5039
				5040	BUG_ON(!alloc_profile_is_valid(type, 0));
				5041
				5042	if (list_empty(&fs_devices->alloc_list)) {
				5043	if (btrfs_test_opt(info, ENOSPC_DEBUG))
				5044	btrfs_debug(info, "%s: no writable device", __func__);
				5045	return -ENOSPC;
				5046	}
				5047
				5048	index = btrfs_bg_flags_to_raid_index(type);
				5049
				5050	sub_stripes = btrfs_raid_array[index].sub_stripes;
				5051	dev_stripes = btrfs_raid_array[index].dev_stripes;
				5052	devs_max = btrfs_raid_array[index].devs_max;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5053	if (!devs_max)
				5054	devs_max = BTRFS_MAX_DEVS(info);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5055	devs_min = btrfs_raid_array[index].devs_min;
				5056	devs_increment = btrfs_raid_array[index].devs_increment;
				5057	ncopies = btrfs_raid_array[index].ncopies;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5058	nparity = btrfs_raid_array[index].nparity;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5059
				5060	if (type & BTRFS_BLOCK_GROUP_DATA) {
				5061	max_stripe_size = SZ_1G;
				5062	max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5063	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
				5064	/* for larger filesystems, use larger metadata chunks */
				5065	if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
				5066	max_stripe_size = SZ_1G;
				5067	else
				5068	max_stripe_size = SZ_256M;
				5069	max_chunk_size = max_stripe_size;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5070	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
				5071	max_stripe_size = SZ_32M;
				5072	max_chunk_size = 2 * max_stripe_size;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5073	devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5074	} else {
				5075	btrfs_err(info, "invalid chunk type 0x%llx requested",
				5076	type);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5077	BUG();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5078	}
				5079
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5080	/* We don't want a chunk larger than 10% of writable space */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5081	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
				5082	max_chunk_size);
				5083
				5084	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
				5085	GFP_NOFS);
				5086	if (!devices_info)
				5087	return -ENOMEM;
				5088
				5089	/*
				5090	* in the first pass through the devices list, we gather information
				5091	* about the available holes on each device.
				5092	*/
				5093	ndevs = 0;
				5094	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
				5095	u64 max_avail;
				5096	u64 dev_offset;
				5097
				5098	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				5099	WARN(1, KERN_ERR
				5100	"BTRFS: read-only device in alloc_list\n");
				5101	continue;
				5102	}
				5103
				5104	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				5105	&device->dev_state) \|\|
				5106	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
				5107	continue;
				5108
				5109	if (device->total_bytes > device->bytes_used)
				5110	total_avail = device->total_bytes - device->bytes_used;
				5111	else
				5112	total_avail = 0;
				5113
				5114	/* If there is no space on this device, skip it. */
				5115	if (total_avail == 0)
				5116	continue;
				5117
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5118	ret = find_free_dev_extent(device,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5119	max_stripe_size * dev_stripes,
				5120	&dev_offset, &max_avail);
				5121	if (ret && ret != -ENOSPC)
				5122	goto error;
				5123
				5124	if (ret == 0)
				5125	max_avail = max_stripe_size * dev_stripes;
				5126
				5127	if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
				5128	if (btrfs_test_opt(info, ENOSPC_DEBUG))
				5129	btrfs_debug(info,
				5130	"%s: devid %llu has no free space, have=%llu want=%u",
				5131	__func__, device->devid, max_avail,
				5132	BTRFS_STRIPE_LEN * dev_stripes);
				5133	continue;
				5134	}
				5135
				5136	if (ndevs == fs_devices->rw_devices) {
				5137	WARN(1, "%s: found more than %llu devices\n",
				5138	__func__, fs_devices->rw_devices);
				5139	break;
				5140	}
				5141	devices_info[ndevs].dev_offset = dev_offset;
				5142	devices_info[ndevs].max_avail = max_avail;
				5143	devices_info[ndevs].total_avail = total_avail;
				5144	devices_info[ndevs].dev = device;
				5145	++ndevs;
				5146	}
				5147
				5148	/*
				5149	* now sort the devices by hole size / available space
				5150	*/
				5151	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
				5152	btrfs_cmp_device_info, NULL);
				5153
				5154	/* round down to number of usable stripes */
				5155	ndevs = round_down(ndevs, devs_increment);
				5156
				5157	if (ndevs < devs_min) {
				5158	ret = -ENOSPC;
				5159	if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
				5160	btrfs_debug(info,
				5161	"%s: not enough devices with free space: have=%d minimum required=%d",
				5162	__func__, ndevs, devs_min);
				5163	}
				5164	goto error;
				5165	}
				5166
				5167	ndevs = min(ndevs, devs_max);
				5168
				5169	/*
				5170	* The primary goal is to maximize the number of stripes, so use as
				5171	* many devices as possible, even if the stripes are not maximum sized.
				5172	*
				5173	* The DUP profile stores more than one stripe per device, the
				5174	* max_avail is the total size so we have to adjust.
				5175	*/
				5176	stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
				5177	num_stripes = ndevs * dev_stripes;
				5178
				5179	/*
				5180	* this will have to be fixed for RAID1 and RAID10 over
				5181	* more drives
				5182	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5183	data_stripes = (num_stripes - nparity) / ncopies;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5184
				5185	/*
				5186	* Use the number of data stripes to figure out how big this chunk
				5187	* is really going to be in terms of logical address space,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5188	* and compare that answer with the max chunk size. If it's higher,
				5189	* we try to reduce stripe_size.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5190	*/
				5191	if (stripe_size * data_stripes > max_chunk_size) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5192	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5193	* Reduce stripe_size, round it up to a 16MB boundary again and
				5194	* then use it, unless it ends up being even bigger than the
				5195	* previous value we had already.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5196	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5197	stripe_size = min(round_up(div_u64(max_chunk_size,
				5198	data_stripes), SZ_16M),
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5199	stripe_size);
				5200	}
				5201
				5202	/* align to BTRFS_STRIPE_LEN */
				5203	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
				5204
				5205	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
				5206	if (!map) {
				5207	ret = -ENOMEM;
				5208	goto error;
				5209	}
				5210	map->num_stripes = num_stripes;
				5211
				5212	for (i = 0; i < ndevs; ++i) {
				5213	for (j = 0; j < dev_stripes; ++j) {
				5214	int s = i * dev_stripes + j;
				5215	map->stripes[s].dev = devices_info[i].dev;
				5216	map->stripes[s].physical = devices_info[i].dev_offset +
				5217	j * stripe_size;
				5218	}
				5219	}
				5220	map->stripe_len = BTRFS_STRIPE_LEN;
				5221	map->io_align = BTRFS_STRIPE_LEN;
				5222	map->io_width = BTRFS_STRIPE_LEN;
				5223	map->type = type;
				5224	map->sub_stripes = sub_stripes;
				5225
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5226	chunk_size = stripe_size * data_stripes;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5227
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5228	trace_btrfs_chunk_alloc(info, map, start, chunk_size);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5229
				5230	em = alloc_extent_map();
				5231	if (!em) {
				5232	kfree(map);
				5233	ret = -ENOMEM;
				5234	goto error;
				5235	}
				5236	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
				5237	em->map_lookup = map;
				5238	em->start = start;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5239	em->len = chunk_size;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5240	em->block_start = 0;
				5241	em->block_len = em->len;
				5242	em->orig_block_len = stripe_size;
				5243
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5244	em_tree = &info->mapping_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5245	write_lock(&em_tree->lock);
				5246	ret = add_extent_mapping(em_tree, em, 0);
				5247	if (ret) {
				5248	write_unlock(&em_tree->lock);
				5249	free_extent_map(em);
				5250	goto error;
				5251	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5252	write_unlock(&em_tree->lock);
				5253
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5254	ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5255	if (ret)
				5256	goto error_del_extent;
				5257
				5258	for (i = 0; i < map->num_stripes; i++) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5259	struct btrfs_device *dev = map->stripes[i].dev;
				5260
				5261	btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size);
				5262	if (list_empty(&dev->post_commit_list))
				5263	list_add_tail(&dev->post_commit_list,
				5264	&trans->transaction->dev_update_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5265	}
				5266
				5267	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
				5268
				5269	free_extent_map(em);
				5270	check_raid56_incompat_flag(info, type);
				5271
				5272	kfree(devices_info);
				5273	return 0;
				5274
				5275	error_del_extent:
				5276	write_lock(&em_tree->lock);
				5277	remove_extent_mapping(em_tree, em);
				5278	write_unlock(&em_tree->lock);
				5279
				5280	/* One for our allocation */
				5281	free_extent_map(em);
				5282	/* One for the tree reference */
				5283	free_extent_map(em);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5284	error:
				5285	kfree(devices_info);
				5286	return ret;
				5287	}
				5288
				5289	int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
				5290	u64 chunk_offset, u64 chunk_size)
				5291	{
				5292	struct btrfs_fs_info *fs_info = trans->fs_info;
				5293	struct btrfs_root *extent_root = fs_info->extent_root;
				5294	struct btrfs_root *chunk_root = fs_info->chunk_root;
				5295	struct btrfs_key key;
				5296	struct btrfs_device *device;
				5297	struct btrfs_chunk *chunk;
				5298	struct btrfs_stripe *stripe;
				5299	struct extent_map *em;
				5300	struct map_lookup *map;
				5301	size_t item_size;
				5302	u64 dev_offset;
				5303	u64 stripe_size;
				5304	int i = 0;
				5305	int ret = 0;
				5306
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5307	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5308	if (IS_ERR(em))
				5309	return PTR_ERR(em);
				5310
				5311	map = em->map_lookup;
				5312	item_size = btrfs_chunk_item_size(map->num_stripes);
				5313	stripe_size = em->orig_block_len;
				5314
				5315	chunk = kzalloc(item_size, GFP_NOFS);
				5316	if (!chunk) {
				5317	ret = -ENOMEM;
				5318	goto out;
				5319	}
				5320
				5321	/*
				5322	* Take the device list mutex to prevent races with the final phase of
				5323	* a device replace operation that replaces the device object associated
				5324	* with the map's stripes, because the device object's id can change
				5325	* at any time during that final phase of the device replace operation
				5326	* (dev-replace.c:btrfs_dev_replace_finishing()).
				5327	*/
				5328	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				5329	for (i = 0; i < map->num_stripes; i++) {
				5330	device = map->stripes[i].dev;
				5331	dev_offset = map->stripes[i].physical;
				5332
				5333	ret = btrfs_update_device(trans, device);
				5334	if (ret)
				5335	break;
				5336	ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
				5337	dev_offset, stripe_size);
				5338	if (ret)
				5339	break;
				5340	}
				5341	if (ret) {
				5342	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				5343	goto out;
				5344	}
				5345
				5346	stripe = &chunk->stripe;
				5347	for (i = 0; i < map->num_stripes; i++) {
				5348	device = map->stripes[i].dev;
				5349	dev_offset = map->stripes[i].physical;
				5350
				5351	btrfs_set_stack_stripe_devid(stripe, device->devid);
				5352	btrfs_set_stack_stripe_offset(stripe, dev_offset);
				5353	memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
				5354	stripe++;
				5355	}
				5356	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				5357
				5358	btrfs_set_stack_chunk_length(chunk, chunk_size);
				5359	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
				5360	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
				5361	btrfs_set_stack_chunk_type(chunk, map->type);
				5362	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
				5363	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
				5364	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
				5365	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
				5366	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
				5367
				5368	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				5369	key.type = BTRFS_CHUNK_ITEM_KEY;
				5370	key.offset = chunk_offset;
				5371
				5372	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
				5373	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
				5374	/*
				5375	* TODO: Cleanup of inserted chunk root in case of
				5376	* failure.
				5377	*/
				5378	ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
				5379	}
				5380
				5381	out:
				5382	kfree(chunk);
				5383	free_extent_map(em);
				5384	return ret;
				5385	}
				5386
				5387	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5388	* Chunk allocation falls into two parts. The first part does work
				5389	* that makes the new allocated chunk usable, but does not do any operation
				5390	* that modifies the chunk tree. The second part does the work that
				5391	* requires modifying the chunk tree. This division is important for the
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5392	* bootstrap process of adding storage to a seed btrfs.
				5393	*/
				5394	int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
				5395	{
				5396	u64 chunk_offset;
				5397
				5398	lockdep_assert_held(&trans->fs_info->chunk_mutex);
				5399	chunk_offset = find_next_chunk(trans->fs_info);
				5400	return __btrfs_alloc_chunk(trans, chunk_offset, type);
				5401	}
				5402
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5403	static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5404	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5405	struct btrfs_fs_info *fs_info = trans->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5406	u64 chunk_offset;
				5407	u64 sys_chunk_offset;
				5408	u64 alloc_profile;
				5409	int ret;
				5410
				5411	chunk_offset = find_next_chunk(fs_info);
				5412	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
				5413	ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
				5414	if (ret)
				5415	return ret;
				5416
				5417	sys_chunk_offset = find_next_chunk(fs_info);
				5418	alloc_profile = btrfs_system_alloc_profile(fs_info);
				5419	ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
				5420	return ret;
				5421	}
				5422
				5423	static inline int btrfs_chunk_max_errors(struct map_lookup *map)
				5424	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5425	const int index = btrfs_bg_flags_to_raid_index(map->type);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5426
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5427	return btrfs_raid_array[index].tolerated_failures;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5428	}
				5429
				5430	int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
				5431	{
				5432	struct extent_map *em;
				5433	struct map_lookup *map;
				5434	int readonly = 0;
				5435	int miss_ndevs = 0;
				5436	int i;
				5437
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5438	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5439	if (IS_ERR(em))
				5440	return 1;
				5441
				5442	map = em->map_lookup;
				5443	for (i = 0; i < map->num_stripes; i++) {
				5444	if (test_bit(BTRFS_DEV_STATE_MISSING,
				5445	&map->stripes[i].dev->dev_state)) {
				5446	miss_ndevs++;
				5447	continue;
				5448	}
				5449	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
				5450	&map->stripes[i].dev->dev_state)) {
				5451	readonly = 1;
				5452	goto end;
				5453	}
				5454	}
				5455
				5456	/*
				5457	* If the number of missing devices is larger than max errors,
				5458	* we can not write the data into that chunk successfully, so
				5459	* set it readonly.
				5460	*/
				5461	if (miss_ndevs > btrfs_chunk_max_errors(map))
				5462	readonly = 1;
				5463	end:
				5464	free_extent_map(em);
				5465	return readonly;
				5466	}
				5467
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5468	void btrfs_mapping_tree_free(struct extent_map_tree *tree)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5469	{
				5470	struct extent_map *em;
				5471
				5472	while (1) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5473	write_lock(&tree->lock);
				5474	em = lookup_extent_mapping(tree, 0, (u64)-1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5475	if (em)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5476	remove_extent_mapping(tree, em);
				5477	write_unlock(&tree->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5478	if (!em)
				5479	break;
				5480	/* once for us */
				5481	free_extent_map(em);
				5482	/* once for the tree */
				5483	free_extent_map(em);
				5484	}
				5485	}
				5486
				5487	int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
				5488	{
				5489	struct extent_map *em;
				5490	struct map_lookup *map;
				5491	int ret;
				5492
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5493	em = btrfs_get_chunk_map(fs_info, logical, len);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5494	if (IS_ERR(em))
				5495	/*
				5496	* We could return errors for these cases, but that could get
				5497	* ugly and we'd probably do the same thing which is just not do
				5498	* anything else and exit, so return 1 so the callers don't try
				5499	* to use other copies.
				5500	*/
				5501	return 1;
				5502
				5503	map = em->map_lookup;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5504	if (map->type & (BTRFS_BLOCK_GROUP_DUP \| BTRFS_BLOCK_GROUP_RAID1_MASK))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5505	ret = map->num_stripes;
				5506	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
				5507	ret = map->sub_stripes;
				5508	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
				5509	ret = 2;
				5510	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
				5511	/*
				5512	* There could be two corrupted data stripes, we need
				5513	* to loop retry in order to rebuild the correct data.
				5514	*
				5515	* Fail a stripe at a time on every retry except the
				5516	* stripe under reconstruction.
				5517	*/
				5518	ret = map->num_stripes;
				5519	else
				5520	ret = 1;
				5521	free_extent_map(em);
				5522
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5523	down_read(&fs_info->dev_replace.rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5524	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
				5525	fs_info->dev_replace.tgtdev)
				5526	ret++;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5527	up_read(&fs_info->dev_replace.rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5528
				5529	return ret;
				5530	}
				5531
				5532	unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				5533	u64 logical)
				5534	{
				5535	struct extent_map *em;
				5536	struct map_lookup *map;
				5537	unsigned long len = fs_info->sectorsize;
				5538
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5539	em = btrfs_get_chunk_map(fs_info, logical, len);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5540
				5541	if (!WARN_ON(IS_ERR(em))) {
				5542	map = em->map_lookup;
				5543	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
				5544	len = map->stripe_len * nr_data_stripes(map);
				5545	free_extent_map(em);
				5546	}
				5547	return len;
				5548	}
				5549
				5550	int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
				5551	{
				5552	struct extent_map *em;
				5553	struct map_lookup *map;
				5554	int ret = 0;
				5555
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5556	em = btrfs_get_chunk_map(fs_info, logical, len);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5557
				5558	if(!WARN_ON(IS_ERR(em))) {
				5559	map = em->map_lookup;
				5560	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
				5561	ret = 1;
				5562	free_extent_map(em);
				5563	}
				5564	return ret;
				5565	}
				5566
				5567	static int find_live_mirror(struct btrfs_fs_info *fs_info,
				5568	struct map_lookup *map, int first,
				5569	int dev_replace_is_ongoing)
				5570	{
				5571	int i;
				5572	int num_stripes;
				5573	int preferred_mirror;
				5574	int tolerance;
				5575	struct btrfs_device *srcdev;
				5576
				5577	ASSERT((map->type &
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5578	(BTRFS_BLOCK_GROUP_RAID1_MASK \| BTRFS_BLOCK_GROUP_RAID10)));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5579
				5580	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
				5581	num_stripes = map->sub_stripes;
				5582	else
				5583	num_stripes = map->num_stripes;
				5584
				5585	preferred_mirror = first + current->pid % num_stripes;
				5586
				5587	if (dev_replace_is_ongoing &&
				5588	fs_info->dev_replace.cont_reading_from_srcdev_mode ==
				5589	BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
				5590	srcdev = fs_info->dev_replace.srcdev;
				5591	else
				5592	srcdev = NULL;
				5593
				5594	/*
				5595	* try to avoid the drive that is the source drive for a
				5596	* dev-replace procedure, only choose it if no other non-missing
				5597	* mirror is available
				5598	*/
				5599	for (tolerance = 0; tolerance < 2; tolerance++) {
				5600	if (map->stripes[preferred_mirror].dev->bdev &&
				5601	(tolerance \|\| map->stripes[preferred_mirror].dev != srcdev))
				5602	return preferred_mirror;
				5603	for (i = first; i < first + num_stripes; i++) {
				5604	if (map->stripes[i].dev->bdev &&
				5605	(tolerance \|\| map->stripes[i].dev != srcdev))
				5606	return i;
				5607	}
				5608	}
				5609
				5610	/* we couldn't find one that doesn't fail. Just return something
				5611	* and the io error handling code will clean up eventually
				5612	*/
				5613	return preferred_mirror;
				5614	}
				5615
				5616	static inline int parity_smaller(u64 a, u64 b)
				5617	{
				5618	return a > b;
				5619	}
				5620
				5621	/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
				5622	static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
				5623	{
				5624	struct btrfs_bio_stripe s;
				5625	int i;
				5626	u64 l;
				5627	int again = 1;
				5628
				5629	while (again) {
				5630	again = 0;
				5631	for (i = 0; i < num_stripes - 1; i++) {
				5632	if (parity_smaller(bbio->raid_map[i],
				5633	bbio->raid_map[i+1])) {
				5634	s = bbio->stripes[i];
				5635	l = bbio->raid_map[i];
				5636	bbio->stripes[i] = bbio->stripes[i+1];
				5637	bbio->raid_map[i] = bbio->raid_map[i+1];
				5638	bbio->stripes[i+1] = s;
				5639	bbio->raid_map[i+1] = l;
				5640
				5641	again = 1;
				5642	}
				5643	}
				5644	}
				5645	}
				5646
				5647	static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
				5648	{
				5649	struct btrfs_bio *bbio = kzalloc(
				5650	/* the size of the btrfs_bio */
				5651	sizeof(struct btrfs_bio) +
				5652	/* plus the variable array for the stripes */
				5653	sizeof(struct btrfs_bio_stripe) * (total_stripes) +
				5654	/* plus the variable array for the tgt dev */
				5655	sizeof(int) * (real_stripes) +
				5656	/*
				5657	* plus the raid_map, which includes both the tgt dev
				5658	* and the stripes
				5659	*/
				5660	sizeof(u64) * (total_stripes),
				5661	GFP_NOFS\|__GFP_NOFAIL);
				5662
				5663	atomic_set(&bbio->error, 0);
				5664	refcount_set(&bbio->refs, 1);
				5665
				5666	return bbio;
				5667	}
				5668
				5669	void btrfs_get_bbio(struct btrfs_bio *bbio)
				5670	{
				5671	WARN_ON(!refcount_read(&bbio->refs));
				5672	refcount_inc(&bbio->refs);
				5673	}
				5674
				5675	void btrfs_put_bbio(struct btrfs_bio *bbio)
				5676	{
				5677	if (!bbio)
				5678	return;
				5679	if (refcount_dec_and_test(&bbio->refs))
				5680	kfree(bbio);
				5681	}
				5682
				5683	/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
				5684	/*
				5685	* Please note that, discard won't be sent to target device of device
				5686	* replace.
				5687	*/
				5688	static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5689	u64 logical, u64 *length_ret,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5690	struct btrfs_bio **bbio_ret)
				5691	{
				5692	struct extent_map *em;
				5693	struct map_lookup *map;
				5694	struct btrfs_bio *bbio;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5695	u64 length = *length_ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5696	u64 offset;
				5697	u64 stripe_nr;
				5698	u64 stripe_nr_end;
				5699	u64 stripe_end_offset;
				5700	u64 stripe_cnt;
				5701	u64 stripe_len;
				5702	u64 stripe_offset;
				5703	u64 num_stripes;
				5704	u32 stripe_index;
				5705	u32 factor = 0;
				5706	u32 sub_stripes = 0;
				5707	u64 stripes_per_dev = 0;
				5708	u32 remaining_stripes = 0;
				5709	u32 last_stripe = 0;
				5710	int ret = 0;
				5711	int i;
				5712
				5713	/* discard always return a bbio */
				5714	ASSERT(bbio_ret);
				5715
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5716	em = btrfs_get_chunk_map(fs_info, logical, length);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5717	if (IS_ERR(em))
				5718	return PTR_ERR(em);
				5719
				5720	map = em->map_lookup;
				5721	/* we don't discard raid56 yet */
				5722	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				5723	ret = -EOPNOTSUPP;
				5724	goto out;
				5725	}
				5726
				5727	offset = logical - em->start;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5728	length = min_t(u64, em->start + em->len - logical, length);
				5729	*length_ret = length;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5730
				5731	stripe_len = map->stripe_len;
				5732	/*
				5733	* stripe_nr counts the total number of stripes we have to stride
				5734	* to get to this block
				5735	*/
				5736	stripe_nr = div64_u64(offset, stripe_len);
				5737
				5738	/* stripe_offset is the offset of this block in its stripe */
				5739	stripe_offset = offset - stripe_nr * stripe_len;
				5740
				5741	stripe_nr_end = round_up(offset + length, map->stripe_len);
				5742	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
				5743	stripe_cnt = stripe_nr_end - stripe_nr;
				5744	stripe_end_offset = stripe_nr_end * map->stripe_len -
				5745	(offset + length);
				5746	/*
				5747	* after this, stripe_nr is the number of stripes on this
				5748	* device we have to walk to find the data, and stripe_index is
				5749	* the number of our device in the stripe array
				5750	*/
				5751	num_stripes = 1;
				5752	stripe_index = 0;
				5753	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 \|
				5754	BTRFS_BLOCK_GROUP_RAID10)) {
				5755	if (map->type & BTRFS_BLOCK_GROUP_RAID0)
				5756	sub_stripes = 1;
				5757	else
				5758	sub_stripes = map->sub_stripes;
				5759
				5760	factor = map->num_stripes / sub_stripes;
				5761	num_stripes = min_t(u64, map->num_stripes,
				5762	sub_stripes * stripe_cnt);
				5763	stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
				5764	stripe_index *= sub_stripes;
				5765	stripes_per_dev = div_u64_rem(stripe_cnt, factor,
				5766	&remaining_stripes);
				5767	div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
				5768	last_stripe *= sub_stripes;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5769	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK \|
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5770	BTRFS_BLOCK_GROUP_DUP)) {
				5771	num_stripes = map->num_stripes;
				5772	} else {
				5773	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				5774	&stripe_index);
				5775	}
				5776
				5777	bbio = alloc_btrfs_bio(num_stripes, 0);
				5778	if (!bbio) {
				5779	ret = -ENOMEM;
				5780	goto out;
				5781	}
				5782
				5783	for (i = 0; i < num_stripes; i++) {
				5784	bbio->stripes[i].physical =
				5785	map->stripes[stripe_index].physical +
				5786	stripe_offset + stripe_nr * map->stripe_len;
				5787	bbio->stripes[i].dev = map->stripes[stripe_index].dev;
				5788
				5789	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 \|
				5790	BTRFS_BLOCK_GROUP_RAID10)) {
				5791	bbio->stripes[i].length = stripes_per_dev *
				5792	map->stripe_len;
				5793
				5794	if (i / sub_stripes < remaining_stripes)
				5795	bbio->stripes[i].length +=
				5796	map->stripe_len;
				5797
				5798	/*
				5799	* Special for the first stripe and
				5800	* the last stripe:
				5801	*
				5802	* \|-------\|...\|-------\|
				5803	* \|----------\|
				5804	* off end_off
				5805	*/
				5806	if (i < sub_stripes)
				5807	bbio->stripes[i].length -=
				5808	stripe_offset;
				5809
				5810	if (stripe_index >= last_stripe &&
				5811	stripe_index <= (last_stripe +
				5812	sub_stripes - 1))
				5813	bbio->stripes[i].length -=
				5814	stripe_end_offset;
				5815
				5816	if (i == sub_stripes - 1)
				5817	stripe_offset = 0;
				5818	} else {
				5819	bbio->stripes[i].length = length;
				5820	}
				5821
				5822	stripe_index++;
				5823	if (stripe_index == map->num_stripes) {
				5824	stripe_index = 0;
				5825	stripe_nr++;
				5826	}
				5827	}
				5828
				5829	*bbio_ret = bbio;
				5830	bbio->map_type = map->type;
				5831	bbio->num_stripes = num_stripes;
				5832	out:
				5833	free_extent_map(em);
				5834	return ret;
				5835	}
				5836
				5837	/*
				5838	* In dev-replace case, for repair case (that's the only case where the mirror
				5839	* is selected explicitly when calling btrfs_map_block), blocks left of the
				5840	* left cursor can also be read from the target drive.
				5841	*
				5842	* For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
				5843	* array of stripes.
				5844	* For READ, it also needs to be supported using the same mirror number.
				5845	*
				5846	* If the requested block is not left of the left cursor, EIO is returned. This
				5847	* can happen because btrfs_num_copies() returns one more in the dev-replace
				5848	* case.
				5849	*/
				5850	static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
				5851	u64 logical, u64 length,
				5852	u64 srcdev_devid, int *mirror_num,
				5853	u64 *physical)
				5854	{
				5855	struct btrfs_bio *bbio = NULL;
				5856	int num_stripes;
				5857	int index_srcdev = 0;
				5858	int found = 0;
				5859	u64 physical_of_found = 0;
				5860	int i;
				5861	int ret = 0;
				5862
				5863	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				5864	logical, &length, &bbio, 0, 0);
				5865	if (ret) {
				5866	ASSERT(bbio == NULL);
				5867	return ret;
				5868	}
				5869
				5870	num_stripes = bbio->num_stripes;
				5871	if (*mirror_num > num_stripes) {
				5872	/*
				5873	* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
				5874	* that means that the requested area is not left of the left
				5875	* cursor
				5876	*/
				5877	btrfs_put_bbio(bbio);
				5878	return -EIO;
				5879	}
				5880
				5881	/*
				5882	* process the rest of the function using the mirror_num of the source
				5883	* drive. Therefore look it up first. At the end, patch the device
				5884	* pointer to the one of the target drive.
				5885	*/
				5886	for (i = 0; i < num_stripes; i++) {
				5887	if (bbio->stripes[i].dev->devid != srcdev_devid)
				5888	continue;
				5889
				5890	/*
				5891	* In case of DUP, in order to keep it simple, only add the
				5892	* mirror with the lowest physical address
				5893	*/
				5894	if (found &&
				5895	physical_of_found <= bbio->stripes[i].physical)
				5896	continue;
				5897
				5898	index_srcdev = i;
				5899	found = 1;
				5900	physical_of_found = bbio->stripes[i].physical;
				5901	}
				5902
				5903	btrfs_put_bbio(bbio);
				5904
				5905	ASSERT(found);
				5906	if (!found)
				5907	return -EIO;
				5908
				5909	*mirror_num = index_srcdev + 1;
				5910	*physical = physical_of_found;
				5911	return ret;
				5912	}
				5913
				5914	static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				5915	struct btrfs_bio **bbio_ret,
				5916	struct btrfs_dev_replace *dev_replace,
				5917	int num_stripes_ret, int max_errors_ret)
				5918	{
				5919	struct btrfs_bio bbio = bbio_ret;
				5920	u64 srcdev_devid = dev_replace->srcdev->devid;
				5921	int tgtdev_indexes = 0;
				5922	int num_stripes = *num_stripes_ret;
				5923	int max_errors = *max_errors_ret;
				5924	int i;
				5925
				5926	if (op == BTRFS_MAP_WRITE) {
				5927	int index_where_to_add;
				5928
				5929	/*
				5930	* duplicate the write operations while the dev replace
				5931	* procedure is running. Since the copying of the old disk to
				5932	* the new disk takes place at run time while the filesystem is
				5933	* mounted writable, the regular write operations to the old
				5934	* disk have to be duplicated to go to the new disk as well.
				5935	*
				5936	* Note that device->missing is handled by the caller, and that
				5937	* the write to the old disk is already set up in the stripes
				5938	* array.
				5939	*/
				5940	index_where_to_add = num_stripes;
				5941	for (i = 0; i < num_stripes; i++) {
				5942	if (bbio->stripes[i].dev->devid == srcdev_devid) {
				5943	/* write to new disk, too */
				5944	struct btrfs_bio_stripe *new =
				5945	bbio->stripes + index_where_to_add;
				5946	struct btrfs_bio_stripe *old =
				5947	bbio->stripes + i;
				5948
				5949	new->physical = old->physical;
				5950	new->length = old->length;
				5951	new->dev = dev_replace->tgtdev;
				5952	bbio->tgtdev_map[i] = index_where_to_add;
				5953	index_where_to_add++;
				5954	max_errors++;
				5955	tgtdev_indexes++;
				5956	}
				5957	}
				5958	num_stripes = index_where_to_add;
				5959	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
				5960	int index_srcdev = 0;
				5961	int found = 0;
				5962	u64 physical_of_found = 0;
				5963
				5964	/*
				5965	* During the dev-replace procedure, the target drive can also
				5966	* be used to read data in case it is needed to repair a corrupt
				5967	* block elsewhere. This is possible if the requested area is
				5968	* left of the left cursor. In this area, the target drive is a
				5969	* full copy of the source drive.
				5970	*/
				5971	for (i = 0; i < num_stripes; i++) {
				5972	if (bbio->stripes[i].dev->devid == srcdev_devid) {
				5973	/*
				5974	* In case of DUP, in order to keep it simple,
				5975	* only add the mirror with the lowest physical
				5976	* address
				5977	*/
				5978	if (found &&
				5979	physical_of_found <=
				5980	bbio->stripes[i].physical)
				5981	continue;
				5982	index_srcdev = i;
				5983	found = 1;
				5984	physical_of_found = bbio->stripes[i].physical;
				5985	}
				5986	}
				5987	if (found) {
				5988	struct btrfs_bio_stripe *tgtdev_stripe =
				5989	bbio->stripes + num_stripes;
				5990
				5991	tgtdev_stripe->physical = physical_of_found;
				5992	tgtdev_stripe->length =
				5993	bbio->stripes[index_srcdev].length;
				5994	tgtdev_stripe->dev = dev_replace->tgtdev;
				5995	bbio->tgtdev_map[index_srcdev] = num_stripes;
				5996
				5997	tgtdev_indexes++;
				5998	num_stripes++;
				5999	}
				6000	}
				6001
				6002	*num_stripes_ret = num_stripes;
				6003	*max_errors_ret = max_errors;
				6004	bbio->num_tgtdevs = tgtdev_indexes;
				6005	*bbio_ret = bbio;
				6006	}
				6007
				6008	static bool need_full_stripe(enum btrfs_map_op op)
				6009	{
				6010	return (op == BTRFS_MAP_WRITE \|\| op == BTRFS_MAP_GET_READ_MIRRORS);
				6011	}
				6012
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6013	/*
				6014	* btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
				6015	* tuple. This information is used to calculate how big a
				6016	* particular bio can get before it straddles a stripe.
				6017	*
				6018	* @fs_info - the filesystem
				6019	* @logical - address that we want to figure out the geometry of
				6020	* @len - the length of IO we are going to perform, starting at @logical
				6021	* @op - type of operation - write or read
				6022	* @io_geom - pointer used to return values
				6023	*
				6024	* Returns < 0 in case a chunk for the given logical address cannot be found,
				6025	* usually shouldn't happen unless @logical is corrupted, 0 otherwise.
				6026	*/
				6027	int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
				6028	u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
				6029	{
				6030	struct extent_map *em;
				6031	struct map_lookup *map;
				6032	u64 offset;
				6033	u64 stripe_offset;
				6034	u64 stripe_nr;
				6035	u64 stripe_len;
				6036	u64 raid56_full_stripe_start = (u64)-1;
				6037	int data_stripes;
				6038	int ret = 0;
				6039
				6040	ASSERT(op != BTRFS_MAP_DISCARD);
				6041
				6042	em = btrfs_get_chunk_map(fs_info, logical, len);
				6043	if (IS_ERR(em))
				6044	return PTR_ERR(em);
				6045
				6046	map = em->map_lookup;
				6047	/* Offset of this logical address in the chunk */
				6048	offset = logical - em->start;
				6049	/* Len of a stripe in a chunk */
				6050	stripe_len = map->stripe_len;
				6051	/* Stripe wher this block falls in */
				6052	stripe_nr = div64_u64(offset, stripe_len);
				6053	/* Offset of stripe in the chunk */
				6054	stripe_offset = stripe_nr * stripe_len;
				6055	if (offset < stripe_offset) {
				6056	btrfs_crit(fs_info,
				6057	"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
				6058	stripe_offset, offset, em->start, logical, stripe_len);
				6059	ret = -EINVAL;
				6060	goto out;
				6061	}
				6062
				6063	/* stripe_offset is the offset of this block in its stripe */
				6064	stripe_offset = offset - stripe_offset;
				6065	data_stripes = nr_data_stripes(map);
				6066
				6067	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
				6068	u64 max_len = stripe_len - stripe_offset;
				6069
				6070	/*
				6071	* In case of raid56, we need to know the stripe aligned start
				6072	*/
				6073	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				6074	unsigned long full_stripe_len = stripe_len * data_stripes;
				6075	raid56_full_stripe_start = offset;
				6076
				6077	/*
				6078	* Allow a write of a full stripe, but make sure we
				6079	* don't allow straddling of stripes
				6080	*/
				6081	raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
				6082	full_stripe_len);
				6083	raid56_full_stripe_start *= full_stripe_len;
				6084
				6085	/*
				6086	* For writes to RAID[56], allow a full stripeset across
				6087	* all disks. For other RAID types and for RAID[56]
				6088	* reads, just allow a single stripe (on a single disk).
				6089	*/
				6090	if (op == BTRFS_MAP_WRITE) {
				6091	max_len = stripe_len * data_stripes -
				6092	(offset - raid56_full_stripe_start);
				6093	}
				6094	}
				6095	len = min_t(u64, em->len - offset, max_len);
				6096	} else {
				6097	len = em->len - offset;
				6098	}
				6099
				6100	io_geom->len = len;
				6101	io_geom->offset = offset;
				6102	io_geom->stripe_len = stripe_len;
				6103	io_geom->stripe_nr = stripe_nr;
				6104	io_geom->stripe_offset = stripe_offset;
				6105	io_geom->raid56_stripe_offset = raid56_full_stripe_start;
				6106
				6107	out:
				6108	/* once for us */
				6109	free_extent_map(em);
				6110	return ret;
				6111	}
				6112
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6113	static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
				6114	enum btrfs_map_op op,
				6115	u64 logical, u64 *length,
				6116	struct btrfs_bio **bbio_ret,
				6117	int mirror_num, int need_raid_map)
				6118	{
				6119	struct extent_map *em;
				6120	struct map_lookup *map;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6121	u64 stripe_offset;
				6122	u64 stripe_nr;
				6123	u64 stripe_len;
				6124	u32 stripe_index;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6125	int data_stripes;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6126	int i;
				6127	int ret = 0;
				6128	int num_stripes;
				6129	int max_errors = 0;
				6130	int tgtdev_indexes = 0;
				6131	struct btrfs_bio *bbio = NULL;
				6132	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
				6133	int dev_replace_is_ongoing = 0;
				6134	int num_alloc_stripes;
				6135	int patch_the_first_stripe_for_dev_replace = 0;
				6136	u64 physical_to_patch_in_first_stripe = 0;
				6137	u64 raid56_full_stripe_start = (u64)-1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6138	struct btrfs_io_geometry geom;
				6139
				6140	ASSERT(bbio_ret);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6141
				6142	if (op == BTRFS_MAP_DISCARD)
				6143	return __btrfs_map_block_for_discard(fs_info, logical,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6144	length, bbio_ret);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6145
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6146	ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
				6147	if (ret < 0)
				6148	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6149
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6150	em = btrfs_get_chunk_map(fs_info, logical, *length);
				6151	ASSERT(!IS_ERR(em));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6152	map = em->map_lookup;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6153
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6154	*length = geom.len;
				6155	stripe_len = geom.stripe_len;
				6156	stripe_nr = geom.stripe_nr;
				6157	stripe_offset = geom.stripe_offset;
				6158	raid56_full_stripe_start = geom.raid56_stripe_offset;
				6159	data_stripes = nr_data_stripes(map);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6160
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6161	down_read(&dev_replace->rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6162	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6163	/*
				6164	* Hold the semaphore for read during the whole operation, write is
				6165	* requested at commit time but must wait.
				6166	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6167	if (!dev_replace_is_ongoing)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6168	up_read(&dev_replace->rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6169
				6170	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
				6171	!need_full_stripe(op) && dev_replace->tgtdev != NULL) {
				6172	ret = get_extra_mirror_from_replace(fs_info, logical, *length,
				6173	dev_replace->srcdev->devid,
				6174	&mirror_num,
				6175	&physical_to_patch_in_first_stripe);
				6176	if (ret)
				6177	goto out;
				6178	else
				6179	patch_the_first_stripe_for_dev_replace = 1;
				6180	} else if (mirror_num > map->num_stripes) {
				6181	mirror_num = 0;
				6182	}
				6183
				6184	num_stripes = 1;
				6185	stripe_index = 0;
				6186	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
				6187	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				6188	&stripe_index);
				6189	if (!need_full_stripe(op))
				6190	mirror_num = 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6191	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6192	if (need_full_stripe(op))
				6193	num_stripes = map->num_stripes;
				6194	else if (mirror_num)
				6195	stripe_index = mirror_num - 1;
				6196	else {
				6197	stripe_index = find_live_mirror(fs_info, map, 0,
				6198	dev_replace_is_ongoing);
				6199	mirror_num = stripe_index + 1;
				6200	}
				6201
				6202	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
				6203	if (need_full_stripe(op)) {
				6204	num_stripes = map->num_stripes;
				6205	} else if (mirror_num) {
				6206	stripe_index = mirror_num - 1;
				6207	} else {
				6208	mirror_num = 1;
				6209	}
				6210
				6211	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
				6212	u32 factor = map->num_stripes / map->sub_stripes;
				6213
				6214	stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
				6215	stripe_index *= map->sub_stripes;
				6216
				6217	if (need_full_stripe(op))
				6218	num_stripes = map->sub_stripes;
				6219	else if (mirror_num)
				6220	stripe_index += mirror_num - 1;
				6221	else {
				6222	int old_stripe_index = stripe_index;
				6223	stripe_index = find_live_mirror(fs_info, map,
				6224	stripe_index,
				6225	dev_replace_is_ongoing);
				6226	mirror_num = stripe_index - old_stripe_index + 1;
				6227	}
				6228
				6229	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				6230	if (need_raid_map && (need_full_stripe(op) \|\| mirror_num > 1)) {
				6231	/* push stripe_nr back to the start of the full stripe */
				6232	stripe_nr = div64_u64(raid56_full_stripe_start,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6233	stripe_len * data_stripes);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6234
				6235	/* RAID[56] write or recovery. Return all stripes */
				6236	num_stripes = map->num_stripes;
				6237	max_errors = nr_parity_stripes(map);
				6238
				6239	*length = map->stripe_len;
				6240	stripe_index = 0;
				6241	stripe_offset = 0;
				6242	} else {
				6243	/*
				6244	* Mirror #0 or #1 means the original data block.
				6245	* Mirror #2 is RAID5 parity block.
				6246	* Mirror #3 is RAID6 Q block.
				6247	*/
				6248	stripe_nr = div_u64_rem(stripe_nr,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6249	data_stripes, &stripe_index);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6250	if (mirror_num > 1)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6251	stripe_index = data_stripes + mirror_num - 2;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6252
				6253	/* We distribute the parity blocks across stripes */
				6254	div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
				6255	&stripe_index);
				6256	if (!need_full_stripe(op) && mirror_num <= 1)
				6257	mirror_num = 1;
				6258	}
				6259	} else {
				6260	/*
				6261	* after this, stripe_nr is the number of stripes on this
				6262	* device we have to walk to find the data, and stripe_index is
				6263	* the number of our device in the stripe array
				6264	*/
				6265	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				6266	&stripe_index);
				6267	mirror_num = stripe_index + 1;
				6268	}
				6269	if (stripe_index >= map->num_stripes) {
				6270	btrfs_crit(fs_info,
				6271	"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
				6272	stripe_index, map->num_stripes);
				6273	ret = -EINVAL;
				6274	goto out;
				6275	}
				6276
				6277	num_alloc_stripes = num_stripes;
				6278	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
				6279	if (op == BTRFS_MAP_WRITE)
				6280	num_alloc_stripes <<= 1;
				6281	if (op == BTRFS_MAP_GET_READ_MIRRORS)
				6282	num_alloc_stripes++;
				6283	tgtdev_indexes = num_stripes;
				6284	}
				6285
				6286	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
				6287	if (!bbio) {
				6288	ret = -ENOMEM;
				6289	goto out;
				6290	}
				6291	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
				6292	bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
				6293
				6294	/* build raid_map */
				6295	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
				6296	(need_full_stripe(op) \|\| mirror_num > 1)) {
				6297	u64 tmp;
				6298	unsigned rot;
				6299
				6300	bbio->raid_map = (u64 )((void )bbio->stripes +
				6301	sizeof(struct btrfs_bio_stripe) *
				6302	num_alloc_stripes +
				6303	sizeof(int) * tgtdev_indexes);
				6304
				6305	/* Work out the disk rotation on this stripe-set */
				6306	div_u64_rem(stripe_nr, num_stripes, &rot);
				6307
				6308	/* Fill in the logical address of each stripe */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6309	tmp = stripe_nr * data_stripes;
				6310	for (i = 0; i < data_stripes; i++)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6311	bbio->raid_map[(i+rot) % num_stripes] =
				6312	em->start + (tmp + i) * map->stripe_len;
				6313
				6314	bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
				6315	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
				6316	bbio->raid_map[(i+rot+1) % num_stripes] =
				6317	RAID6_Q_STRIPE;
				6318	}
				6319
				6320
				6321	for (i = 0; i < num_stripes; i++) {
				6322	bbio->stripes[i].physical =
				6323	map->stripes[stripe_index].physical +
				6324	stripe_offset +
				6325	stripe_nr * map->stripe_len;
				6326	bbio->stripes[i].dev =
				6327	map->stripes[stripe_index].dev;
				6328	stripe_index++;
				6329	}
				6330
				6331	if (need_full_stripe(op))
				6332	max_errors = btrfs_chunk_max_errors(map);
				6333
				6334	if (bbio->raid_map)
				6335	sort_parity_stripes(bbio, num_stripes);
				6336
				6337	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
				6338	need_full_stripe(op)) {
				6339	handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
				6340	&max_errors);
				6341	}
				6342
				6343	*bbio_ret = bbio;
				6344	bbio->map_type = map->type;
				6345	bbio->num_stripes = num_stripes;
				6346	bbio->max_errors = max_errors;
				6347	bbio->mirror_num = mirror_num;
				6348
				6349	/*
				6350	* this is the case that REQ_READ && dev_replace_is_ongoing &&
				6351	* mirror_num == num_stripes + 1 && dev_replace target drive is
				6352	* available as a mirror
				6353	*/
				6354	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
				6355	WARN_ON(num_stripes > 1);
				6356	bbio->stripes[0].dev = dev_replace->tgtdev;
				6357	bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
				6358	bbio->mirror_num = map->num_stripes + 1;
				6359	}
				6360	out:
				6361	if (dev_replace_is_ongoing) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6362	lockdep_assert_held(&dev_replace->rwsem);
				6363	/* Unlock and let waiting writers proceed */
				6364	up_read(&dev_replace->rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6365	}
				6366	free_extent_map(em);
				6367	return ret;
				6368	}
				6369
				6370	int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
				6371	u64 logical, u64 *length,
				6372	struct btrfs_bio **bbio_ret, int mirror_num)
				6373	{
				6374	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
				6375	mirror_num, 0);
				6376	}
				6377
				6378	/* For Scrub/replace */
				6379	int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
				6380	u64 logical, u64 *length,
				6381	struct btrfs_bio **bbio_ret)
				6382	{
				6383	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
				6384	}
				6385
				6386	int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
				6387	u64 physical, u64 *logical, int naddrs, int *stripe_len)
				6388	{
				6389	struct extent_map *em;
				6390	struct map_lookup *map;
				6391	u64 *buf;
				6392	u64 bytenr;
				6393	u64 length;
				6394	u64 stripe_nr;
				6395	u64 rmap_len;
				6396	int i, j, nr = 0;
				6397
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6398	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6399	if (IS_ERR(em))
				6400	return -EIO;
				6401
				6402	map = em->map_lookup;
				6403	length = em->len;
				6404	rmap_len = map->stripe_len;
				6405
				6406	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
				6407	length = div_u64(length, map->num_stripes / map->sub_stripes);
				6408	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
				6409	length = div_u64(length, map->num_stripes);
				6410	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				6411	length = div_u64(length, nr_data_stripes(map));
				6412	rmap_len = map->stripe_len * nr_data_stripes(map);
				6413	}
				6414
				6415	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
				6416	BUG_ON(!buf); /* -ENOMEM */
				6417
				6418	for (i = 0; i < map->num_stripes; i++) {
				6419	if (map->stripes[i].physical > physical \|\|
				6420	map->stripes[i].physical + length <= physical)
				6421	continue;
				6422
				6423	stripe_nr = physical - map->stripes[i].physical;
				6424	stripe_nr = div64_u64(stripe_nr, map->stripe_len);
				6425
				6426	if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
				6427	stripe_nr = stripe_nr * map->num_stripes + i;
				6428	stripe_nr = div_u64(stripe_nr, map->sub_stripes);
				6429	} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
				6430	stripe_nr = stripe_nr * map->num_stripes + i;
				6431	} /* else if RAID[56], multiply by nr_data_stripes().
				6432	* Alternatively, just use rmap_len below instead of
				6433	* map->stripe_len */
				6434
				6435	bytenr = chunk_start + stripe_nr * rmap_len;
				6436	WARN_ON(nr >= map->num_stripes);
				6437	for (j = 0; j < nr; j++) {
				6438	if (buf[j] == bytenr)
				6439	break;
				6440	}
				6441	if (j == nr) {
				6442	WARN_ON(nr >= map->num_stripes);
				6443	buf[nr++] = bytenr;
				6444	}
				6445	}
				6446
				6447	*logical = buf;
				6448	*naddrs = nr;
				6449	*stripe_len = rmap_len;
				6450
				6451	free_extent_map(em);
				6452	return 0;
				6453	}
				6454
				6455	static inline void btrfs_end_bbio(struct btrfs_bio bbio, struct bio bio)
				6456	{
				6457	bio->bi_private = bbio->private;
				6458	bio->bi_end_io = bbio->end_io;
				6459	bio_endio(bio);
				6460
				6461	btrfs_put_bbio(bbio);
				6462	}
				6463
				6464	static void btrfs_end_bio(struct bio *bio)
				6465	{
				6466	struct btrfs_bio *bbio = bio->bi_private;
				6467	int is_orig_bio = 0;
				6468
				6469	if (bio->bi_status) {
				6470	atomic_inc(&bbio->error);
				6471	if (bio->bi_status == BLK_STS_IOERR \|\|
				6472	bio->bi_status == BLK_STS_TARGET) {
				6473	unsigned int stripe_index =
				6474	btrfs_io_bio(bio)->stripe_index;
				6475	struct btrfs_device *dev;
				6476
				6477	BUG_ON(stripe_index >= bbio->num_stripes);
				6478	dev = bbio->stripes[stripe_index].dev;
				6479	if (dev->bdev) {
				6480	if (bio_op(bio) == REQ_OP_WRITE)
				6481	btrfs_dev_stat_inc_and_print(dev,
				6482	BTRFS_DEV_STAT_WRITE_ERRS);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6483	else if (!(bio->bi_opf & REQ_RAHEAD))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6484	btrfs_dev_stat_inc_and_print(dev,
				6485	BTRFS_DEV_STAT_READ_ERRS);
				6486	if (bio->bi_opf & REQ_PREFLUSH)
				6487	btrfs_dev_stat_inc_and_print(dev,
				6488	BTRFS_DEV_STAT_FLUSH_ERRS);
				6489	}
				6490	}
				6491	}
				6492
				6493	if (bio == bbio->orig_bio)
				6494	is_orig_bio = 1;
				6495
				6496	btrfs_bio_counter_dec(bbio->fs_info);
				6497
				6498	if (atomic_dec_and_test(&bbio->stripes_pending)) {
				6499	if (!is_orig_bio) {
				6500	bio_put(bio);
				6501	bio = bbio->orig_bio;
				6502	}
				6503
				6504	btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
				6505	/* only send an error to the higher layers if it is
				6506	* beyond the tolerance of the btrfs bio
				6507	*/
				6508	if (atomic_read(&bbio->error) > bbio->max_errors) {
				6509	bio->bi_status = BLK_STS_IOERR;
				6510	} else {
				6511	/*
				6512	* this bio is actually up to date, we didn't
				6513	* go over the max number of errors
				6514	*/
				6515	bio->bi_status = BLK_STS_OK;
				6516	}
				6517
				6518	btrfs_end_bbio(bbio, bio);
				6519	} else if (!is_orig_bio) {
				6520	bio_put(bio);
				6521	}
				6522	}
				6523
				6524	/*
				6525	* see run_scheduled_bios for a description of why bios are collected for
				6526	* async submit.
				6527	*
				6528	* This will add one bio to the pending list for a device and make sure
				6529	* the work struct is scheduled.
				6530	*/
				6531	static noinline void btrfs_schedule_bio(struct btrfs_device *device,
				6532	struct bio *bio)
				6533	{
				6534	struct btrfs_fs_info *fs_info = device->fs_info;
				6535	int should_queue = 1;
				6536	struct btrfs_pending_bios *pending_bios;
				6537
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6538	/* don't bother with additional async steps for reads, right now */
				6539	if (bio_op(bio) == REQ_OP_READ) {
				6540	btrfsic_submit_bio(bio);
				6541	return;
				6542	}
				6543
				6544	WARN_ON(bio->bi_next);
				6545	bio->bi_next = NULL;
				6546
				6547	spin_lock(&device->io_lock);
				6548	if (op_is_sync(bio->bi_opf))
				6549	pending_bios = &device->pending_sync_bios;
				6550	else
				6551	pending_bios = &device->pending_bios;
				6552
				6553	if (pending_bios->tail)
				6554	pending_bios->tail->bi_next = bio;
				6555
				6556	pending_bios->tail = bio;
				6557	if (!pending_bios->head)
				6558	pending_bios->head = bio;
				6559	if (device->running_pending)
				6560	should_queue = 0;
				6561
				6562	spin_unlock(&device->io_lock);
				6563
				6564	if (should_queue)
				6565	btrfs_queue_work(fs_info->submit_workers, &device->work);
				6566	}
				6567
				6568	static void submit_stripe_bio(struct btrfs_bio bbio, struct bio bio,
				6569	u64 physical, int dev_nr, int async)
				6570	{
				6571	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
				6572	struct btrfs_fs_info *fs_info = bbio->fs_info;
				6573
				6574	bio->bi_private = bbio;
				6575	btrfs_io_bio(bio)->stripe_index = dev_nr;
				6576	bio->bi_end_io = btrfs_end_bio;
				6577	bio->bi_iter.bi_sector = physical >> 9;
				6578	btrfs_debug_in_rcu(fs_info,
				6579	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
				6580	bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
				6581	(u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
				6582	bio->bi_iter.bi_size);
				6583	bio_set_dev(bio, dev->bdev);
				6584
				6585	btrfs_bio_counter_inc_noblocked(fs_info);
				6586
				6587	if (async)
				6588	btrfs_schedule_bio(dev, bio);
				6589	else
				6590	btrfsic_submit_bio(bio);
				6591	}
				6592
				6593	static void bbio_error(struct btrfs_bio bbio, struct bio bio, u64 logical)
				6594	{
				6595	atomic_inc(&bbio->error);
				6596	if (atomic_dec_and_test(&bbio->stripes_pending)) {
				6597	/* Should be the original bio. */
				6598	WARN_ON(bio != bbio->orig_bio);
				6599
				6600	btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
				6601	bio->bi_iter.bi_sector = logical >> 9;
				6602	if (atomic_read(&bbio->error) > bbio->max_errors)
				6603	bio->bi_status = BLK_STS_IOERR;
				6604	else
				6605	bio->bi_status = BLK_STS_OK;
				6606	btrfs_end_bbio(bbio, bio);
				6607	}
				6608	}
				6609
				6610	blk_status_t btrfs_map_bio(struct btrfs_fs_info fs_info, struct bio bio,
				6611	int mirror_num, int async_submit)
				6612	{
				6613	struct btrfs_device *dev;
				6614	struct bio *first_bio = bio;
				6615	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
				6616	u64 length = 0;
				6617	u64 map_length;
				6618	int ret;
				6619	int dev_nr;
				6620	int total_devs;
				6621	struct btrfs_bio *bbio = NULL;
				6622
				6623	length = bio->bi_iter.bi_size;
				6624	map_length = length;
				6625
				6626	btrfs_bio_counter_inc_blocked(fs_info);
				6627	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
				6628	&map_length, &bbio, mirror_num, 1);
				6629	if (ret) {
				6630	btrfs_bio_counter_dec(fs_info);
				6631	return errno_to_blk_status(ret);
				6632	}
				6633
				6634	total_devs = bbio->num_stripes;
				6635	bbio->orig_bio = first_bio;
				6636	bbio->private = first_bio->bi_private;
				6637	bbio->end_io = first_bio->bi_end_io;
				6638	bbio->fs_info = fs_info;
				6639	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
				6640
				6641	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
				6642	((bio_op(bio) == REQ_OP_WRITE) \|\| (mirror_num > 1))) {
				6643	/* In this case, map_length has been set to the length of
				6644	a single stripe; not the whole write */
				6645	if (bio_op(bio) == REQ_OP_WRITE) {
				6646	ret = raid56_parity_write(fs_info, bio, bbio,
				6647	map_length);
				6648	} else {
				6649	ret = raid56_parity_recover(fs_info, bio, bbio,
				6650	map_length, mirror_num, 1);
				6651	}
				6652
				6653	btrfs_bio_counter_dec(fs_info);
				6654	return errno_to_blk_status(ret);
				6655	}
				6656
				6657	if (map_length < length) {
				6658	btrfs_crit(fs_info,
				6659	"mapping failed logical %llu bio len %llu len %llu",
				6660	logical, length, map_length);
				6661	BUG();
				6662	}
				6663
				6664	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
				6665	dev = bbio->stripes[dev_nr].dev;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6666	if (!dev \|\| !dev->bdev \|\| test_bit(BTRFS_DEV_STATE_MISSING,
				6667	&dev->dev_state) \|\|
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6668	(bio_op(first_bio) == REQ_OP_WRITE &&
				6669	!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
				6670	bbio_error(bbio, first_bio, logical);
				6671	continue;
				6672	}
				6673
				6674	if (dev_nr < total_devs - 1)
				6675	bio = btrfs_bio_clone(first_bio);
				6676	else
				6677	bio = first_bio;
				6678
				6679	submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
				6680	dev_nr, async_submit);
				6681	}
				6682	btrfs_bio_counter_dec(fs_info);
				6683	return BLK_STS_OK;
				6684	}
				6685
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6686	/*
				6687	* Find a device specified by @devid or @uuid in the list of @fs_devices, or
				6688	* return NULL.
				6689	*
				6690	* If devid and uuid are both specified, the match must be exact, otherwise
				6691	* only devid is used.
				6692	*
				6693	* If @seed is true, traverse through the seed devices.
				6694	*/
				6695	struct btrfs_device btrfs_find_device(struct btrfs_fs_devices fs_devices,
				6696	u64 devid, u8 uuid, u8 fsid,
				6697	bool seed)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6698	{
				6699	struct btrfs_device *device;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6700
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6701	while (fs_devices) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6702	if (!fsid \|\|
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6703	!memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
				6704	list_for_each_entry(device, &fs_devices->devices,
				6705	dev_list) {
				6706	if (device->devid == devid &&
				6707	(!uuid \|\| memcmp(device->uuid, uuid,
				6708	BTRFS_UUID_SIZE) == 0))
				6709	return device;
				6710	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6711	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6712	if (seed)
				6713	fs_devices = fs_devices->seed;
				6714	else
				6715	return NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6716	}
				6717	return NULL;
				6718	}
				6719
				6720	static struct btrfs_device add_missing_dev(struct btrfs_fs_devices fs_devices,
				6721	u64 devid, u8 *dev_uuid)
				6722	{
				6723	struct btrfs_device *device;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6724	unsigned int nofs_flag;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6725
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6726	/*
				6727	* We call this under the chunk_mutex, so we want to use NOFS for this
				6728	* allocation, however we don't want to change btrfs_alloc_device() to
				6729	* always do NOFS because we use it in a lot of other GFP_KERNEL safe
				6730	* places.
				6731	*/
				6732	nofs_flag = memalloc_nofs_save();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6733	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6734	memalloc_nofs_restore(nofs_flag);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6735	if (IS_ERR(device))
				6736	return device;
				6737
				6738	list_add(&device->dev_list, &fs_devices->devices);
				6739	device->fs_devices = fs_devices;
				6740	fs_devices->num_devices++;
				6741
				6742	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
				6743	fs_devices->missing_devices++;
				6744
				6745	return device;
				6746	}
				6747
				6748	/**
				6749	* btrfs_alloc_device - allocate struct btrfs_device
				6750	* @fs_info: used only for generating a new devid, can be NULL if
				6751	* devid is provided (i.e. @devid != NULL).
				6752	* @devid: a pointer to devid for this device. If NULL a new devid
				6753	* is generated.
				6754	* @uuid: a pointer to UUID for this device. If NULL a new UUID
				6755	* is generated.
				6756	*
				6757	* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
				6758	* on error. Returned struct is not linked onto any lists and must be
				6759	* destroyed with btrfs_free_device.
				6760	*/
				6761	struct btrfs_device btrfs_alloc_device(struct btrfs_fs_info fs_info,
				6762	const u64 *devid,
				6763	const u8 *uuid)
				6764	{
				6765	struct btrfs_device *dev;
				6766	u64 tmp;
				6767
				6768	if (WARN_ON(!devid && !fs_info))
				6769	return ERR_PTR(-EINVAL);
				6770
				6771	dev = __alloc_device();
				6772	if (IS_ERR(dev))
				6773	return dev;
				6774
				6775	if (devid)
				6776	tmp = *devid;
				6777	else {
				6778	int ret;
				6779
				6780	ret = find_next_devid(fs_info, &tmp);
				6781	if (ret) {
				6782	btrfs_free_device(dev);
				6783	return ERR_PTR(ret);
				6784	}
				6785	}
				6786	dev->devid = tmp;
				6787
				6788	if (uuid)
				6789	memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
				6790	else
				6791	generate_random_uuid(dev->uuid);
				6792
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6793	btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6794
				6795	return dev;
				6796	}
				6797
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6798	static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
				6799	u64 devid, u8 *uuid, bool error)
				6800	{
				6801	if (error)
				6802	btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
				6803	devid, uuid);
				6804	else
				6805	btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
				6806	devid, uuid);
				6807	}
				6808
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6809	static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
				6810	{
				6811	int index = btrfs_bg_flags_to_raid_index(type);
				6812	int ncopies = btrfs_raid_array[index].ncopies;
				6813	int data_stripes;
				6814
				6815	switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
				6816	case BTRFS_BLOCK_GROUP_RAID5:
				6817	data_stripes = num_stripes - 1;
				6818	break;
				6819	case BTRFS_BLOCK_GROUP_RAID6:
				6820	data_stripes = num_stripes - 2;
				6821	break;
				6822	default:
				6823	data_stripes = num_stripes / ncopies;
				6824	break;
				6825	}
				6826	return div_u64(chunk_len, data_stripes);
				6827	}
				6828
				6829	static int read_one_chunk(struct btrfs_key key, struct extent_buffer leaf,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6830	struct btrfs_chunk *chunk)
				6831	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6832	struct btrfs_fs_info *fs_info = leaf->fs_info;
				6833	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6834	struct map_lookup *map;
				6835	struct extent_map *em;
				6836	u64 logical;
				6837	u64 length;
				6838	u64 devid;
				6839	u8 uuid[BTRFS_UUID_SIZE];
				6840	int num_stripes;
				6841	int ret;
				6842	int i;
				6843
				6844	logical = key->offset;
				6845	length = btrfs_chunk_length(leaf, chunk);
				6846	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				6847
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6848	/*
				6849	* Only need to verify chunk item if we're reading from sys chunk array,
				6850	* as chunk item in tree block is already verified by tree-checker.
				6851	*/
				6852	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
				6853	ret = btrfs_check_chunk_valid(leaf, chunk, logical);
				6854	if (ret)
				6855	return ret;
				6856	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6857
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6858	read_lock(&map_tree->lock);
				6859	em = lookup_extent_mapping(map_tree, logical, 1);
				6860	read_unlock(&map_tree->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6861
				6862	/* already mapped? */
				6863	if (em && em->start <= logical && em->start + em->len > logical) {
				6864	free_extent_map(em);
				6865	return 0;
				6866	} else if (em) {
				6867	free_extent_map(em);
				6868	}
				6869
				6870	em = alloc_extent_map();
				6871	if (!em)
				6872	return -ENOMEM;
				6873	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
				6874	if (!map) {
				6875	free_extent_map(em);
				6876	return -ENOMEM;
				6877	}
				6878
				6879	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
				6880	em->map_lookup = map;
				6881	em->start = logical;
				6882	em->len = length;
				6883	em->orig_start = 0;
				6884	em->block_start = 0;
				6885	em->block_len = em->len;
				6886
				6887	map->num_stripes = num_stripes;
				6888	map->io_width = btrfs_chunk_io_width(leaf, chunk);
				6889	map->io_align = btrfs_chunk_io_align(leaf, chunk);
				6890	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
				6891	map->type = btrfs_chunk_type(leaf, chunk);
				6892	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
				6893	map->verified_stripes = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6894	em->orig_block_len = calc_stripe_length(map->type, em->len,
				6895	map->num_stripes);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6896	for (i = 0; i < num_stripes; i++) {
				6897	map->stripes[i].physical =
				6898	btrfs_stripe_offset_nr(leaf, chunk, i);
				6899	devid = btrfs_stripe_devid_nr(leaf, chunk, i);
				6900	read_extent_buffer(leaf, uuid, (unsigned long)
				6901	btrfs_stripe_dev_uuid_nr(chunk, i),
				6902	BTRFS_UUID_SIZE);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6903	map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
				6904	devid, uuid, NULL, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6905	if (!map->stripes[i].dev &&
				6906	!btrfs_test_opt(fs_info, DEGRADED)) {
				6907	free_extent_map(em);
				6908	btrfs_report_missing_device(fs_info, devid, uuid, true);
				6909	return -ENOENT;
				6910	}
				6911	if (!map->stripes[i].dev) {
				6912	map->stripes[i].dev =
				6913	add_missing_dev(fs_info->fs_devices, devid,
				6914	uuid);
				6915	if (IS_ERR(map->stripes[i].dev)) {
				6916	free_extent_map(em);
				6917	btrfs_err(fs_info,
				6918	"failed to init missing dev %llu: %ld",
				6919	devid, PTR_ERR(map->stripes[i].dev));
				6920	return PTR_ERR(map->stripes[i].dev);
				6921	}
				6922	btrfs_report_missing_device(fs_info, devid, uuid, false);
				6923	}
				6924	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				6925	&(map->stripes[i].dev->dev_state));
				6926
				6927	}
				6928
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6929	write_lock(&map_tree->lock);
				6930	ret = add_extent_mapping(map_tree, em, 0);
				6931	write_unlock(&map_tree->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6932	if (ret < 0) {
				6933	btrfs_err(fs_info,
				6934	"failed to add chunk map, start=%llu len=%llu: %d",
				6935	em->start, em->len, ret);
				6936	}
				6937	free_extent_map(em);
				6938
				6939	return ret;
				6940	}
				6941
				6942	static void fill_device_from_item(struct extent_buffer *leaf,
				6943	struct btrfs_dev_item *dev_item,
				6944	struct btrfs_device *device)
				6945	{
				6946	unsigned long ptr;
				6947
				6948	device->devid = btrfs_device_id(leaf, dev_item);
				6949	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
				6950	device->total_bytes = device->disk_total_bytes;
				6951	device->commit_total_bytes = device->disk_total_bytes;
				6952	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
				6953	device->commit_bytes_used = device->bytes_used;
				6954	device->type = btrfs_device_type(leaf, dev_item);
				6955	device->io_align = btrfs_device_io_align(leaf, dev_item);
				6956	device->io_width = btrfs_device_io_width(leaf, dev_item);
				6957	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
				6958	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
				6959	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
				6960
				6961	ptr = btrfs_device_uuid(dev_item);
				6962	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
				6963	}
				6964
				6965	static struct btrfs_fs_devices open_seed_devices(struct btrfs_fs_info fs_info,
				6966	u8 *fsid)
				6967	{
				6968	struct btrfs_fs_devices *fs_devices;
				6969	int ret;
				6970
				6971	lockdep_assert_held(&uuid_mutex);
				6972	ASSERT(fsid);
				6973
				6974	fs_devices = fs_info->fs_devices->seed;
				6975	while (fs_devices) {
				6976	if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
				6977	return fs_devices;
				6978
				6979	fs_devices = fs_devices->seed;
				6980	}
				6981
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6982	fs_devices = find_fsid(fsid, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6983	if (!fs_devices) {
				6984	if (!btrfs_test_opt(fs_info, DEGRADED))
				6985	return ERR_PTR(-ENOENT);
				6986
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6987	fs_devices = alloc_fs_devices(fsid, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6988	if (IS_ERR(fs_devices))
				6989	return fs_devices;
				6990
				6991	fs_devices->seeding = 1;
				6992	fs_devices->opened = 1;
				6993	return fs_devices;
				6994	}
				6995
				6996	fs_devices = clone_fs_devices(fs_devices);
				6997	if (IS_ERR(fs_devices))
				6998	return fs_devices;
				6999
				7000	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
				7001	if (ret) {
				7002	free_fs_devices(fs_devices);
				7003	fs_devices = ERR_PTR(ret);
				7004	goto out;
				7005	}
				7006
				7007	if (!fs_devices->seeding) {
				7008	close_fs_devices(fs_devices);
				7009	free_fs_devices(fs_devices);
				7010	fs_devices = ERR_PTR(-EINVAL);
				7011	goto out;
				7012	}
				7013
				7014	fs_devices->seed = fs_info->fs_devices->seed;
				7015	fs_info->fs_devices->seed = fs_devices;
				7016	out:
				7017	return fs_devices;
				7018	}
				7019
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7020	static int read_one_dev(struct extent_buffer *leaf,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7021	struct btrfs_dev_item *dev_item)
				7022	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7023	struct btrfs_fs_info *fs_info = leaf->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7024	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7025	struct btrfs_device *device;
				7026	u64 devid;
				7027	int ret;
				7028	u8 fs_uuid[BTRFS_FSID_SIZE];
				7029	u8 dev_uuid[BTRFS_UUID_SIZE];
				7030
				7031	devid = btrfs_device_id(leaf, dev_item);
				7032	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				7033	BTRFS_UUID_SIZE);
				7034	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				7035	BTRFS_FSID_SIZE);
				7036
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7037	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7038	fs_devices = open_seed_devices(fs_info, fs_uuid);
				7039	if (IS_ERR(fs_devices))
				7040	return PTR_ERR(fs_devices);
				7041	}
				7042
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7043	device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
				7044	fs_uuid, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7045	if (!device) {
				7046	if (!btrfs_test_opt(fs_info, DEGRADED)) {
				7047	btrfs_report_missing_device(fs_info, devid,
				7048	dev_uuid, true);
				7049	return -ENOENT;
				7050	}
				7051
				7052	device = add_missing_dev(fs_devices, devid, dev_uuid);
				7053	if (IS_ERR(device)) {
				7054	btrfs_err(fs_info,
				7055	"failed to add missing dev %llu: %ld",
				7056	devid, PTR_ERR(device));
				7057	return PTR_ERR(device);
				7058	}
				7059	btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
				7060	} else {
				7061	if (!device->bdev) {
				7062	if (!btrfs_test_opt(fs_info, DEGRADED)) {
				7063	btrfs_report_missing_device(fs_info,
				7064	devid, dev_uuid, true);
				7065	return -ENOENT;
				7066	}
				7067	btrfs_report_missing_device(fs_info, devid,
				7068	dev_uuid, false);
				7069	}
				7070
				7071	if (!device->bdev &&
				7072	!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
				7073	/*
				7074	* this happens when a device that was properly setup
				7075	* in the device info lists suddenly goes bad.
				7076	* device->bdev is NULL, and so we have to set
				7077	* device->missing to one here
				7078	*/
				7079	device->fs_devices->missing_devices++;
				7080	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
				7081	}
				7082
				7083	/* Move the device to its own fs_devices */
				7084	if (device->fs_devices != fs_devices) {
				7085	ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
				7086	&device->dev_state));
				7087
				7088	list_move(&device->dev_list, &fs_devices->devices);
				7089	device->fs_devices->num_devices--;
				7090	fs_devices->num_devices++;
				7091
				7092	device->fs_devices->missing_devices--;
				7093	fs_devices->missing_devices++;
				7094
				7095	device->fs_devices = fs_devices;
				7096	}
				7097	}
				7098
				7099	if (device->fs_devices != fs_info->fs_devices) {
				7100	BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
				7101	if (device->generation !=
				7102	btrfs_device_generation(leaf, dev_item))
				7103	return -EINVAL;
				7104	}
				7105
				7106	fill_device_from_item(leaf, dev_item, device);
				7107	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
				7108	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
				7109	!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				7110	device->fs_devices->total_rw_bytes += device->total_bytes;
				7111	atomic64_add(device->total_bytes - device->bytes_used,
				7112	&fs_info->free_chunk_space);
				7113	}
				7114	ret = 0;
				7115	return ret;
				7116	}
				7117
				7118	int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
				7119	{
				7120	struct btrfs_root *root = fs_info->tree_root;
				7121	struct btrfs_super_block *super_copy = fs_info->super_copy;
				7122	struct extent_buffer *sb;
				7123	struct btrfs_disk_key *disk_key;
				7124	struct btrfs_chunk *chunk;
				7125	u8 *array_ptr;
				7126	unsigned long sb_array_offset;
				7127	int ret = 0;
				7128	u32 num_stripes;
				7129	u32 array_size;
				7130	u32 len = 0;
				7131	u32 cur_offset;
				7132	u64 type;
				7133	struct btrfs_key key;
				7134
				7135	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
				7136	/*
				7137	* This will create extent buffer of nodesize, superblock size is
				7138	* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
				7139	* overallocate but we can keep it as-is, only the first page is used.
				7140	*/
				7141	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
				7142	if (IS_ERR(sb))
				7143	return PTR_ERR(sb);
				7144	set_extent_buffer_uptodate(sb);
				7145	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
				7146	/*
				7147	* The sb extent buffer is artificial and just used to read the system array.
				7148	* set_extent_buffer_uptodate() call does not properly mark all it's
				7149	* pages up-to-date when the page is larger: extent does not cover the
				7150	* whole page and consequently check_page_uptodate does not find all
				7151	* the page's extents up-to-date (the hole beyond sb),
				7152	* write_extent_buffer then triggers a WARN_ON.
				7153	*
				7154	* Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
				7155	* but sb spans only this function. Add an explicit SetPageUptodate call
				7156	* to silence the warning eg. on PowerPC 64.
				7157	*/
				7158	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
				7159	SetPageUptodate(sb->pages[0]);
				7160
				7161	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
				7162	array_size = btrfs_super_sys_array_size(super_copy);
				7163
				7164	array_ptr = super_copy->sys_chunk_array;
				7165	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
				7166	cur_offset = 0;
				7167
				7168	while (cur_offset < array_size) {
				7169	disk_key = (struct btrfs_disk_key *)array_ptr;
				7170	len = sizeof(*disk_key);
				7171	if (cur_offset + len > array_size)
				7172	goto out_short_read;
				7173
				7174	btrfs_disk_key_to_cpu(&key, disk_key);
				7175
				7176	array_ptr += len;
				7177	sb_array_offset += len;
				7178	cur_offset += len;
				7179
				7180	if (key.type == BTRFS_CHUNK_ITEM_KEY) {
				7181	chunk = (struct btrfs_chunk *)sb_array_offset;
				7182	/*
				7183	* At least one btrfs_chunk with one stripe must be
				7184	* present, exact stripe count check comes afterwards
				7185	*/
				7186	len = btrfs_chunk_item_size(1);
				7187	if (cur_offset + len > array_size)
				7188	goto out_short_read;
				7189
				7190	num_stripes = btrfs_chunk_num_stripes(sb, chunk);
				7191	if (!num_stripes) {
				7192	btrfs_err(fs_info,
				7193	"invalid number of stripes %u in sys_array at offset %u",
				7194	num_stripes, cur_offset);
				7195	ret = -EIO;
				7196	break;
				7197	}
				7198
				7199	type = btrfs_chunk_type(sb, chunk);
				7200	if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
				7201	btrfs_err(fs_info,
				7202	"invalid chunk type %llu in sys_array at offset %u",
				7203	type, cur_offset);
				7204	ret = -EIO;
				7205	break;
				7206	}
				7207
				7208	len = btrfs_chunk_item_size(num_stripes);
				7209	if (cur_offset + len > array_size)
				7210	goto out_short_read;
				7211
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7212	ret = read_one_chunk(&key, sb, chunk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7213	if (ret)
				7214	break;
				7215	} else {
				7216	btrfs_err(fs_info,
				7217	"unexpected item type %u in sys_array at offset %u",
				7218	(u32)key.type, cur_offset);
				7219	ret = -EIO;
				7220	break;
				7221	}
				7222	array_ptr += len;
				7223	sb_array_offset += len;
				7224	cur_offset += len;
				7225	}
				7226	clear_extent_buffer_uptodate(sb);
				7227	free_extent_buffer_stale(sb);
				7228	return ret;
				7229
				7230	out_short_read:
				7231	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
				7232	len, cur_offset);
				7233	clear_extent_buffer_uptodate(sb);
				7234	free_extent_buffer_stale(sb);
				7235	return -EIO;
				7236	}
				7237
				7238	/*
				7239	* Check if all chunks in the fs are OK for read-write degraded mount
				7240	*
				7241	* If the @failing_dev is specified, it's accounted as missing.
				7242	*
				7243	* Return true if all chunks meet the minimal RW mount requirements.
				7244	* Return false if any chunk doesn't meet the minimal RW mount requirements.
				7245	*/
				7246	bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
				7247	struct btrfs_device *failing_dev)
				7248	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7249	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7250	struct extent_map *em;
				7251	u64 next_start = 0;
				7252	bool ret = true;
				7253
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7254	read_lock(&map_tree->lock);
				7255	em = lookup_extent_mapping(map_tree, 0, (u64)-1);
				7256	read_unlock(&map_tree->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7257	/* No chunk at all? Return false anyway */
				7258	if (!em) {
				7259	ret = false;
				7260	goto out;
				7261	}
				7262	while (em) {
				7263	struct map_lookup *map;
				7264	int missing = 0;
				7265	int max_tolerated;
				7266	int i;
				7267
				7268	map = em->map_lookup;
				7269	max_tolerated =
				7270	btrfs_get_num_tolerated_disk_barrier_failures(
				7271	map->type);
				7272	for (i = 0; i < map->num_stripes; i++) {
				7273	struct btrfs_device *dev = map->stripes[i].dev;
				7274
				7275	if (!dev \|\| !dev->bdev \|\|
				7276	test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) \|\|
				7277	dev->last_flush_error)
				7278	missing++;
				7279	else if (failing_dev && failing_dev == dev)
				7280	missing++;
				7281	}
				7282	if (missing > max_tolerated) {
				7283	if (!failing_dev)
				7284	btrfs_warn(fs_info,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7285	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7286	em->start, missing, max_tolerated);
				7287	free_extent_map(em);
				7288	ret = false;
				7289	goto out;
				7290	}
				7291	next_start = extent_map_end(em);
				7292	free_extent_map(em);
				7293
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7294	read_lock(&map_tree->lock);
				7295	em = lookup_extent_mapping(map_tree, next_start,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7296	(u64)(-1) - next_start);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7297	read_unlock(&map_tree->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7298	}
				7299	out:
				7300	return ret;
				7301	}
				7302
				7303	int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
				7304	{
				7305	struct btrfs_root *root = fs_info->chunk_root;
				7306	struct btrfs_path *path;
				7307	struct extent_buffer *leaf;
				7308	struct btrfs_key key;
				7309	struct btrfs_key found_key;
				7310	int ret;
				7311	int slot;
				7312	u64 total_dev = 0;
				7313
				7314	path = btrfs_alloc_path();
				7315	if (!path)
				7316	return -ENOMEM;
				7317
				7318	/*
				7319	* uuid_mutex is needed only if we are mounting a sprout FS
				7320	* otherwise we don't need it.
				7321	*/
				7322	mutex_lock(&uuid_mutex);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7323
				7324	/*
				7325	* It is possible for mount and umount to race in such a way that
				7326	* we execute this code path, but open_fs_devices failed to clear
				7327	* total_rw_bytes. We certainly want it cleared before reading the
				7328	* device items, so clear it here.
				7329	*/
				7330	fs_info->fs_devices->total_rw_bytes = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7331
				7332	/*
				7333	* Read all device items, and then all the chunk items. All
				7334	* device items are found before any chunk item (their object id
				7335	* is smaller than the lowest possible object id for a chunk
				7336	* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
				7337	*/
				7338	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				7339	key.offset = 0;
				7340	key.type = 0;
				7341	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				7342	if (ret < 0)
				7343	goto error;
				7344	while (1) {
				7345	leaf = path->nodes[0];
				7346	slot = path->slots[0];
				7347	if (slot >= btrfs_header_nritems(leaf)) {
				7348	ret = btrfs_next_leaf(root, path);
				7349	if (ret == 0)
				7350	continue;
				7351	if (ret < 0)
				7352	goto error;
				7353	break;
				7354	}
				7355	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				7356	if (found_key.type == BTRFS_DEV_ITEM_KEY) {
				7357	struct btrfs_dev_item *dev_item;
				7358	dev_item = btrfs_item_ptr(leaf, slot,
				7359	struct btrfs_dev_item);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7360	ret = read_one_dev(leaf, dev_item);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7361	if (ret)
				7362	goto error;
				7363	total_dev++;
				7364	} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
				7365	struct btrfs_chunk *chunk;
				7366	chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7367	mutex_lock(&fs_info->chunk_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7368	ret = read_one_chunk(&found_key, leaf, chunk);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7369	mutex_unlock(&fs_info->chunk_mutex);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7370	if (ret)
				7371	goto error;
				7372	}
				7373	path->slots[0]++;
				7374	}
				7375
				7376	/*
				7377	* After loading chunk tree, we've got all device information,
				7378	* do another round of validation checks.
				7379	*/
				7380	if (total_dev != fs_info->fs_devices->total_devices) {
				7381	btrfs_err(fs_info,
				7382	"super_num_devices %llu mismatch with num_devices %llu found here",
				7383	btrfs_super_num_devices(fs_info->super_copy),
				7384	total_dev);
				7385	ret = -EINVAL;
				7386	goto error;
				7387	}
				7388	if (btrfs_super_total_bytes(fs_info->super_copy) <
				7389	fs_info->fs_devices->total_rw_bytes) {
				7390	btrfs_err(fs_info,
				7391	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
				7392	btrfs_super_total_bytes(fs_info->super_copy),
				7393	fs_info->fs_devices->total_rw_bytes);
				7394	ret = -EINVAL;
				7395	goto error;
				7396	}
				7397	ret = 0;
				7398	error:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7399	mutex_unlock(&uuid_mutex);
				7400
				7401	btrfs_free_path(path);
				7402	return ret;
				7403	}
				7404
				7405	void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
				7406	{
				7407	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7408	struct btrfs_device *device;
				7409
				7410	while (fs_devices) {
				7411	mutex_lock(&fs_devices->device_list_mutex);
				7412	list_for_each_entry(device, &fs_devices->devices, dev_list)
				7413	device->fs_info = fs_info;
				7414	mutex_unlock(&fs_devices->device_list_mutex);
				7415
				7416	fs_devices = fs_devices->seed;
				7417	}
				7418	}
				7419
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7420	static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
				7421	const struct btrfs_dev_stats_item *ptr,
				7422	int index)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7423	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7424	u64 val;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7425
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7426	read_extent_buffer(eb, &val,
				7427	offsetof(struct btrfs_dev_stats_item, values) +
				7428	((unsigned long)ptr) + (index * sizeof(u64)),
				7429	sizeof(val));
				7430	return val;
				7431	}
				7432
				7433	static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				7434	struct btrfs_dev_stats_item *ptr,
				7435	int index, u64 val)
				7436	{
				7437	write_extent_buffer(eb, &val,
				7438	offsetof(struct btrfs_dev_stats_item, values) +
				7439	((unsigned long)ptr) + (index * sizeof(u64)),
				7440	sizeof(val));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7441	}
				7442
				7443	int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
				7444	{
				7445	struct btrfs_key key;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7446	struct btrfs_root *dev_root = fs_info->dev_root;
				7447	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7448	struct extent_buffer *eb;
				7449	int slot;
				7450	int ret = 0;
				7451	struct btrfs_device *device;
				7452	struct btrfs_path *path = NULL;
				7453	int i;
				7454
				7455	path = btrfs_alloc_path();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7456	if (!path)
				7457	return -ENOMEM;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7458
				7459	mutex_lock(&fs_devices->device_list_mutex);
				7460	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				7461	int item_size;
				7462	struct btrfs_dev_stats_item *ptr;
				7463
				7464	key.objectid = BTRFS_DEV_STATS_OBJECTID;
				7465	key.type = BTRFS_PERSISTENT_ITEM_KEY;
				7466	key.offset = device->devid;
				7467	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
				7468	if (ret) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7469	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7470	btrfs_dev_stat_set(device, i, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7471	device->dev_stats_valid = 1;
				7472	btrfs_release_path(path);
				7473	continue;
				7474	}
				7475	slot = path->slots[0];
				7476	eb = path->nodes[0];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7477	item_size = btrfs_item_size_nr(eb, slot);
				7478
				7479	ptr = btrfs_item_ptr(eb, slot,
				7480	struct btrfs_dev_stats_item);
				7481
				7482	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
				7483	if (item_size >= (1 + i) * sizeof(__le64))
				7484	btrfs_dev_stat_set(device, i,
				7485	btrfs_dev_stats_value(eb, ptr, i));
				7486	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7487	btrfs_dev_stat_set(device, i, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7488	}
				7489
				7490	device->dev_stats_valid = 1;
				7491	btrfs_dev_stat_print_on_load(device);
				7492	btrfs_release_path(path);
				7493	}
				7494	mutex_unlock(&fs_devices->device_list_mutex);
				7495
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7496	btrfs_free_path(path);
				7497	return ret < 0 ? ret : 0;
				7498	}
				7499
				7500	static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				7501	struct btrfs_device *device)
				7502	{
				7503	struct btrfs_fs_info *fs_info = trans->fs_info;
				7504	struct btrfs_root *dev_root = fs_info->dev_root;
				7505	struct btrfs_path *path;
				7506	struct btrfs_key key;
				7507	struct extent_buffer *eb;
				7508	struct btrfs_dev_stats_item *ptr;
				7509	int ret;
				7510	int i;
				7511
				7512	key.objectid = BTRFS_DEV_STATS_OBJECTID;
				7513	key.type = BTRFS_PERSISTENT_ITEM_KEY;
				7514	key.offset = device->devid;
				7515
				7516	path = btrfs_alloc_path();
				7517	if (!path)
				7518	return -ENOMEM;
				7519	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
				7520	if (ret < 0) {
				7521	btrfs_warn_in_rcu(fs_info,
				7522	"error %d while searching for dev_stats item for device %s",
				7523	ret, rcu_str_deref(device->name));
				7524	goto out;
				7525	}
				7526
				7527	if (ret == 0 &&
				7528	btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
				7529	/* need to delete old one and insert a new one */
				7530	ret = btrfs_del_item(trans, dev_root, path);
				7531	if (ret != 0) {
				7532	btrfs_warn_in_rcu(fs_info,
				7533	"delete too small dev_stats item for device %s failed %d",
				7534	rcu_str_deref(device->name), ret);
				7535	goto out;
				7536	}
				7537	ret = 1;
				7538	}
				7539
				7540	if (ret == 1) {
				7541	/* need to insert a new item */
				7542	btrfs_release_path(path);
				7543	ret = btrfs_insert_empty_item(trans, dev_root, path,
				7544	&key, sizeof(*ptr));
				7545	if (ret < 0) {
				7546	btrfs_warn_in_rcu(fs_info,
				7547	"insert dev_stats item for device %s failed %d",
				7548	rcu_str_deref(device->name), ret);
				7549	goto out;
				7550	}
				7551	}
				7552
				7553	eb = path->nodes[0];
				7554	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
				7555	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7556	btrfs_set_dev_stats_value(eb, ptr, i,
				7557	btrfs_dev_stat_read(device, i));
				7558	btrfs_mark_buffer_dirty(eb);
				7559
				7560	out:
				7561	btrfs_free_path(path);
				7562	return ret;
				7563	}
				7564
				7565	/*
				7566	* called from commit_transaction. Writes all changed device stats to disk.
				7567	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7568	int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7569	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7570	struct btrfs_fs_info *fs_info = trans->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7571	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7572	struct btrfs_device *device;
				7573	int stats_cnt;
				7574	int ret = 0;
				7575
				7576	mutex_lock(&fs_devices->device_list_mutex);
				7577	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				7578	stats_cnt = atomic_read(&device->dev_stats_ccnt);
				7579	if (!device->dev_stats_valid \|\| stats_cnt == 0)
				7580	continue;
				7581
				7582
				7583	/*
				7584	* There is a LOAD-LOAD control dependency between the value of
				7585	* dev_stats_ccnt and updating the on-disk values which requires
				7586	* reading the in-memory counters. Such control dependencies
				7587	* require explicit read memory barriers.
				7588	*
				7589	* This memory barriers pairs with smp_mb__before_atomic in
				7590	* btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
				7591	* barrier implied by atomic_xchg in
				7592	* btrfs_dev_stats_read_and_reset
				7593	*/
				7594	smp_rmb();
				7595
				7596	ret = update_dev_stat_item(trans, device);
				7597	if (!ret)
				7598	atomic_sub(stats_cnt, &device->dev_stats_ccnt);
				7599	}
				7600	mutex_unlock(&fs_devices->device_list_mutex);
				7601
				7602	return ret;
				7603	}
				7604
				7605	void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
				7606	{
				7607	btrfs_dev_stat_inc(dev, index);
				7608	btrfs_dev_stat_print_on_error(dev);
				7609	}
				7610
				7611	static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
				7612	{
				7613	if (!dev->dev_stats_valid)
				7614	return;
				7615	btrfs_err_rl_in_rcu(dev->fs_info,
				7616	"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
				7617	rcu_str_deref(dev->name),
				7618	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
				7619	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
				7620	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
				7621	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
				7622	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
				7623	}
				7624
				7625	static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
				7626	{
				7627	int i;
				7628
				7629	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7630	if (btrfs_dev_stat_read(dev, i) != 0)
				7631	break;
				7632	if (i == BTRFS_DEV_STAT_VALUES_MAX)
				7633	return; /* all values == 0, suppress message */
				7634
				7635	btrfs_info_in_rcu(dev->fs_info,
				7636	"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
				7637	rcu_str_deref(dev->name),
				7638	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
				7639	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
				7640	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
				7641	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
				7642	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
				7643	}
				7644
				7645	int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
				7646	struct btrfs_ioctl_get_dev_stats *stats)
				7647	{
				7648	struct btrfs_device *dev;
				7649	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7650	int i;
				7651
				7652	mutex_lock(&fs_devices->device_list_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7653	dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
				7654	true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7655	mutex_unlock(&fs_devices->device_list_mutex);
				7656
				7657	if (!dev) {
				7658	btrfs_warn(fs_info, "get dev_stats failed, device not found");
				7659	return -ENODEV;
				7660	} else if (!dev->dev_stats_valid) {
				7661	btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
				7662	return -ENODEV;
				7663	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
				7664	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
				7665	if (stats->nr_items > i)
				7666	stats->values[i] =
				7667	btrfs_dev_stat_read_and_reset(dev, i);
				7668	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7669	btrfs_dev_stat_set(dev, i, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7670	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7671	btrfs_info(fs_info, "device stats zeroed by %s (%d)",
				7672	current->comm, task_pid_nr(current));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7673	} else {
				7674	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7675	if (stats->nr_items > i)
				7676	stats->values[i] = btrfs_dev_stat_read(dev, i);
				7677	}
				7678	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
				7679	stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
				7680	return 0;
				7681	}
				7682
				7683	void btrfs_scratch_superblocks(struct block_device bdev, const char device_path)
				7684	{
				7685	struct buffer_head *bh;
				7686	struct btrfs_super_block *disk_super;
				7687	int copy_num;
				7688
				7689	if (!bdev)
				7690	return;
				7691
				7692	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
				7693	copy_num++) {
				7694
				7695	if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
				7696	continue;
				7697
				7698	disk_super = (struct btrfs_super_block *)bh->b_data;
				7699
				7700	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
				7701	set_buffer_dirty(bh);
				7702	sync_dirty_buffer(bh);
				7703	brelse(bh);
				7704	}
				7705
				7706	/* Notify udev that device has changed */
				7707	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
				7708
				7709	/* Update ctime/mtime for device path for libblkid */
				7710	update_dev_time(device_path);
				7711	}
				7712
				7713	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7714	* Update the size and bytes used for each device where it changed. This is
				7715	* delayed since we would otherwise get errors while writing out the
				7716	* superblocks.
				7717	*
				7718	* Must be invoked during transaction commit.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7719	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7720	void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7721	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7722	struct btrfs_device curr, next;
				7723
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7724	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
				7725
				7726	if (list_empty(&trans->dev_update_list))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7727	return;
				7728
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7729	/*
				7730	* We don't need the device_list_mutex here. This list is owned by the
				7731	* transaction and the transaction must complete before the device is
				7732	* released.
				7733	*/
				7734	mutex_lock(&trans->fs_info->chunk_mutex);
				7735	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				7736	post_commit_list) {
				7737	list_del_init(&curr->post_commit_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7738	curr->commit_total_bytes = curr->disk_total_bytes;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7739	curr->commit_bytes_used = curr->bytes_used;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7740	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7741	mutex_unlock(&trans->fs_info->chunk_mutex);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7742	}
				7743
				7744	void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
				7745	{
				7746	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7747	while (fs_devices) {
				7748	fs_devices->fs_info = fs_info;
				7749	fs_devices = fs_devices->seed;
				7750	}
				7751	}
				7752
				7753	void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
				7754	{
				7755	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7756	while (fs_devices) {
				7757	fs_devices->fs_info = NULL;
				7758	fs_devices = fs_devices->seed;
				7759	}
				7760	}
				7761
				7762	/*
				7763	* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
				7764	*/
				7765	int btrfs_bg_type_to_factor(u64 flags)
				7766	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7767	const int index = btrfs_bg_flags_to_raid_index(flags);
				7768
				7769	return btrfs_raid_array[index].ncopies;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7770	}
				7771
				7772
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7773
				7774	static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				7775	u64 chunk_offset, u64 devid,
				7776	u64 physical_offset, u64 physical_len)
				7777	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7778	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7779	struct extent_map *em;
				7780	struct map_lookup *map;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7781	struct btrfs_device *dev;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7782	u64 stripe_len;
				7783	bool found = false;
				7784	int ret = 0;
				7785	int i;
				7786
				7787	read_lock(&em_tree->lock);
				7788	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
				7789	read_unlock(&em_tree->lock);
				7790
				7791	if (!em) {
				7792	btrfs_err(fs_info,
				7793	"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
				7794	physical_offset, devid);
				7795	ret = -EUCLEAN;
				7796	goto out;
				7797	}
				7798
				7799	map = em->map_lookup;
				7800	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
				7801	if (physical_len != stripe_len) {
				7802	btrfs_err(fs_info,
				7803	"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
				7804	physical_offset, devid, em->start, physical_len,
				7805	stripe_len);
				7806	ret = -EUCLEAN;
				7807	goto out;
				7808	}
				7809
				7810	for (i = 0; i < map->num_stripes; i++) {
				7811	if (map->stripes[i].dev->devid == devid &&
				7812	map->stripes[i].physical == physical_offset) {
				7813	found = true;
				7814	if (map->verified_stripes >= map->num_stripes) {
				7815	btrfs_err(fs_info,
				7816	"too many dev extents for chunk %llu found",
				7817	em->start);
				7818	ret = -EUCLEAN;
				7819	goto out;
				7820	}
				7821	map->verified_stripes++;
				7822	break;
				7823	}
				7824	}
				7825	if (!found) {
				7826	btrfs_err(fs_info,
				7827	"dev extent physical offset %llu devid %llu has no corresponding chunk",
				7828	physical_offset, devid);
				7829	ret = -EUCLEAN;
				7830	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7831
				7832	/* Make sure no dev extent is beyond device bondary */
				7833	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
				7834	if (!dev) {
				7835	btrfs_err(fs_info, "failed to find devid %llu", devid);
				7836	ret = -EUCLEAN;
				7837	goto out;
				7838	}
				7839
				7840	/* It's possible this device is a dummy for seed device */
				7841	if (dev->disk_total_bytes == 0) {
				7842	dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
				7843	NULL, false);
				7844	if (!dev) {
				7845	btrfs_err(fs_info, "failed to find seed devid %llu",
				7846	devid);
				7847	ret = -EUCLEAN;
				7848	goto out;
				7849	}
				7850	}
				7851
				7852	if (physical_offset + physical_len > dev->disk_total_bytes) {
				7853	btrfs_err(fs_info,
				7854	"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
				7855	devid, physical_offset, physical_len,
				7856	dev->disk_total_bytes);
				7857	ret = -EUCLEAN;
				7858	goto out;
				7859	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7860	out:
				7861	free_extent_map(em);
				7862	return ret;
				7863	}
				7864
				7865	static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
				7866	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7867	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7868	struct extent_map *em;
				7869	struct rb_node *node;
				7870	int ret = 0;
				7871
				7872	read_lock(&em_tree->lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7873	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7874	em = rb_entry(node, struct extent_map, rb_node);
				7875	if (em->map_lookup->num_stripes !=
				7876	em->map_lookup->verified_stripes) {
				7877	btrfs_err(fs_info,
				7878	"chunk %llu has missing dev extent, have %d expect %d",
				7879	em->start, em->map_lookup->verified_stripes,
				7880	em->map_lookup->num_stripes);
				7881	ret = -EUCLEAN;
				7882	goto out;
				7883	}
				7884	}
				7885	out:
				7886	read_unlock(&em_tree->lock);
				7887	return ret;
				7888	}
				7889
				7890	/*
				7891	* Ensure that all dev extents are mapped to correct chunk, otherwise
				7892	* later chunk allocation/free would cause unexpected behavior.
				7893	*
				7894	* NOTE: This will iterate through the whole device tree, which should be of
				7895	* the same size level as the chunk tree. This slightly increases mount time.
				7896	*/
				7897	int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
				7898	{
				7899	struct btrfs_path *path;
				7900	struct btrfs_root *root = fs_info->dev_root;
				7901	struct btrfs_key key;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7902	u64 prev_devid = 0;
				7903	u64 prev_dev_ext_end = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7904	int ret = 0;
				7905
				7906	key.objectid = 1;
				7907	key.type = BTRFS_DEV_EXTENT_KEY;
				7908	key.offset = 0;
				7909
				7910	path = btrfs_alloc_path();
				7911	if (!path)
				7912	return -ENOMEM;
				7913
				7914	path->reada = READA_FORWARD;
				7915	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				7916	if (ret < 0)
				7917	goto out;
				7918
				7919	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
				7920	ret = btrfs_next_item(root, path);
				7921	if (ret < 0)
				7922	goto out;
				7923	/* No dev extents at all? Not good */
				7924	if (ret > 0) {
				7925	ret = -EUCLEAN;
				7926	goto out;
				7927	}
				7928	}
				7929	while (1) {
				7930	struct extent_buffer *leaf = path->nodes[0];
				7931	struct btrfs_dev_extent *dext;
				7932	int slot = path->slots[0];
				7933	u64 chunk_offset;
				7934	u64 physical_offset;
				7935	u64 physical_len;
				7936	u64 devid;
				7937
				7938	btrfs_item_key_to_cpu(leaf, &key, slot);
				7939	if (key.type != BTRFS_DEV_EXTENT_KEY)
				7940	break;
				7941	devid = key.objectid;
				7942	physical_offset = key.offset;
				7943
				7944	dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
				7945	chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
				7946	physical_len = btrfs_dev_extent_length(leaf, dext);
				7947
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7948	/* Check if this dev extent overlaps with the previous one */
				7949	if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
				7950	btrfs_err(fs_info,
				7951	"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				7952	devid, physical_offset, prev_dev_ext_end);
				7953	ret = -EUCLEAN;
				7954	goto out;
				7955	}
				7956
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7957	ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
				7958	physical_offset, physical_len);
				7959	if (ret < 0)
				7960	goto out;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7961	prev_devid = devid;
				7962	prev_dev_ext_end = physical_offset + physical_len;
				7963
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7964	ret = btrfs_next_item(root, path);
				7965	if (ret < 0)
				7966	goto out;
				7967	if (ret > 0) {
				7968	ret = 0;
				7969	break;
				7970	}
				7971	}
				7972
				7973	/* Ensure all chunks have corresponding dev extents */
				7974	ret = verify_chunk_dev_extent_mapping(fs_info);
				7975	out:
				7976	btrfs_free_path(path);
				7977	return ret;
				7978	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7979
				7980	/*
				7981	* Check whether the given block group or device is pinned by any inode being
				7982	* used as a swapfile.
				7983	*/
				7984	bool btrfs_pinned_by_swapfile(struct btrfs_fs_info fs_info, void ptr)
				7985	{
				7986	struct btrfs_swapfile_pin *sp;
				7987	struct rb_node *node;
				7988
				7989	spin_lock(&fs_info->swapfile_pins_lock);
				7990	node = fs_info->swapfile_pins.rb_node;
				7991	while (node) {
				7992	sp = rb_entry(node, struct btrfs_swapfile_pin, node);
				7993	if (ptr < sp->ptr)
				7994	node = node->rb_left;
				7995	else if (ptr > sp->ptr)
				7996	node = node->rb_right;
				7997	else
				7998	break;
				7999	}
				8000	spin_unlock(&fs_info->swapfile_pins_lock);
				8001	return node != NULL;
				8002	}