Blame - fs/btrfs/scrub.c - hafnium/third_party/linux.git

blob: f7d4e03f4c5d5ba78b5fd8e01b8ed3be3ea274a1 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2011, 2012 STRATO. All rights reserved.
				4	*/
				5
				6	#include <linux/blkdev.h>
				7	#include <linux/ratelimit.h>
				8	#include <linux/sched/mm.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	9	#include <crypto/hash.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10	#include "ctree.h"
				11	#include "volumes.h"
				12	#include "disk-io.h"
				13	#include "ordered-data.h"
				14	#include "transaction.h"
				15	#include "backref.h"
				16	#include "extent_io.h"
				17	#include "dev-replace.h"
				18	#include "check-integrity.h"
				19	#include "rcu-string.h"
				20	#include "raid56.h"
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	21	#include "block-group.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	22
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
				35
				36	struct scrub_block;
				37	struct scrub_ctx;
				38
				39	/*
				40	* the following three values only influence the performance.
				41	* The last one configures the number of parallel and outstanding I/O
				42	* operations. The first two values configure an upper limit for the number
				43	* of (dynamically allocated) pages that are added to a bio.
				44	*/
				45	#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
				46	#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
				47	#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
				48
				49	/*
				50	* the following value times PAGE_SIZE needs to be large enough to match the
				51	* largest node/leaf/sector size that shall be supported.
				52	* Values larger than BTRFS_STRIPE_LEN are not supported.
				53	*/
				54	#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
				55
				56	struct scrub_recover {
				57	refcount_t refs;
				58	struct btrfs_bio *bbio;
				59	u64 map_length;
				60	};
				61
				62	struct scrub_page {
				63	struct scrub_block *sblock;
				64	struct page *page;
				65	struct btrfs_device *dev;
				66	struct list_head list;
				67	u64 flags; /* extent flags */
				68	u64 generation;
				69	u64 logical;
				70	u64 physical;
				71	u64 physical_for_dev_replace;
				72	atomic_t refs;
				73	struct {
				74	unsigned int mirror_num:8;
				75	unsigned int have_csum:1;
				76	unsigned int io_error:1;
				77	};
				78	u8 csum[BTRFS_CSUM_SIZE];
				79
				80	struct scrub_recover *recover;
				81	};
				82
				83	struct scrub_bio {
				84	int index;
				85	struct scrub_ctx *sctx;
				86	struct btrfs_device *dev;
				87	struct bio *bio;
				88	blk_status_t status;
				89	u64 logical;
				90	u64 physical;
				91	#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
				92	struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
				93	#else
				94	struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
				95	#endif
				96	int page_count;
				97	int next_free;
				98	struct btrfs_work work;
				99	};
				100
				101	struct scrub_block {
				102	struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
				103	int page_count;
				104	atomic_t outstanding_pages;
				105	refcount_t refs; /* free mem on transition to zero */
				106	struct scrub_ctx *sctx;
				107	struct scrub_parity *sparity;
				108	struct {
				109	unsigned int header_error:1;
				110	unsigned int checksum_error:1;
				111	unsigned int no_io_error_seen:1;
				112	unsigned int generation_error:1; /* also sets header_error */
				113
				114	/* The following is for the data used to check parity */
				115	/* It is for the data with checksum */
				116	unsigned int data_corrected:1;
				117	};
				118	struct btrfs_work work;
				119	};
				120
				121	/* Used for the chunks with parity stripe such RAID5/6 */
				122	struct scrub_parity {
				123	struct scrub_ctx *sctx;
				124
				125	struct btrfs_device *scrub_dev;
				126
				127	u64 logic_start;
				128
				129	u64 logic_end;
				130
				131	int nsectors;
				132
				133	u64 stripe_len;
				134
				135	refcount_t refs;
				136
				137	struct list_head spages;
				138
				139	/* Work of parity check and repair */
				140	struct btrfs_work work;
				141
				142	/* Mark the parity blocks which have data */
				143	unsigned long *dbitmap;
				144
				145	/*
				146	* Mark the parity blocks which have data, but errors happen when
				147	* read data or check data
				148	*/
				149	unsigned long *ebitmap;
				150
				151	unsigned long bitmap[0];
				152	};
				153
				154	struct scrub_ctx {
				155	struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
				156	struct btrfs_fs_info *fs_info;
				157	int first_free;
				158	int curr;
				159	atomic_t bios_in_flight;
				160	atomic_t workers_pending;
				161	spinlock_t list_lock;
				162	wait_queue_head_t list_wait;
				163	u16 csum_size;
				164	struct list_head csum_list;
				165	atomic_t cancel_req;
				166	int readonly;
				167	int pages_per_rd_bio;
				168
				169	int is_dev_replace;
				170
				171	struct scrub_bio *wr_curr_bio;
				172	struct mutex wr_lock;
				173	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
				174	struct btrfs_device *wr_tgtdev;
				175	bool flush_all_writes;
				176
				177	/*
				178	* statistics
				179	*/
				180	struct btrfs_scrub_progress stat;
				181	spinlock_t stat_lock;
				182
				183	/*
				184	* Use a ref counter to avoid use-after-free issues. Scrub workers
				185	* decrement bios_in_flight and workers_pending and then do a wakeup
				186	* on the list_wait wait queue. We must ensure the main scrub task
				187	* doesn't free the scrub context before or while the workers are
				188	* doing the wakeup() call.
				189	*/
				190	refcount_t refs;
				191	};
				192
				193	struct scrub_warning {
				194	struct btrfs_path *path;
				195	u64 extent_item_size;
				196	const char *errstr;
				197	u64 physical;
				198	u64 logical;
				199	struct btrfs_device *dev;
				200	};
				201
				202	struct full_stripe_lock {
				203	struct rb_node node;
				204	u64 logical;
				205	u64 refs;
				206	struct mutex mutex;
				207	};
				208
				209	static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
				210	static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
				211	static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
				212	static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				213	struct scrub_block *sblocks_for_recheck);
				214	static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				215	struct scrub_block *sblock,
				216	int retry_failed_mirror);
				217	static void scrub_recheck_block_checksum(struct scrub_block *sblock);
				218	static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
				219	struct scrub_block *sblock_good);
				220	static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
				221	struct scrub_block *sblock_good,
				222	int page_num, int force_write);
				223	static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
				224	static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
				225	int page_num);
				226	static int scrub_checksum_data(struct scrub_block *sblock);
				227	static int scrub_checksum_tree_block(struct scrub_block *sblock);
				228	static int scrub_checksum_super(struct scrub_block *sblock);
				229	static void scrub_block_get(struct scrub_block *sblock);
				230	static void scrub_block_put(struct scrub_block *sblock);
				231	static void scrub_page_get(struct scrub_page *spage);
				232	static void scrub_page_put(struct scrub_page *spage);
				233	static void scrub_parity_get(struct scrub_parity *sparity);
				234	static void scrub_parity_put(struct scrub_parity *sparity);
				235	static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				236	struct scrub_page *spage);
				237	static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
				238	u64 physical, struct btrfs_device *dev, u64 flags,
				239	u64 gen, int mirror_num, u8 *csum, int force,
				240	u64 physical_for_dev_replace);
				241	static void scrub_bio_end_io(struct bio *bio);
				242	static void scrub_bio_end_io_worker(struct btrfs_work *work);
				243	static void scrub_block_complete(struct scrub_block *sblock);
				244	static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
				245	u64 extent_logical, u64 extent_len,
				246	u64 *extent_physical,
				247	struct btrfs_device **extent_dev,
				248	int *extent_mirror_num);
				249	static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				250	struct scrub_page *spage);
				251	static void scrub_wr_submit(struct scrub_ctx *sctx);
				252	static void scrub_wr_bio_end_io(struct bio *bio);
				253	static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
				254	static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
				255	static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
				256	static void scrub_put_ctx(struct scrub_ctx *sctx);
				257
				258	static inline int scrub_is_page_on_raid56(struct scrub_page *page)
				259	{
				260	return page->recover &&
				261	(page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
				262	}
				263
				264	static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
				265	{
				266	refcount_inc(&sctx->refs);
				267	atomic_inc(&sctx->bios_in_flight);
				268	}
				269
				270	static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
				271	{
				272	atomic_dec(&sctx->bios_in_flight);
				273	wake_up(&sctx->list_wait);
				274	scrub_put_ctx(sctx);
				275	}
				276
				277	static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
				278	{
				279	while (atomic_read(&fs_info->scrub_pause_req)) {
				280	mutex_unlock(&fs_info->scrub_lock);
				281	wait_event(fs_info->scrub_pause_wait,
				282	atomic_read(&fs_info->scrub_pause_req) == 0);
				283	mutex_lock(&fs_info->scrub_lock);
				284	}
				285	}
				286
				287	static void scrub_pause_on(struct btrfs_fs_info *fs_info)
				288	{
				289	atomic_inc(&fs_info->scrubs_paused);
				290	wake_up(&fs_info->scrub_pause_wait);
				291	}
				292
				293	static void scrub_pause_off(struct btrfs_fs_info *fs_info)
				294	{
				295	mutex_lock(&fs_info->scrub_lock);
				296	__scrub_blocked_if_needed(fs_info);
				297	atomic_dec(&fs_info->scrubs_paused);
				298	mutex_unlock(&fs_info->scrub_lock);
				299
				300	wake_up(&fs_info->scrub_pause_wait);
				301	}
				302
				303	static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
				304	{
				305	scrub_pause_on(fs_info);
				306	scrub_pause_off(fs_info);
				307	}
				308
				309	/*
				310	* Insert new full stripe lock into full stripe locks tree
				311	*
				312	* Return pointer to existing or newly inserted full_stripe_lock structure if
				313	* everything works well.
				314	* Return ERR_PTR(-ENOMEM) if we failed to allocate memory
				315	*
				316	* NOTE: caller must hold full_stripe_locks_root->lock before calling this
				317	* function
				318	*/
				319	static struct full_stripe_lock *insert_full_stripe_lock(
				320	struct btrfs_full_stripe_locks_tree *locks_root,
				321	u64 fstripe_logical)
				322	{
				323	struct rb_node **p;
				324	struct rb_node *parent = NULL;
				325	struct full_stripe_lock *entry;
				326	struct full_stripe_lock *ret;
				327
				328	lockdep_assert_held(&locks_root->lock);
				329
				330	p = &locks_root->root.rb_node;
				331	while (*p) {
				332	parent = *p;
				333	entry = rb_entry(parent, struct full_stripe_lock, node);
				334	if (fstripe_logical < entry->logical) {
				335	p = &(*p)->rb_left;
				336	} else if (fstripe_logical > entry->logical) {
				337	p = &(*p)->rb_right;
				338	} else {
				339	entry->refs++;
				340	return entry;
				341	}
				342	}
				343
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	344	/*
				345	* Insert new lock.
				346	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	347	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
				348	if (!ret)
				349	return ERR_PTR(-ENOMEM);
				350	ret->logical = fstripe_logical;
				351	ret->refs = 1;
				352	mutex_init(&ret->mutex);
				353
				354	rb_link_node(&ret->node, parent, p);
				355	rb_insert_color(&ret->node, &locks_root->root);
				356	return ret;
				357	}
				358
				359	/*
				360	* Search for a full stripe lock of a block group
				361	*
				362	* Return pointer to existing full stripe lock if found
				363	* Return NULL if not found
				364	*/
				365	static struct full_stripe_lock *search_full_stripe_lock(
				366	struct btrfs_full_stripe_locks_tree *locks_root,
				367	u64 fstripe_logical)
				368	{
				369	struct rb_node *node;
				370	struct full_stripe_lock *entry;
				371
				372	lockdep_assert_held(&locks_root->lock);
				373
				374	node = locks_root->root.rb_node;
				375	while (node) {
				376	entry = rb_entry(node, struct full_stripe_lock, node);
				377	if (fstripe_logical < entry->logical)
				378	node = node->rb_left;
				379	else if (fstripe_logical > entry->logical)
				380	node = node->rb_right;
				381	else
				382	return entry;
				383	}
				384	return NULL;
				385	}
				386
				387	/*
				388	* Helper to get full stripe logical from a normal bytenr.
				389	*
				390	* Caller must ensure @cache is a RAID56 block group.
				391	*/
				392	static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
				393	u64 bytenr)
				394	{
				395	u64 ret;
				396
				397	/*
				398	* Due to chunk item size limit, full stripe length should not be
				399	* larger than U32_MAX. Just a sanity check here.
				400	*/
				401	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
				402
				403	/*
				404	* round_down() can only handle power of 2, while RAID56 full
				405	* stripe length can be 64KiB * n, so we need to manually round down.
				406	*/
				407	ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
				408	cache->full_stripe_len + cache->key.objectid;
				409	return ret;
				410	}
				411
				412	/*
				413	* Lock a full stripe to avoid concurrency of recovery and read
				414	*
				415	* It's only used for profiles with parities (RAID5/6), for other profiles it
				416	* does nothing.
				417	*
				418	* Return 0 if we locked full stripe covering @bytenr, with a mutex held.
				419	* So caller must call unlock_full_stripe() at the same context.
				420	*
				421	* Return <0 if encounters error.
				422	*/
				423	static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
				424	bool *locked_ret)
				425	{
				426	struct btrfs_block_group_cache *bg_cache;
				427	struct btrfs_full_stripe_locks_tree *locks_root;
				428	struct full_stripe_lock *existing;
				429	u64 fstripe_start;
				430	int ret = 0;
				431
				432	*locked_ret = false;
				433	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
				434	if (!bg_cache) {
				435	ASSERT(0);
				436	return -ENOENT;
				437	}
				438
				439	/* Profiles not based on parity don't need full stripe lock */
				440	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
				441	goto out;
				442	locks_root = &bg_cache->full_stripe_locks_root;
				443
				444	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
				445
				446	/* Now insert the full stripe lock */
				447	mutex_lock(&locks_root->lock);
				448	existing = insert_full_stripe_lock(locks_root, fstripe_start);
				449	mutex_unlock(&locks_root->lock);
				450	if (IS_ERR(existing)) {
				451	ret = PTR_ERR(existing);
				452	goto out;
				453	}
				454	mutex_lock(&existing->mutex);
				455	*locked_ret = true;
				456	out:
				457	btrfs_put_block_group(bg_cache);
				458	return ret;
				459	}
				460
				461	/*
				462	* Unlock a full stripe.
				463	*
				464	* NOTE: Caller must ensure it's the same context calling corresponding
				465	* lock_full_stripe().
				466	*
				467	* Return 0 if we unlock full stripe without problem.
				468	* Return <0 for error
				469	*/
				470	static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
				471	bool locked)
				472	{
				473	struct btrfs_block_group_cache *bg_cache;
				474	struct btrfs_full_stripe_locks_tree *locks_root;
				475	struct full_stripe_lock *fstripe_lock;
				476	u64 fstripe_start;
				477	bool freeit = false;
				478	int ret = 0;
				479
				480	/* If we didn't acquire full stripe lock, no need to continue */
				481	if (!locked)
				482	return 0;
				483
				484	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
				485	if (!bg_cache) {
				486	ASSERT(0);
				487	return -ENOENT;
				488	}
				489	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
				490	goto out;
				491
				492	locks_root = &bg_cache->full_stripe_locks_root;
				493	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
				494
				495	mutex_lock(&locks_root->lock);
				496	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
				497	/* Unpaired unlock_full_stripe() detected */
				498	if (!fstripe_lock) {
				499	WARN_ON(1);
				500	ret = -ENOENT;
				501	mutex_unlock(&locks_root->lock);
				502	goto out;
				503	}
				504
				505	if (fstripe_lock->refs == 0) {
				506	WARN_ON(1);
				507	btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
				508	fstripe_lock->logical);
				509	} else {
				510	fstripe_lock->refs--;
				511	}
				512
				513	if (fstripe_lock->refs == 0) {
				514	rb_erase(&fstripe_lock->node, &locks_root->root);
				515	freeit = true;
				516	}
				517	mutex_unlock(&locks_root->lock);
				518
				519	mutex_unlock(&fstripe_lock->mutex);
				520	if (freeit)
				521	kfree(fstripe_lock);
				522	out:
				523	btrfs_put_block_group(bg_cache);
				524	return ret;
				525	}
				526
				527	static void scrub_free_csums(struct scrub_ctx *sctx)
				528	{
				529	while (!list_empty(&sctx->csum_list)) {
				530	struct btrfs_ordered_sum *sum;
				531	sum = list_first_entry(&sctx->csum_list,
				532	struct btrfs_ordered_sum, list);
				533	list_del(&sum->list);
				534	kfree(sum);
				535	}
				536	}
				537
				538	static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
				539	{
				540	int i;
				541
				542	if (!sctx)
				543	return;
				544
				545	/* this can happen when scrub is cancelled */
				546	if (sctx->curr != -1) {
				547	struct scrub_bio *sbio = sctx->bios[sctx->curr];
				548
				549	for (i = 0; i < sbio->page_count; i++) {
				550	WARN_ON(!sbio->pagev[i]->page);
				551	scrub_block_put(sbio->pagev[i]->sblock);
				552	}
				553	bio_put(sbio->bio);
				554	}
				555
				556	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
				557	struct scrub_bio *sbio = sctx->bios[i];
				558
				559	if (!sbio)
				560	break;
				561	kfree(sbio);
				562	}
				563
				564	kfree(sctx->wr_curr_bio);
				565	scrub_free_csums(sctx);
				566	kfree(sctx);
				567	}
				568
				569	static void scrub_put_ctx(struct scrub_ctx *sctx)
				570	{
				571	if (refcount_dec_and_test(&sctx->refs))
				572	scrub_free_ctx(sctx);
				573	}
				574
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	575	static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
				576	struct btrfs_fs_info *fs_info, int is_dev_replace)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	577	{
				578	struct scrub_ctx *sctx;
				579	int i;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	580
				581	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
				582	if (!sctx)
				583	goto nomem;
				584	refcount_set(&sctx->refs, 1);
				585	sctx->is_dev_replace = is_dev_replace;
				586	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
				587	sctx->curr = -1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	588	sctx->fs_info = fs_info;
				589	INIT_LIST_HEAD(&sctx->csum_list);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	590	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
				591	struct scrub_bio *sbio;
				592
				593	sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
				594	if (!sbio)
				595	goto nomem;
				596	sctx->bios[i] = sbio;
				597
				598	sbio->index = i;
				599	sbio->sctx = sctx;
				600	sbio->page_count = 0;
				601	btrfs_init_work(&sbio->work, btrfs_scrub_helper,
				602	scrub_bio_end_io_worker, NULL, NULL);
				603
				604	if (i != SCRUB_BIOS_PER_SCTX - 1)
				605	sctx->bios[i]->next_free = i + 1;
				606	else
				607	sctx->bios[i]->next_free = -1;
				608	}
				609	sctx->first_free = 0;
				610	atomic_set(&sctx->bios_in_flight, 0);
				611	atomic_set(&sctx->workers_pending, 0);
				612	atomic_set(&sctx->cancel_req, 0);
				613	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	614
				615	spin_lock_init(&sctx->list_lock);
				616	spin_lock_init(&sctx->stat_lock);
				617	init_waitqueue_head(&sctx->list_wait);
				618
				619	WARN_ON(sctx->wr_curr_bio != NULL);
				620	mutex_init(&sctx->wr_lock);
				621	sctx->wr_curr_bio = NULL;
				622	if (is_dev_replace) {
				623	WARN_ON(!fs_info->dev_replace.tgtdev);
				624	sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
				625	sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
				626	sctx->flush_all_writes = false;
				627	}
				628
				629	return sctx;
				630
				631	nomem:
				632	scrub_free_ctx(sctx);
				633	return ERR_PTR(-ENOMEM);
				634	}
				635
				636	static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				637	void *warn_ctx)
				638	{
				639	u64 isize;
				640	u32 nlink;
				641	int ret;
				642	int i;
				643	unsigned nofs_flag;
				644	struct extent_buffer *eb;
				645	struct btrfs_inode_item *inode_item;
				646	struct scrub_warning *swarn = warn_ctx;
				647	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
				648	struct inode_fs_paths *ipath = NULL;
				649	struct btrfs_root *local_root;
				650	struct btrfs_key root_key;
				651	struct btrfs_key key;
				652
				653	root_key.objectid = root;
				654	root_key.type = BTRFS_ROOT_ITEM_KEY;
				655	root_key.offset = (u64)-1;
				656	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
				657	if (IS_ERR(local_root)) {
				658	ret = PTR_ERR(local_root);
				659	goto err;
				660	}
				661
				662	/*
				663	* this makes the path point to (inum INODE_ITEM ioff)
				664	*/
				665	key.objectid = inum;
				666	key.type = BTRFS_INODE_ITEM_KEY;
				667	key.offset = 0;
				668
				669	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
				670	if (ret) {
				671	btrfs_release_path(swarn->path);
				672	goto err;
				673	}
				674
				675	eb = swarn->path->nodes[0];
				676	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				677	struct btrfs_inode_item);
				678	isize = btrfs_inode_size(eb, inode_item);
				679	nlink = btrfs_inode_nlink(eb, inode_item);
				680	btrfs_release_path(swarn->path);
				681
				682	/*
				683	* init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
				684	* uses GFP_NOFS in this context, so we keep it consistent but it does
				685	* not seem to be strictly necessary.
				686	*/
				687	nofs_flag = memalloc_nofs_save();
				688	ipath = init_ipath(4096, local_root, swarn->path);
				689	memalloc_nofs_restore(nofs_flag);
				690	if (IS_ERR(ipath)) {
				691	ret = PTR_ERR(ipath);
				692	ipath = NULL;
				693	goto err;
				694	}
				695	ret = paths_from_inode(inum, ipath);
				696
				697	if (ret < 0)
				698	goto err;
				699
				700	/*
				701	* we deliberately ignore the bit ipath might have been too small to
				702	* hold all of the paths here
				703	*/
				704	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
				705	btrfs_warn_in_rcu(fs_info,
				706	"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
				707	swarn->errstr, swarn->logical,
				708	rcu_str_deref(swarn->dev->name),
				709	swarn->physical,
				710	root, inum, offset,
				711	min(isize - offset, (u64)PAGE_SIZE), nlink,
				712	(char *)(unsigned long)ipath->fspath->val[i]);
				713
				714	free_ipath(ipath);
				715	return 0;
				716
				717	err:
				718	btrfs_warn_in_rcu(fs_info,
				719	"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
				720	swarn->errstr, swarn->logical,
				721	rcu_str_deref(swarn->dev->name),
				722	swarn->physical,
				723	root, inum, offset, ret);
				724
				725	free_ipath(ipath);
				726	return 0;
				727	}
				728
				729	static void scrub_print_warning(const char errstr, struct scrub_block sblock)
				730	{
				731	struct btrfs_device *dev;
				732	struct btrfs_fs_info *fs_info;
				733	struct btrfs_path *path;
				734	struct btrfs_key found_key;
				735	struct extent_buffer *eb;
				736	struct btrfs_extent_item *ei;
				737	struct scrub_warning swarn;
				738	unsigned long ptr = 0;
				739	u64 extent_item_pos;
				740	u64 flags = 0;
				741	u64 ref_root;
				742	u32 item_size;
				743	u8 ref_level = 0;
				744	int ret;
				745
				746	WARN_ON(sblock->page_count < 1);
				747	dev = sblock->pagev[0]->dev;
				748	fs_info = sblock->sctx->fs_info;
				749
				750	path = btrfs_alloc_path();
				751	if (!path)
				752	return;
				753
				754	swarn.physical = sblock->pagev[0]->physical;
				755	swarn.logical = sblock->pagev[0]->logical;
				756	swarn.errstr = errstr;
				757	swarn.dev = NULL;
				758
				759	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				760	&flags);
				761	if (ret < 0)
				762	goto out;
				763
				764	extent_item_pos = swarn.logical - found_key.objectid;
				765	swarn.extent_item_size = found_key.offset;
				766
				767	eb = path->nodes[0];
				768	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
				769	item_size = btrfs_item_size_nr(eb, path->slots[0]);
				770
				771	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
				772	do {
				773	ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
				774	item_size, &ref_root,
				775	&ref_level);
				776	btrfs_warn_in_rcu(fs_info,
				777	"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
				778	errstr, swarn.logical,
				779	rcu_str_deref(dev->name),
				780	swarn.physical,
				781	ref_level ? "node" : "leaf",
				782	ret < 0 ? -1 : ref_level,
				783	ret < 0 ? -1 : ref_root);
				784	} while (ret != 1);
				785	btrfs_release_path(path);
				786	} else {
				787	btrfs_release_path(path);
				788	swarn.path = path;
				789	swarn.dev = dev;
				790	iterate_extent_inodes(fs_info, found_key.objectid,
				791	extent_item_pos, 1,
				792	scrub_print_warning_inode, &swarn, false);
				793	}
				794
				795	out:
				796	btrfs_free_path(path);
				797	}
				798
				799	static inline void scrub_get_recover(struct scrub_recover *recover)
				800	{
				801	refcount_inc(&recover->refs);
				802	}
				803
				804	static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
				805	struct scrub_recover *recover)
				806	{
				807	if (refcount_dec_and_test(&recover->refs)) {
				808	btrfs_bio_counter_dec(fs_info);
				809	btrfs_put_bbio(recover->bbio);
				810	kfree(recover);
				811	}
				812	}
				813
				814	/*
				815	* scrub_handle_errored_block gets called when either verification of the
				816	* pages failed or the bio failed to read, e.g. with EIO. In the latter
				817	* case, this function handles all pages in the bio, even though only one
				818	* may be bad.
				819	* The goal of this function is to repair the errored block by using the
				820	* contents of one of the mirrors.
				821	*/
				822	static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
				823	{
				824	struct scrub_ctx *sctx = sblock_to_check->sctx;
				825	struct btrfs_device *dev;
				826	struct btrfs_fs_info *fs_info;
				827	u64 logical;
				828	unsigned int failed_mirror_index;
				829	unsigned int is_metadata;
				830	unsigned int have_csum;
				831	struct scrub_block sblocks_for_recheck; / holds one for each mirror */
				832	struct scrub_block *sblock_bad;
				833	int ret;
				834	int mirror_index;
				835	int page_num;
				836	int success;
				837	bool full_stripe_locked;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	838	unsigned int nofs_flag;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	839	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				840	DEFAULT_RATELIMIT_BURST);
				841
				842	BUG_ON(sblock_to_check->page_count < 1);
				843	fs_info = sctx->fs_info;
				844	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
				845	/*
				846	* if we find an error in a super block, we just report it.
				847	* They will get written with the next transaction commit
				848	* anyway
				849	*/
				850	spin_lock(&sctx->stat_lock);
				851	++sctx->stat.super_errors;
				852	spin_unlock(&sctx->stat_lock);
				853	return 0;
				854	}
				855	logical = sblock_to_check->pagev[0]->logical;
				856	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
				857	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
				858	is_metadata = !(sblock_to_check->pagev[0]->flags &
				859	BTRFS_EXTENT_FLAG_DATA);
				860	have_csum = sblock_to_check->pagev[0]->have_csum;
				861	dev = sblock_to_check->pagev[0]->dev;
				862
				863	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	864	* We must use GFP_NOFS because the scrub task might be waiting for a
				865	* worker task executing this function and in turn a transaction commit
				866	* might be waiting the scrub task to pause (which needs to wait for all
				867	* the worker tasks to complete before pausing).
				868	* We do allocations in the workers through insert_full_stripe_lock()
				869	* and scrub_add_page_to_wr_bio(), which happens down the call chain of
				870	* this function.
				871	*/
				872	nofs_flag = memalloc_nofs_save();
				873	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	874	* For RAID5/6, race can happen for a different device scrub thread.
				875	* For data corruption, Parity and Data threads will both try
				876	* to recovery the data.
				877	* Race can lead to doubly added csum error, or even unrecoverable
				878	* error.
				879	*/
				880	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
				881	if (ret < 0) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	882	memalloc_nofs_restore(nofs_flag);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	883	spin_lock(&sctx->stat_lock);
				884	if (ret == -ENOMEM)
				885	sctx->stat.malloc_errors++;
				886	sctx->stat.read_errors++;
				887	sctx->stat.uncorrectable_errors++;
				888	spin_unlock(&sctx->stat_lock);
				889	return ret;
				890	}
				891
				892	/*
				893	* read all mirrors one after the other. This includes to
				894	* re-read the extent or metadata block that failed (that was
				895	* the cause that this fixup code is called) another time,
				896	* page by page this time in order to know which pages
				897	* caused I/O errors and which ones are good (for all mirrors).
				898	* It is the goal to handle the situation when more than one
				899	* mirror contains I/O errors, but the errors do not
				900	* overlap, i.e. the data can be repaired by selecting the
				901	* pages from those mirrors without I/O error on the
				902	* particular pages. One example (with blocks >= 2 * PAGE_SIZE)
				903	* would be that mirror #1 has an I/O error on the first page,
				904	* the second page is good, and mirror #2 has an I/O error on
				905	* the second page, but the first page is good.
				906	* Then the first page of the first mirror can be repaired by
				907	* taking the first page of the second mirror, and the
				908	* second page of the second mirror can be repaired by
				909	* copying the contents of the 2nd page of the 1st mirror.
				910	* One more note: if the pages of one mirror contain I/O
				911	* errors, the checksum cannot be verified. In order to get
				912	* the best data for repairing, the first attempt is to find
				913	* a mirror without I/O errors and with a validated checksum.
				914	* Only if this is not possible, the pages are picked from
				915	* mirrors with I/O errors without considering the checksum.
				916	* If the latter is the case, at the end, the checksum of the
				917	* repaired area is verified in order to correctly maintain
				918	* the statistics.
				919	*/
				920
				921	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	922	sizeof(*sblocks_for_recheck), GFP_KERNEL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	923	if (!sblocks_for_recheck) {
				924	spin_lock(&sctx->stat_lock);
				925	sctx->stat.malloc_errors++;
				926	sctx->stat.read_errors++;
				927	sctx->stat.uncorrectable_errors++;
				928	spin_unlock(&sctx->stat_lock);
				929	btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
				930	goto out;
				931	}
				932
				933	/* setup the context, map the logical blocks and alloc the pages */
				934	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
				935	if (ret) {
				936	spin_lock(&sctx->stat_lock);
				937	sctx->stat.read_errors++;
				938	sctx->stat.uncorrectable_errors++;
				939	spin_unlock(&sctx->stat_lock);
				940	btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
				941	goto out;
				942	}
				943	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
				944	sblock_bad = sblocks_for_recheck + failed_mirror_index;
				945
				946	/* build and submit the bios for the failed mirror, check checksums */
				947	scrub_recheck_block(fs_info, sblock_bad, 1);
				948
				949	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
				950	sblock_bad->no_io_error_seen) {
				951	/*
				952	* the error disappeared after reading page by page, or
				953	* the area was part of a huge bio and other parts of the
				954	* bio caused I/O errors, or the block layer merged several
				955	* read requests into one and the error is caused by a
				956	* different bio (usually one of the two latter cases is
				957	* the cause)
				958	*/
				959	spin_lock(&sctx->stat_lock);
				960	sctx->stat.unverified_errors++;
				961	sblock_to_check->data_corrected = 1;
				962	spin_unlock(&sctx->stat_lock);
				963
				964	if (sctx->is_dev_replace)
				965	scrub_write_block_to_dev_replace(sblock_bad);
				966	goto out;
				967	}
				968
				969	if (!sblock_bad->no_io_error_seen) {
				970	spin_lock(&sctx->stat_lock);
				971	sctx->stat.read_errors++;
				972	spin_unlock(&sctx->stat_lock);
				973	if (__ratelimit(&_rs))
				974	scrub_print_warning("i/o error", sblock_to_check);
				975	btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
				976	} else if (sblock_bad->checksum_error) {
				977	spin_lock(&sctx->stat_lock);
				978	sctx->stat.csum_errors++;
				979	spin_unlock(&sctx->stat_lock);
				980	if (__ratelimit(&_rs))
				981	scrub_print_warning("checksum error", sblock_to_check);
				982	btrfs_dev_stat_inc_and_print(dev,
				983	BTRFS_DEV_STAT_CORRUPTION_ERRS);
				984	} else if (sblock_bad->header_error) {
				985	spin_lock(&sctx->stat_lock);
				986	sctx->stat.verify_errors++;
				987	spin_unlock(&sctx->stat_lock);
				988	if (__ratelimit(&_rs))
				989	scrub_print_warning("checksum/header error",
				990	sblock_to_check);
				991	if (sblock_bad->generation_error)
				992	btrfs_dev_stat_inc_and_print(dev,
				993	BTRFS_DEV_STAT_GENERATION_ERRS);
				994	else
				995	btrfs_dev_stat_inc_and_print(dev,
				996	BTRFS_DEV_STAT_CORRUPTION_ERRS);
				997	}
				998
				999	if (sctx->readonly) {
				1000	ASSERT(!sctx->is_dev_replace);
				1001	goto out;
				1002	}
				1003
				1004	/*
				1005	* now build and submit the bios for the other mirrors, check
				1006	* checksums.
				1007	* First try to pick the mirror which is completely without I/O
				1008	* errors and also does not have a checksum error.
				1009	* If one is found, and if a checksum is present, the full block
				1010	* that is known to contain an error is rewritten. Afterwards
				1011	* the block is known to be corrected.
				1012	* If a mirror is found which is completely correct, and no
				1013	* checksum is present, only those pages are rewritten that had
				1014	* an I/O error in the block to be repaired, since it cannot be
				1015	* determined, which copy of the other pages is better (and it
				1016	* could happen otherwise that a correct page would be
				1017	* overwritten by a bad one).
				1018	*/
				1019	for (mirror_index = 0; ;mirror_index++) {
				1020	struct scrub_block *sblock_other;
				1021
				1022	if (mirror_index == failed_mirror_index)
				1023	continue;
				1024
				1025	/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
				1026	if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
				1027	if (mirror_index >= BTRFS_MAX_MIRRORS)
				1028	break;
				1029	if (!sblocks_for_recheck[mirror_index].page_count)
				1030	break;
				1031
				1032	sblock_other = sblocks_for_recheck + mirror_index;
				1033	} else {
				1034	struct scrub_recover *r = sblock_bad->pagev[0]->recover;
				1035	int max_allowed = r->bbio->num_stripes -
				1036	r->bbio->num_tgtdevs;
				1037
				1038	if (mirror_index >= max_allowed)
				1039	break;
				1040	if (!sblocks_for_recheck[1].page_count)
				1041	break;
				1042
				1043	ASSERT(failed_mirror_index == 0);
				1044	sblock_other = sblocks_for_recheck + 1;
				1045	sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
				1046	}
				1047
				1048	/* build and submit the bios, check checksums */
				1049	scrub_recheck_block(fs_info, sblock_other, 0);
				1050
				1051	if (!sblock_other->header_error &&
				1052	!sblock_other->checksum_error &&
				1053	sblock_other->no_io_error_seen) {
				1054	if (sctx->is_dev_replace) {
				1055	scrub_write_block_to_dev_replace(sblock_other);
				1056	goto corrected_error;
				1057	} else {
				1058	ret = scrub_repair_block_from_good_copy(
				1059	sblock_bad, sblock_other);
				1060	if (!ret)
				1061	goto corrected_error;
				1062	}
				1063	}
				1064	}
				1065
				1066	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
				1067	goto did_not_correct_error;
				1068
				1069	/*
				1070	* In case of I/O errors in the area that is supposed to be
				1071	* repaired, continue by picking good copies of those pages.
				1072	* Select the good pages from mirrors to rewrite bad pages from
				1073	* the area to fix. Afterwards verify the checksum of the block
				1074	* that is supposed to be repaired. This verification step is
				1075	* only done for the purpose of statistic counting and for the
				1076	* final scrub report, whether errors remain.
				1077	* A perfect algorithm could make use of the checksum and try
				1078	* all possible combinations of pages from the different mirrors
				1079	* until the checksum verification succeeds. For example, when
				1080	* the 2nd page of mirror #1 faces I/O errors, and the 2nd page
				1081	* of mirror #2 is readable but the final checksum test fails,
				1082	* then the 2nd page of mirror #3 could be tried, whether now
				1083	* the final checksum succeeds. But this would be a rare
				1084	* exception and is therefore not implemented. At least it is
				1085	* avoided that the good copy is overwritten.
				1086	* A more useful improvement would be to pick the sectors
				1087	* without I/O error based on sector sizes (512 bytes on legacy
				1088	* disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
				1089	* mirror could be repaired by taking 512 byte of a different
				1090	* mirror, even if other 512 byte sectors in the same PAGE_SIZE
				1091	* area are unreadable.
				1092	*/
				1093	success = 1;
				1094	for (page_num = 0; page_num < sblock_bad->page_count;
				1095	page_num++) {
				1096	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
				1097	struct scrub_block *sblock_other = NULL;
				1098
				1099	/* skip no-io-error page in scrub */
				1100	if (!page_bad->io_error && !sctx->is_dev_replace)
				1101	continue;
				1102
				1103	if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
				1104	/*
				1105	* In case of dev replace, if raid56 rebuild process
				1106	* didn't work out correct data, then copy the content
				1107	* in sblock_bad to make sure target device is identical
				1108	* to source device, instead of writing garbage data in
				1109	* sblock_for_recheck array to target device.
				1110	*/
				1111	sblock_other = NULL;
				1112	} else if (page_bad->io_error) {
				1113	/* try to find no-io-error page in mirrors */
				1114	for (mirror_index = 0;
				1115	mirror_index < BTRFS_MAX_MIRRORS &&
				1116	sblocks_for_recheck[mirror_index].page_count > 0;
				1117	mirror_index++) {
				1118	if (!sblocks_for_recheck[mirror_index].
				1119	pagev[page_num]->io_error) {
				1120	sblock_other = sblocks_for_recheck +
				1121	mirror_index;
				1122	break;
				1123	}
				1124	}
				1125	if (!sblock_other)
				1126	success = 0;
				1127	}
				1128
				1129	if (sctx->is_dev_replace) {
				1130	/*
				1131	* did not find a mirror to fetch the page
				1132	* from. scrub_write_page_to_dev_replace()
				1133	* handles this case (page->io_error), by
				1134	* filling the block with zeros before
				1135	* submitting the write request
				1136	*/
				1137	if (!sblock_other)
				1138	sblock_other = sblock_bad;
				1139
				1140	if (scrub_write_page_to_dev_replace(sblock_other,
				1141	page_num) != 0) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1142	atomic64_inc(
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1143	&fs_info->dev_replace.num_write_errors);
				1144	success = 0;
				1145	}
				1146	} else if (sblock_other) {
				1147	ret = scrub_repair_page_from_good_copy(sblock_bad,
				1148	sblock_other,
				1149	page_num, 0);
				1150	if (0 == ret)
				1151	page_bad->io_error = 0;
				1152	else
				1153	success = 0;
				1154	}
				1155	}
				1156
				1157	if (success && !sctx->is_dev_replace) {
				1158	if (is_metadata \|\| have_csum) {
				1159	/*
				1160	* need to verify the checksum now that all
				1161	* sectors on disk are repaired (the write
				1162	* request for data to be repaired is on its way).
				1163	* Just be lazy and use scrub_recheck_block()
				1164	* which re-reads the data before the checksum
				1165	* is verified, but most likely the data comes out
				1166	* of the page cache.
				1167	*/
				1168	scrub_recheck_block(fs_info, sblock_bad, 1);
				1169	if (!sblock_bad->header_error &&
				1170	!sblock_bad->checksum_error &&
				1171	sblock_bad->no_io_error_seen)
				1172	goto corrected_error;
				1173	else
				1174	goto did_not_correct_error;
				1175	} else {
				1176	corrected_error:
				1177	spin_lock(&sctx->stat_lock);
				1178	sctx->stat.corrected_errors++;
				1179	sblock_to_check->data_corrected = 1;
				1180	spin_unlock(&sctx->stat_lock);
				1181	btrfs_err_rl_in_rcu(fs_info,
				1182	"fixed up error at logical %llu on dev %s",
				1183	logical, rcu_str_deref(dev->name));
				1184	}
				1185	} else {
				1186	did_not_correct_error:
				1187	spin_lock(&sctx->stat_lock);
				1188	sctx->stat.uncorrectable_errors++;
				1189	spin_unlock(&sctx->stat_lock);
				1190	btrfs_err_rl_in_rcu(fs_info,
				1191	"unable to fixup (regular) error at logical %llu on dev %s",
				1192	logical, rcu_str_deref(dev->name));
				1193	}
				1194
				1195	out:
				1196	if (sblocks_for_recheck) {
				1197	for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
				1198	mirror_index++) {
				1199	struct scrub_block *sblock = sblocks_for_recheck +
				1200	mirror_index;
				1201	struct scrub_recover *recover;
				1202	int page_index;
				1203
				1204	for (page_index = 0; page_index < sblock->page_count;
				1205	page_index++) {
				1206	sblock->pagev[page_index]->sblock = NULL;
				1207	recover = sblock->pagev[page_index]->recover;
				1208	if (recover) {
				1209	scrub_put_recover(fs_info, recover);
				1210	sblock->pagev[page_index]->recover =
				1211	NULL;
				1212	}
				1213	scrub_page_put(sblock->pagev[page_index]);
				1214	}
				1215	}
				1216	kfree(sblocks_for_recheck);
				1217	}
				1218
				1219	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1220	memalloc_nofs_restore(nofs_flag);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1221	if (ret < 0)
				1222	return ret;
				1223	return 0;
				1224	}
				1225
				1226	static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
				1227	{
				1228	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
				1229	return 2;
				1230	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
				1231	return 3;
				1232	else
				1233	return (int)bbio->num_stripes;
				1234	}
				1235
				1236	static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
				1237	u64 *raid_map,
				1238	u64 mapped_length,
				1239	int nstripes, int mirror,
				1240	int *stripe_index,
				1241	u64 *stripe_offset)
				1242	{
				1243	int i;
				1244
				1245	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				1246	/* RAID5/6 */
				1247	for (i = 0; i < nstripes; i++) {
				1248	if (raid_map[i] == RAID6_Q_STRIPE \|\|
				1249	raid_map[i] == RAID5_P_STRIPE)
				1250	continue;
				1251
				1252	if (logical >= raid_map[i] &&
				1253	logical < raid_map[i] + mapped_length)
				1254	break;
				1255	}
				1256
				1257	*stripe_index = i;
				1258	*stripe_offset = logical - raid_map[i];
				1259	} else {
				1260	/* The other RAID type */
				1261	*stripe_index = mirror;
				1262	*stripe_offset = 0;
				1263	}
				1264	}
				1265
				1266	static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				1267	struct scrub_block *sblocks_for_recheck)
				1268	{
				1269	struct scrub_ctx *sctx = original_sblock->sctx;
				1270	struct btrfs_fs_info *fs_info = sctx->fs_info;
				1271	u64 length = original_sblock->page_count * PAGE_SIZE;
				1272	u64 logical = original_sblock->pagev[0]->logical;
				1273	u64 generation = original_sblock->pagev[0]->generation;
				1274	u64 flags = original_sblock->pagev[0]->flags;
				1275	u64 have_csum = original_sblock->pagev[0]->have_csum;
				1276	struct scrub_recover *recover;
				1277	struct btrfs_bio *bbio;
				1278	u64 sublen;
				1279	u64 mapped_length;
				1280	u64 stripe_offset;
				1281	int stripe_index;
				1282	int page_index = 0;
				1283	int mirror_index;
				1284	int nmirrors;
				1285	int ret;
				1286
				1287	/*
				1288	* note: the two members refs and outstanding_pages
				1289	* are not used (and not set) in the blocks that are used for
				1290	* the recheck procedure
				1291	*/
				1292
				1293	while (length > 0) {
				1294	sublen = min_t(u64, length, PAGE_SIZE);
				1295	mapped_length = sublen;
				1296	bbio = NULL;
				1297
				1298	/*
				1299	* with a length of PAGE_SIZE, each returned stripe
				1300	* represents one mirror
				1301	*/
				1302	btrfs_bio_counter_inc_blocked(fs_info);
				1303	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				1304	logical, &mapped_length, &bbio);
				1305	if (ret \|\| !bbio \|\| mapped_length < sublen) {
				1306	btrfs_put_bbio(bbio);
				1307	btrfs_bio_counter_dec(fs_info);
				1308	return -EIO;
				1309	}
				1310
				1311	recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
				1312	if (!recover) {
				1313	btrfs_put_bbio(bbio);
				1314	btrfs_bio_counter_dec(fs_info);
				1315	return -ENOMEM;
				1316	}
				1317
				1318	refcount_set(&recover->refs, 1);
				1319	recover->bbio = bbio;
				1320	recover->map_length = mapped_length;
				1321
				1322	BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
				1323
				1324	nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
				1325
				1326	for (mirror_index = 0; mirror_index < nmirrors;
				1327	mirror_index++) {
				1328	struct scrub_block *sblock;
				1329	struct scrub_page *page;
				1330
				1331	sblock = sblocks_for_recheck + mirror_index;
				1332	sblock->sctx = sctx;
				1333
				1334	page = kzalloc(sizeof(*page), GFP_NOFS);
				1335	if (!page) {
				1336	leave_nomem:
				1337	spin_lock(&sctx->stat_lock);
				1338	sctx->stat.malloc_errors++;
				1339	spin_unlock(&sctx->stat_lock);
				1340	scrub_put_recover(fs_info, recover);
				1341	return -ENOMEM;
				1342	}
				1343	scrub_page_get(page);
				1344	sblock->pagev[page_index] = page;
				1345	page->sblock = sblock;
				1346	page->flags = flags;
				1347	page->generation = generation;
				1348	page->logical = logical;
				1349	page->have_csum = have_csum;
				1350	if (have_csum)
				1351	memcpy(page->csum,
				1352	original_sblock->pagev[0]->csum,
				1353	sctx->csum_size);
				1354
				1355	scrub_stripe_index_and_offset(logical,
				1356	bbio->map_type,
				1357	bbio->raid_map,
				1358	mapped_length,
				1359	bbio->num_stripes -
				1360	bbio->num_tgtdevs,
				1361	mirror_index,
				1362	&stripe_index,
				1363	&stripe_offset);
				1364	page->physical = bbio->stripes[stripe_index].physical +
				1365	stripe_offset;
				1366	page->dev = bbio->stripes[stripe_index].dev;
				1367
				1368	BUG_ON(page_index >= original_sblock->page_count);
				1369	page->physical_for_dev_replace =
				1370	original_sblock->pagev[page_index]->
				1371	physical_for_dev_replace;
				1372	/* for missing devices, dev->bdev is NULL */
				1373	page->mirror_num = mirror_index + 1;
				1374	sblock->page_count++;
				1375	page->page = alloc_page(GFP_NOFS);
				1376	if (!page->page)
				1377	goto leave_nomem;
				1378
				1379	scrub_get_recover(recover);
				1380	page->recover = recover;
				1381	}
				1382	scrub_put_recover(fs_info, recover);
				1383	length -= sublen;
				1384	logical += sublen;
				1385	page_index++;
				1386	}
				1387
				1388	return 0;
				1389	}
				1390
				1391	static void scrub_bio_wait_endio(struct bio *bio)
				1392	{
				1393	complete(bio->bi_private);
				1394	}
				1395
				1396	static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
				1397	struct bio *bio,
				1398	struct scrub_page *page)
				1399	{
				1400	DECLARE_COMPLETION_ONSTACK(done);
				1401	int ret;
				1402	int mirror_num;
				1403
				1404	bio->bi_iter.bi_sector = page->logical >> 9;
				1405	bio->bi_private = &done;
				1406	bio->bi_end_io = scrub_bio_wait_endio;
				1407
				1408	mirror_num = page->sblock->pagev[0]->mirror_num;
				1409	ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
				1410	page->recover->map_length,
				1411	mirror_num, 0);
				1412	if (ret)
				1413	return ret;
				1414
				1415	wait_for_completion_io(&done);
				1416	return blk_status_to_errno(bio->bi_status);
				1417	}
				1418
				1419	static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
				1420	struct scrub_block *sblock)
				1421	{
				1422	struct scrub_page *first_page = sblock->pagev[0];
				1423	struct bio *bio;
				1424	int page_num;
				1425
				1426	/* All pages in sblock belong to the same stripe on the same device. */
				1427	ASSERT(first_page->dev);
				1428	if (!first_page->dev->bdev)
				1429	goto out;
				1430
				1431	bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
				1432	bio_set_dev(bio, first_page->dev->bdev);
				1433
				1434	for (page_num = 0; page_num < sblock->page_count; page_num++) {
				1435	struct scrub_page *page = sblock->pagev[page_num];
				1436
				1437	WARN_ON(!page->page);
				1438	bio_add_page(bio, page->page, PAGE_SIZE, 0);
				1439	}
				1440
				1441	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
				1442	bio_put(bio);
				1443	goto out;
				1444	}
				1445
				1446	bio_put(bio);
				1447
				1448	scrub_recheck_block_checksum(sblock);
				1449
				1450	return;
				1451	out:
				1452	for (page_num = 0; page_num < sblock->page_count; page_num++)
				1453	sblock->pagev[page_num]->io_error = 1;
				1454
				1455	sblock->no_io_error_seen = 0;
				1456	}
				1457
				1458	/*
				1459	* this function will check the on disk data for checksum errors, header
				1460	* errors and read I/O errors. If any I/O errors happen, the exact pages
				1461	* which are errored are marked as being bad. The goal is to enable scrub
				1462	* to take those pages that are not errored from all the mirrors so that
				1463	* the pages that are errored in the just handled mirror can be repaired.
				1464	*/
				1465	static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				1466	struct scrub_block *sblock,
				1467	int retry_failed_mirror)
				1468	{
				1469	int page_num;
				1470
				1471	sblock->no_io_error_seen = 1;
				1472
				1473	/* short cut for raid56 */
				1474	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
				1475	return scrub_recheck_block_on_raid56(fs_info, sblock);
				1476
				1477	for (page_num = 0; page_num < sblock->page_count; page_num++) {
				1478	struct bio *bio;
				1479	struct scrub_page *page = sblock->pagev[page_num];
				1480
				1481	if (page->dev->bdev == NULL) {
				1482	page->io_error = 1;
				1483	sblock->no_io_error_seen = 0;
				1484	continue;
				1485	}
				1486
				1487	WARN_ON(!page->page);
				1488	bio = btrfs_io_bio_alloc(1);
				1489	bio_set_dev(bio, page->dev->bdev);
				1490
				1491	bio_add_page(bio, page->page, PAGE_SIZE, 0);
				1492	bio->bi_iter.bi_sector = page->physical >> 9;
				1493	bio->bi_opf = REQ_OP_READ;
				1494
				1495	if (btrfsic_submit_bio_wait(bio)) {
				1496	page->io_error = 1;
				1497	sblock->no_io_error_seen = 0;
				1498	}
				1499
				1500	bio_put(bio);
				1501	}
				1502
				1503	if (sblock->no_io_error_seen)
				1504	scrub_recheck_block_checksum(sblock);
				1505	}
				1506
				1507	static inline int scrub_check_fsid(u8 fsid[],
				1508	struct scrub_page *spage)
				1509	{
				1510	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
				1511	int ret;
				1512
				1513	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
				1514	return !ret;
				1515	}
				1516
				1517	static void scrub_recheck_block_checksum(struct scrub_block *sblock)
				1518	{
				1519	sblock->header_error = 0;
				1520	sblock->checksum_error = 0;
				1521	sblock->generation_error = 0;
				1522
				1523	if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
				1524	scrub_checksum_data(sblock);
				1525	else
				1526	scrub_checksum_tree_block(sblock);
				1527	}
				1528
				1529	static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
				1530	struct scrub_block *sblock_good)
				1531	{
				1532	int page_num;
				1533	int ret = 0;
				1534
				1535	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
				1536	int ret_sub;
				1537
				1538	ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
				1539	sblock_good,
				1540	page_num, 1);
				1541	if (ret_sub)
				1542	ret = ret_sub;
				1543	}
				1544
				1545	return ret;
				1546	}
				1547
				1548	static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
				1549	struct scrub_block *sblock_good,
				1550	int page_num, int force_write)
				1551	{
				1552	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
				1553	struct scrub_page *page_good = sblock_good->pagev[page_num];
				1554	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
				1555
				1556	BUG_ON(page_bad->page == NULL);
				1557	BUG_ON(page_good->page == NULL);
				1558	if (force_write \|\| sblock_bad->header_error \|\|
				1559	sblock_bad->checksum_error \|\| page_bad->io_error) {
				1560	struct bio *bio;
				1561	int ret;
				1562
				1563	if (!page_bad->dev->bdev) {
				1564	btrfs_warn_rl(fs_info,
				1565	"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
				1566	return -EIO;
				1567	}
				1568
				1569	bio = btrfs_io_bio_alloc(1);
				1570	bio_set_dev(bio, page_bad->dev->bdev);
				1571	bio->bi_iter.bi_sector = page_bad->physical >> 9;
				1572	bio->bi_opf = REQ_OP_WRITE;
				1573
				1574	ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
				1575	if (PAGE_SIZE != ret) {
				1576	bio_put(bio);
				1577	return -EIO;
				1578	}
				1579
				1580	if (btrfsic_submit_bio_wait(bio)) {
				1581	btrfs_dev_stat_inc_and_print(page_bad->dev,
				1582	BTRFS_DEV_STAT_WRITE_ERRS);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1583	atomic64_inc(&fs_info->dev_replace.num_write_errors);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1584	bio_put(bio);
				1585	return -EIO;
				1586	}
				1587	bio_put(bio);
				1588	}
				1589
				1590	return 0;
				1591	}
				1592
				1593	static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
				1594	{
				1595	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
				1596	int page_num;
				1597
				1598	/*
				1599	* This block is used for the check of the parity on the source device,
				1600	* so the data needn't be written into the destination device.
				1601	*/
				1602	if (sblock->sparity)
				1603	return;
				1604
				1605	for (page_num = 0; page_num < sblock->page_count; page_num++) {
				1606	int ret;
				1607
				1608	ret = scrub_write_page_to_dev_replace(sblock, page_num);
				1609	if (ret)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1610	atomic64_inc(&fs_info->dev_replace.num_write_errors);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1611	}
				1612	}
				1613
				1614	static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
				1615	int page_num)
				1616	{
				1617	struct scrub_page *spage = sblock->pagev[page_num];
				1618
				1619	BUG_ON(spage->page == NULL);
				1620	if (spage->io_error) {
				1621	void *mapped_buffer = kmap_atomic(spage->page);
				1622
				1623	clear_page(mapped_buffer);
				1624	flush_dcache_page(spage->page);
				1625	kunmap_atomic(mapped_buffer);
				1626	}
				1627	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
				1628	}
				1629
				1630	static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				1631	struct scrub_page *spage)
				1632	{
				1633	struct scrub_bio *sbio;
				1634	int ret;
				1635
				1636	mutex_lock(&sctx->wr_lock);
				1637	again:
				1638	if (!sctx->wr_curr_bio) {
				1639	sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
				1640	GFP_KERNEL);
				1641	if (!sctx->wr_curr_bio) {
				1642	mutex_unlock(&sctx->wr_lock);
				1643	return -ENOMEM;
				1644	}
				1645	sctx->wr_curr_bio->sctx = sctx;
				1646	sctx->wr_curr_bio->page_count = 0;
				1647	}
				1648	sbio = sctx->wr_curr_bio;
				1649	if (sbio->page_count == 0) {
				1650	struct bio *bio;
				1651
				1652	sbio->physical = spage->physical_for_dev_replace;
				1653	sbio->logical = spage->logical;
				1654	sbio->dev = sctx->wr_tgtdev;
				1655	bio = sbio->bio;
				1656	if (!bio) {
				1657	bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
				1658	sbio->bio = bio;
				1659	}
				1660
				1661	bio->bi_private = sbio;
				1662	bio->bi_end_io = scrub_wr_bio_end_io;
				1663	bio_set_dev(bio, sbio->dev->bdev);
				1664	bio->bi_iter.bi_sector = sbio->physical >> 9;
				1665	bio->bi_opf = REQ_OP_WRITE;
				1666	sbio->status = 0;
				1667	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
				1668	spage->physical_for_dev_replace \|\|
				1669	sbio->logical + sbio->page_count * PAGE_SIZE !=
				1670	spage->logical) {
				1671	scrub_wr_submit(sctx);
				1672	goto again;
				1673	}
				1674
				1675	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
				1676	if (ret != PAGE_SIZE) {
				1677	if (sbio->page_count < 1) {
				1678	bio_put(sbio->bio);
				1679	sbio->bio = NULL;
				1680	mutex_unlock(&sctx->wr_lock);
				1681	return -EIO;
				1682	}
				1683	scrub_wr_submit(sctx);
				1684	goto again;
				1685	}
				1686
				1687	sbio->pagev[sbio->page_count] = spage;
				1688	scrub_page_get(spage);
				1689	sbio->page_count++;
				1690	if (sbio->page_count == sctx->pages_per_wr_bio)
				1691	scrub_wr_submit(sctx);
				1692	mutex_unlock(&sctx->wr_lock);
				1693
				1694	return 0;
				1695	}
				1696
				1697	static void scrub_wr_submit(struct scrub_ctx *sctx)
				1698	{
				1699	struct scrub_bio *sbio;
				1700
				1701	if (!sctx->wr_curr_bio)
				1702	return;
				1703
				1704	sbio = sctx->wr_curr_bio;
				1705	sctx->wr_curr_bio = NULL;
				1706	WARN_ON(!sbio->bio->bi_disk);
				1707	scrub_pending_bio_inc(sctx);
				1708	/* process all writes in a single worker thread. Then the block layer
				1709	* orders the requests before sending them to the driver which
				1710	* doubled the write performance on spinning disks when measured
				1711	* with Linux 3.5 */
				1712	btrfsic_submit_bio(sbio->bio);
				1713	}
				1714
				1715	static void scrub_wr_bio_end_io(struct bio *bio)
				1716	{
				1717	struct scrub_bio *sbio = bio->bi_private;
				1718	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
				1719
				1720	sbio->status = bio->bi_status;
				1721	sbio->bio = bio;
				1722
				1723	btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
				1724	scrub_wr_bio_end_io_worker, NULL, NULL);
				1725	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
				1726	}
				1727
				1728	static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
				1729	{
				1730	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
				1731	struct scrub_ctx *sctx = sbio->sctx;
				1732	int i;
				1733
				1734	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
				1735	if (sbio->status) {
				1736	struct btrfs_dev_replace *dev_replace =
				1737	&sbio->sctx->fs_info->dev_replace;
				1738
				1739	for (i = 0; i < sbio->page_count; i++) {
				1740	struct scrub_page *spage = sbio->pagev[i];
				1741
				1742	spage->io_error = 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1743	atomic64_inc(&dev_replace->num_write_errors);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1744	}
				1745	}
				1746
				1747	for (i = 0; i < sbio->page_count; i++)
				1748	scrub_page_put(sbio->pagev[i]);
				1749
				1750	bio_put(sbio->bio);
				1751	kfree(sbio);
				1752	scrub_pending_bio_dec(sctx);
				1753	}
				1754
				1755	static int scrub_checksum(struct scrub_block *sblock)
				1756	{
				1757	u64 flags;
				1758	int ret;
				1759
				1760	/*
				1761	* No need to initialize these stats currently,
				1762	* because this function only use return value
				1763	* instead of these stats value.
				1764	*
				1765	* Todo:
				1766	* always use stats
				1767	*/
				1768	sblock->header_error = 0;
				1769	sblock->generation_error = 0;
				1770	sblock->checksum_error = 0;
				1771
				1772	WARN_ON(sblock->page_count < 1);
				1773	flags = sblock->pagev[0]->flags;
				1774	ret = 0;
				1775	if (flags & BTRFS_EXTENT_FLAG_DATA)
				1776	ret = scrub_checksum_data(sblock);
				1777	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
				1778	ret = scrub_checksum_tree_block(sblock);
				1779	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
				1780	(void)scrub_checksum_super(sblock);
				1781	else
				1782	WARN_ON(1);
				1783	if (ret)
				1784	scrub_handle_errored_block(sblock);
				1785
				1786	return ret;
				1787	}
				1788
				1789	static int scrub_checksum_data(struct scrub_block *sblock)
				1790	{
				1791	struct scrub_ctx *sctx = sblock->sctx;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1792	struct btrfs_fs_info *fs_info = sctx->fs_info;
				1793	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1794	u8 csum[BTRFS_CSUM_SIZE];
				1795	u8 *on_disk_csum;
				1796	struct page *page;
				1797	void *buffer;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1798	u64 len;
				1799	int index;
				1800
				1801	BUG_ON(sblock->page_count < 1);
				1802	if (!sblock->pagev[0]->have_csum)
				1803	return 0;
				1804
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1805	shash->tfm = fs_info->csum_shash;
				1806	crypto_shash_init(shash);
				1807
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1808	on_disk_csum = sblock->pagev[0]->csum;
				1809	page = sblock->pagev[0]->page;
				1810	buffer = kmap_atomic(page);
				1811
				1812	len = sctx->fs_info->sectorsize;
				1813	index = 0;
				1814	for (;;) {
				1815	u64 l = min_t(u64, len, PAGE_SIZE);
				1816
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1817	crypto_shash_update(shash, buffer, l);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1818	kunmap_atomic(buffer);
				1819	len -= l;
				1820	if (len == 0)
				1821	break;
				1822	index++;
				1823	BUG_ON(index >= sblock->page_count);
				1824	BUG_ON(!sblock->pagev[index]->page);
				1825	page = sblock->pagev[index]->page;
				1826	buffer = kmap_atomic(page);
				1827	}
				1828
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1829	crypto_shash_final(shash, csum);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1830	if (memcmp(csum, on_disk_csum, sctx->csum_size))
				1831	sblock->checksum_error = 1;
				1832
				1833	return sblock->checksum_error;
				1834	}
				1835
				1836	static int scrub_checksum_tree_block(struct scrub_block *sblock)
				1837	{
				1838	struct scrub_ctx *sctx = sblock->sctx;
				1839	struct btrfs_header *h;
				1840	struct btrfs_fs_info *fs_info = sctx->fs_info;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1841	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1842	u8 calculated_csum[BTRFS_CSUM_SIZE];
				1843	u8 on_disk_csum[BTRFS_CSUM_SIZE];
				1844	struct page *page;
				1845	void *mapped_buffer;
				1846	u64 mapped_size;
				1847	void *p;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1848	u64 len;
				1849	int index;
				1850
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1851	shash->tfm = fs_info->csum_shash;
				1852	crypto_shash_init(shash);
				1853
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1854	BUG_ON(sblock->page_count < 1);
				1855	page = sblock->pagev[0]->page;
				1856	mapped_buffer = kmap_atomic(page);
				1857	h = (struct btrfs_header *)mapped_buffer;
				1858	memcpy(on_disk_csum, h->csum, sctx->csum_size);
				1859
				1860	/*
				1861	* we don't use the getter functions here, as we
				1862	* a) don't have an extent buffer and
				1863	* b) the page is already kmapped
				1864	*/
				1865	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
				1866	sblock->header_error = 1;
				1867
				1868	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
				1869	sblock->header_error = 1;
				1870	sblock->generation_error = 1;
				1871	}
				1872
				1873	if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
				1874	sblock->header_error = 1;
				1875
				1876	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
				1877	BTRFS_UUID_SIZE))
				1878	sblock->header_error = 1;
				1879
				1880	len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
				1881	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
				1882	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
				1883	index = 0;
				1884	for (;;) {
				1885	u64 l = min_t(u64, len, mapped_size);
				1886
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1887	crypto_shash_update(shash, p, l);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1888	kunmap_atomic(mapped_buffer);
				1889	len -= l;
				1890	if (len == 0)
				1891	break;
				1892	index++;
				1893	BUG_ON(index >= sblock->page_count);
				1894	BUG_ON(!sblock->pagev[index]->page);
				1895	page = sblock->pagev[index]->page;
				1896	mapped_buffer = kmap_atomic(page);
				1897	mapped_size = PAGE_SIZE;
				1898	p = mapped_buffer;
				1899	}
				1900
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1901	crypto_shash_final(shash, calculated_csum);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1902	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
				1903	sblock->checksum_error = 1;
				1904
				1905	return sblock->header_error \|\| sblock->checksum_error;
				1906	}
				1907
				1908	static int scrub_checksum_super(struct scrub_block *sblock)
				1909	{
				1910	struct btrfs_super_block *s;
				1911	struct scrub_ctx *sctx = sblock->sctx;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1912	struct btrfs_fs_info *fs_info = sctx->fs_info;
				1913	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1914	u8 calculated_csum[BTRFS_CSUM_SIZE];
				1915	u8 on_disk_csum[BTRFS_CSUM_SIZE];
				1916	struct page *page;
				1917	void *mapped_buffer;
				1918	u64 mapped_size;
				1919	void *p;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1920	int fail_gen = 0;
				1921	int fail_cor = 0;
				1922	u64 len;
				1923	int index;
				1924
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1925	shash->tfm = fs_info->csum_shash;
				1926	crypto_shash_init(shash);
				1927
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1928	BUG_ON(sblock->page_count < 1);
				1929	page = sblock->pagev[0]->page;
				1930	mapped_buffer = kmap_atomic(page);
				1931	s = (struct btrfs_super_block *)mapped_buffer;
				1932	memcpy(on_disk_csum, s->csum, sctx->csum_size);
				1933
				1934	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
				1935	++fail_cor;
				1936
				1937	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
				1938	++fail_gen;
				1939
				1940	if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
				1941	++fail_cor;
				1942
				1943	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
				1944	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
				1945	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
				1946	index = 0;
				1947	for (;;) {
				1948	u64 l = min_t(u64, len, mapped_size);
				1949
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1950	crypto_shash_update(shash, p, l);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1951	kunmap_atomic(mapped_buffer);
				1952	len -= l;
				1953	if (len == 0)
				1954	break;
				1955	index++;
				1956	BUG_ON(index >= sblock->page_count);
				1957	BUG_ON(!sblock->pagev[index]->page);
				1958	page = sblock->pagev[index]->page;
				1959	mapped_buffer = kmap_atomic(page);
				1960	mapped_size = PAGE_SIZE;
				1961	p = mapped_buffer;
				1962	}
				1963
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1964	crypto_shash_final(shash, calculated_csum);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1965	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
				1966	++fail_cor;
				1967
				1968	if (fail_cor + fail_gen) {
				1969	/*
				1970	* if we find an error in a super block, we just report it.
				1971	* They will get written with the next transaction commit
				1972	* anyway
				1973	*/
				1974	spin_lock(&sctx->stat_lock);
				1975	++sctx->stat.super_errors;
				1976	spin_unlock(&sctx->stat_lock);
				1977	if (fail_cor)
				1978	btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
				1979	BTRFS_DEV_STAT_CORRUPTION_ERRS);
				1980	else
				1981	btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
				1982	BTRFS_DEV_STAT_GENERATION_ERRS);
				1983	}
				1984
				1985	return fail_cor + fail_gen;
				1986	}
				1987
				1988	static void scrub_block_get(struct scrub_block *sblock)
				1989	{
				1990	refcount_inc(&sblock->refs);
				1991	}
				1992
				1993	static void scrub_block_put(struct scrub_block *sblock)
				1994	{
				1995	if (refcount_dec_and_test(&sblock->refs)) {
				1996	int i;
				1997
				1998	if (sblock->sparity)
				1999	scrub_parity_put(sblock->sparity);
				2000
				2001	for (i = 0; i < sblock->page_count; i++)
				2002	scrub_page_put(sblock->pagev[i]);
				2003	kfree(sblock);
				2004	}
				2005	}
				2006
				2007	static void scrub_page_get(struct scrub_page *spage)
				2008	{
				2009	atomic_inc(&spage->refs);
				2010	}
				2011
				2012	static void scrub_page_put(struct scrub_page *spage)
				2013	{
				2014	if (atomic_dec_and_test(&spage->refs)) {
				2015	if (spage->page)
				2016	__free_page(spage->page);
				2017	kfree(spage);
				2018	}
				2019	}
				2020
				2021	static void scrub_submit(struct scrub_ctx *sctx)
				2022	{
				2023	struct scrub_bio *sbio;
				2024
				2025	if (sctx->curr == -1)
				2026	return;
				2027
				2028	sbio = sctx->bios[sctx->curr];
				2029	sctx->curr = -1;
				2030	scrub_pending_bio_inc(sctx);
				2031	btrfsic_submit_bio(sbio->bio);
				2032	}
				2033
				2034	static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				2035	struct scrub_page *spage)
				2036	{
				2037	struct scrub_block *sblock = spage->sblock;
				2038	struct scrub_bio *sbio;
				2039	int ret;
				2040
				2041	again:
				2042	/*
				2043	* grab a fresh bio or wait for one to become available
				2044	*/
				2045	while (sctx->curr == -1) {
				2046	spin_lock(&sctx->list_lock);
				2047	sctx->curr = sctx->first_free;
				2048	if (sctx->curr != -1) {
				2049	sctx->first_free = sctx->bios[sctx->curr]->next_free;
				2050	sctx->bios[sctx->curr]->next_free = -1;
				2051	sctx->bios[sctx->curr]->page_count = 0;
				2052	spin_unlock(&sctx->list_lock);
				2053	} else {
				2054	spin_unlock(&sctx->list_lock);
				2055	wait_event(sctx->list_wait, sctx->first_free != -1);
				2056	}
				2057	}
				2058	sbio = sctx->bios[sctx->curr];
				2059	if (sbio->page_count == 0) {
				2060	struct bio *bio;
				2061
				2062	sbio->physical = spage->physical;
				2063	sbio->logical = spage->logical;
				2064	sbio->dev = spage->dev;
				2065	bio = sbio->bio;
				2066	if (!bio) {
				2067	bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
				2068	sbio->bio = bio;
				2069	}
				2070
				2071	bio->bi_private = sbio;
				2072	bio->bi_end_io = scrub_bio_end_io;
				2073	bio_set_dev(bio, sbio->dev->bdev);
				2074	bio->bi_iter.bi_sector = sbio->physical >> 9;
				2075	bio->bi_opf = REQ_OP_READ;
				2076	sbio->status = 0;
				2077	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
				2078	spage->physical \|\|
				2079	sbio->logical + sbio->page_count * PAGE_SIZE !=
				2080	spage->logical \|\|
				2081	sbio->dev != spage->dev) {
				2082	scrub_submit(sctx);
				2083	goto again;
				2084	}
				2085
				2086	sbio->pagev[sbio->page_count] = spage;
				2087	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
				2088	if (ret != PAGE_SIZE) {
				2089	if (sbio->page_count < 1) {
				2090	bio_put(sbio->bio);
				2091	sbio->bio = NULL;
				2092	return -EIO;
				2093	}
				2094	scrub_submit(sctx);
				2095	goto again;
				2096	}
				2097
				2098	scrub_block_get(sblock); /* one for the page added to the bio */
				2099	atomic_inc(&sblock->outstanding_pages);
				2100	sbio->page_count++;
				2101	if (sbio->page_count == sctx->pages_per_rd_bio)
				2102	scrub_submit(sctx);
				2103
				2104	return 0;
				2105	}
				2106
				2107	static void scrub_missing_raid56_end_io(struct bio *bio)
				2108	{
				2109	struct scrub_block *sblock = bio->bi_private;
				2110	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
				2111
				2112	if (bio->bi_status)
				2113	sblock->no_io_error_seen = 0;
				2114
				2115	bio_put(bio);
				2116
				2117	btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
				2118	}
				2119
				2120	static void scrub_missing_raid56_worker(struct btrfs_work *work)
				2121	{
				2122	struct scrub_block *sblock = container_of(work, struct scrub_block, work);
				2123	struct scrub_ctx *sctx = sblock->sctx;
				2124	struct btrfs_fs_info *fs_info = sctx->fs_info;
				2125	u64 logical;
				2126	struct btrfs_device *dev;
				2127
				2128	logical = sblock->pagev[0]->logical;
				2129	dev = sblock->pagev[0]->dev;
				2130
				2131	if (sblock->no_io_error_seen)
				2132	scrub_recheck_block_checksum(sblock);
				2133
				2134	if (!sblock->no_io_error_seen) {
				2135	spin_lock(&sctx->stat_lock);
				2136	sctx->stat.read_errors++;
				2137	spin_unlock(&sctx->stat_lock);
				2138	btrfs_err_rl_in_rcu(fs_info,
				2139	"IO error rebuilding logical %llu for dev %s",
				2140	logical, rcu_str_deref(dev->name));
				2141	} else if (sblock->header_error \|\| sblock->checksum_error) {
				2142	spin_lock(&sctx->stat_lock);
				2143	sctx->stat.uncorrectable_errors++;
				2144	spin_unlock(&sctx->stat_lock);
				2145	btrfs_err_rl_in_rcu(fs_info,
				2146	"failed to rebuild valid logical %llu for dev %s",
				2147	logical, rcu_str_deref(dev->name));
				2148	} else {
				2149	scrub_write_block_to_dev_replace(sblock);
				2150	}
				2151
				2152	scrub_block_put(sblock);
				2153
				2154	if (sctx->is_dev_replace && sctx->flush_all_writes) {
				2155	mutex_lock(&sctx->wr_lock);
				2156	scrub_wr_submit(sctx);
				2157	mutex_unlock(&sctx->wr_lock);
				2158	}
				2159
				2160	scrub_pending_bio_dec(sctx);
				2161	}
				2162
				2163	static void scrub_missing_raid56_pages(struct scrub_block *sblock)
				2164	{
				2165	struct scrub_ctx *sctx = sblock->sctx;
				2166	struct btrfs_fs_info *fs_info = sctx->fs_info;
				2167	u64 length = sblock->page_count * PAGE_SIZE;
				2168	u64 logical = sblock->pagev[0]->logical;
				2169	struct btrfs_bio *bbio = NULL;
				2170	struct bio *bio;
				2171	struct btrfs_raid_bio *rbio;
				2172	int ret;
				2173	int i;
				2174
				2175	btrfs_bio_counter_inc_blocked(fs_info);
				2176	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
				2177	&length, &bbio);
				2178	if (ret \|\| !bbio \|\| !bbio->raid_map)
				2179	goto bbio_out;
				2180
				2181	if (WARN_ON(!sctx->is_dev_replace \|\|
				2182	!(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
				2183	/*
				2184	* We shouldn't be scrubbing a missing device. Even for dev
				2185	* replace, we should only get here for RAID 5/6. We either
				2186	* managed to mount something with no mirrors remaining or
				2187	* there's a bug in scrub_remap_extent()/btrfs_map_block().
				2188	*/
				2189	goto bbio_out;
				2190	}
				2191
				2192	bio = btrfs_io_bio_alloc(0);
				2193	bio->bi_iter.bi_sector = logical >> 9;
				2194	bio->bi_private = sblock;
				2195	bio->bi_end_io = scrub_missing_raid56_end_io;
				2196
				2197	rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
				2198	if (!rbio)
				2199	goto rbio_out;
				2200
				2201	for (i = 0; i < sblock->page_count; i++) {
				2202	struct scrub_page *spage = sblock->pagev[i];
				2203
				2204	raid56_add_scrub_pages(rbio, spage->page, spage->logical);
				2205	}
				2206
				2207	btrfs_init_work(&sblock->work, btrfs_scrub_helper,
				2208	scrub_missing_raid56_worker, NULL, NULL);
				2209	scrub_block_get(sblock);
				2210	scrub_pending_bio_inc(sctx);
				2211	raid56_submit_missing_rbio(rbio);
				2212	return;
				2213
				2214	rbio_out:
				2215	bio_put(bio);
				2216	bbio_out:
				2217	btrfs_bio_counter_dec(fs_info);
				2218	btrfs_put_bbio(bbio);
				2219	spin_lock(&sctx->stat_lock);
				2220	sctx->stat.malloc_errors++;
				2221	spin_unlock(&sctx->stat_lock);
				2222	}
				2223
				2224	static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
				2225	u64 physical, struct btrfs_device *dev, u64 flags,
				2226	u64 gen, int mirror_num, u8 *csum, int force,
				2227	u64 physical_for_dev_replace)
				2228	{
				2229	struct scrub_block *sblock;
				2230	int index;
				2231
				2232	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
				2233	if (!sblock) {
				2234	spin_lock(&sctx->stat_lock);
				2235	sctx->stat.malloc_errors++;
				2236	spin_unlock(&sctx->stat_lock);
				2237	return -ENOMEM;
				2238	}
				2239
				2240	/* one ref inside this function, plus one for each page added to
				2241	* a bio later on */
				2242	refcount_set(&sblock->refs, 1);
				2243	sblock->sctx = sctx;
				2244	sblock->no_io_error_seen = 1;
				2245
				2246	for (index = 0; len > 0; index++) {
				2247	struct scrub_page *spage;
				2248	u64 l = min_t(u64, len, PAGE_SIZE);
				2249
				2250	spage = kzalloc(sizeof(*spage), GFP_KERNEL);
				2251	if (!spage) {
				2252	leave_nomem:
				2253	spin_lock(&sctx->stat_lock);
				2254	sctx->stat.malloc_errors++;
				2255	spin_unlock(&sctx->stat_lock);
				2256	scrub_block_put(sblock);
				2257	return -ENOMEM;
				2258	}
				2259	BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
				2260	scrub_page_get(spage);
				2261	sblock->pagev[index] = spage;
				2262	spage->sblock = sblock;
				2263	spage->dev = dev;
				2264	spage->flags = flags;
				2265	spage->generation = gen;
				2266	spage->logical = logical;
				2267	spage->physical = physical;
				2268	spage->physical_for_dev_replace = physical_for_dev_replace;
				2269	spage->mirror_num = mirror_num;
				2270	if (csum) {
				2271	spage->have_csum = 1;
				2272	memcpy(spage->csum, csum, sctx->csum_size);
				2273	} else {
				2274	spage->have_csum = 0;
				2275	}
				2276	sblock->page_count++;
				2277	spage->page = alloc_page(GFP_KERNEL);
				2278	if (!spage->page)
				2279	goto leave_nomem;
				2280	len -= l;
				2281	logical += l;
				2282	physical += l;
				2283	physical_for_dev_replace += l;
				2284	}
				2285
				2286	WARN_ON(sblock->page_count == 0);
				2287	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
				2288	/*
				2289	* This case should only be hit for RAID 5/6 device replace. See
				2290	* the comment in scrub_missing_raid56_pages() for details.
				2291	*/
				2292	scrub_missing_raid56_pages(sblock);
				2293	} else {
				2294	for (index = 0; index < sblock->page_count; index++) {
				2295	struct scrub_page *spage = sblock->pagev[index];
				2296	int ret;
				2297
				2298	ret = scrub_add_page_to_rd_bio(sctx, spage);
				2299	if (ret) {
				2300	scrub_block_put(sblock);
				2301	return ret;
				2302	}
				2303	}
				2304
				2305	if (force)
				2306	scrub_submit(sctx);
				2307	}
				2308
				2309	/* last one frees, either here or in bio completion for last page */
				2310	scrub_block_put(sblock);
				2311	return 0;
				2312	}
				2313
				2314	static void scrub_bio_end_io(struct bio *bio)
				2315	{
				2316	struct scrub_bio *sbio = bio->bi_private;
				2317	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
				2318
				2319	sbio->status = bio->bi_status;
				2320	sbio->bio = bio;
				2321
				2322	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
				2323	}
				2324
				2325	static void scrub_bio_end_io_worker(struct btrfs_work *work)
				2326	{
				2327	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
				2328	struct scrub_ctx *sctx = sbio->sctx;
				2329	int i;
				2330
				2331	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
				2332	if (sbio->status) {
				2333	for (i = 0; i < sbio->page_count; i++) {
				2334	struct scrub_page *spage = sbio->pagev[i];
				2335
				2336	spage->io_error = 1;
				2337	spage->sblock->no_io_error_seen = 0;
				2338	}
				2339	}
				2340
				2341	/* now complete the scrub_block items that have all pages completed */
				2342	for (i = 0; i < sbio->page_count; i++) {
				2343	struct scrub_page *spage = sbio->pagev[i];
				2344	struct scrub_block *sblock = spage->sblock;
				2345
				2346	if (atomic_dec_and_test(&sblock->outstanding_pages))
				2347	scrub_block_complete(sblock);
				2348	scrub_block_put(sblock);
				2349	}
				2350
				2351	bio_put(sbio->bio);
				2352	sbio->bio = NULL;
				2353	spin_lock(&sctx->list_lock);
				2354	sbio->next_free = sctx->first_free;
				2355	sctx->first_free = sbio->index;
				2356	spin_unlock(&sctx->list_lock);
				2357
				2358	if (sctx->is_dev_replace && sctx->flush_all_writes) {
				2359	mutex_lock(&sctx->wr_lock);
				2360	scrub_wr_submit(sctx);
				2361	mutex_unlock(&sctx->wr_lock);
				2362	}
				2363
				2364	scrub_pending_bio_dec(sctx);
				2365	}
				2366
				2367	static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
				2368	unsigned long *bitmap,
				2369	u64 start, u64 len)
				2370	{
				2371	u64 offset;
				2372	u64 nsectors64;
				2373	u32 nsectors;
				2374	int sectorsize = sparity->sctx->fs_info->sectorsize;
				2375
				2376	if (len >= sparity->stripe_len) {
				2377	bitmap_set(bitmap, 0, sparity->nsectors);
				2378	return;
				2379	}
				2380
				2381	start -= sparity->logic_start;
				2382	start = div64_u64_rem(start, sparity->stripe_len, &offset);
				2383	offset = div_u64(offset, sectorsize);
				2384	nsectors64 = div_u64(len, sectorsize);
				2385
				2386	ASSERT(nsectors64 < UINT_MAX);
				2387	nsectors = (u32)nsectors64;
				2388
				2389	if (offset + nsectors <= sparity->nsectors) {
				2390	bitmap_set(bitmap, offset, nsectors);
				2391	return;
				2392	}
				2393
				2394	bitmap_set(bitmap, offset, sparity->nsectors - offset);
				2395	bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
				2396	}
				2397
				2398	static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
				2399	u64 start, u64 len)
				2400	{
				2401	__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
				2402	}
				2403
				2404	static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
				2405	u64 start, u64 len)
				2406	{
				2407	__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
				2408	}
				2409
				2410	static void scrub_block_complete(struct scrub_block *sblock)
				2411	{
				2412	int corrupted = 0;
				2413
				2414	if (!sblock->no_io_error_seen) {
				2415	corrupted = 1;
				2416	scrub_handle_errored_block(sblock);
				2417	} else {
				2418	/*
				2419	* if has checksum error, write via repair mechanism in
				2420	* dev replace case, otherwise write here in dev replace
				2421	* case.
				2422	*/
				2423	corrupted = scrub_checksum(sblock);
				2424	if (!corrupted && sblock->sctx->is_dev_replace)
				2425	scrub_write_block_to_dev_replace(sblock);
				2426	}
				2427
				2428	if (sblock->sparity && corrupted && !sblock->data_corrected) {
				2429	u64 start = sblock->pagev[0]->logical;
				2430	u64 end = sblock->pagev[sblock->page_count - 1]->logical +
				2431	PAGE_SIZE;
				2432
				2433	scrub_parity_mark_sectors_error(sblock->sparity,
				2434	start, end - start);
				2435	}
				2436	}
				2437
				2438	static int scrub_find_csum(struct scrub_ctx sctx, u64 logical, u8 csum)
				2439	{
				2440	struct btrfs_ordered_sum *sum = NULL;
				2441	unsigned long index;
				2442	unsigned long num_sectors;
				2443
				2444	while (!list_empty(&sctx->csum_list)) {
				2445	sum = list_first_entry(&sctx->csum_list,
				2446	struct btrfs_ordered_sum, list);
				2447	if (sum->bytenr > logical)
				2448	return 0;
				2449	if (sum->bytenr + sum->len > logical)
				2450	break;
				2451
				2452	++sctx->stat.csum_discards;
				2453	list_del(&sum->list);
				2454	kfree(sum);
				2455	sum = NULL;
				2456	}
				2457	if (!sum)
				2458	return 0;
				2459
				2460	index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
				2461	ASSERT(index < UINT_MAX);
				2462
				2463	num_sectors = sum->len / sctx->fs_info->sectorsize;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	2464	memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2465	if (index == num_sectors - 1) {
				2466	list_del(&sum->list);
				2467	kfree(sum);
				2468	}
				2469	return 1;
				2470	}
				2471
				2472	/* scrub extent tries to collect up to 64 kB for each bio */
				2473	static int scrub_extent(struct scrub_ctx sctx, struct map_lookup map,
				2474	u64 logical, u64 len,
				2475	u64 physical, struct btrfs_device *dev, u64 flags,
				2476	u64 gen, int mirror_num, u64 physical_for_dev_replace)
				2477	{
				2478	int ret;
				2479	u8 csum[BTRFS_CSUM_SIZE];
				2480	u32 blocksize;
				2481
				2482	if (flags & BTRFS_EXTENT_FLAG_DATA) {
				2483	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
				2484	blocksize = map->stripe_len;
				2485	else
				2486	blocksize = sctx->fs_info->sectorsize;
				2487	spin_lock(&sctx->stat_lock);
				2488	sctx->stat.data_extents_scrubbed++;
				2489	sctx->stat.data_bytes_scrubbed += len;
				2490	spin_unlock(&sctx->stat_lock);
				2491	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
				2492	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
				2493	blocksize = map->stripe_len;
				2494	else
				2495	blocksize = sctx->fs_info->nodesize;
				2496	spin_lock(&sctx->stat_lock);
				2497	sctx->stat.tree_extents_scrubbed++;
				2498	sctx->stat.tree_bytes_scrubbed += len;
				2499	spin_unlock(&sctx->stat_lock);
				2500	} else {
				2501	blocksize = sctx->fs_info->sectorsize;
				2502	WARN_ON(1);
				2503	}
				2504
				2505	while (len) {
				2506	u64 l = min_t(u64, len, blocksize);
				2507	int have_csum = 0;
				2508
				2509	if (flags & BTRFS_EXTENT_FLAG_DATA) {
				2510	/* push csums to sbio */
				2511	have_csum = scrub_find_csum(sctx, logical, csum);
				2512	if (have_csum == 0)
				2513	++sctx->stat.no_csum;
				2514	}
				2515	ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
				2516	mirror_num, have_csum ? csum : NULL, 0,
				2517	physical_for_dev_replace);
				2518	if (ret)
				2519	return ret;
				2520	len -= l;
				2521	logical += l;
				2522	physical += l;
				2523	physical_for_dev_replace += l;
				2524	}
				2525	return 0;
				2526	}
				2527
				2528	static int scrub_pages_for_parity(struct scrub_parity *sparity,
				2529	u64 logical, u64 len,
				2530	u64 physical, struct btrfs_device *dev,
				2531	u64 flags, u64 gen, int mirror_num, u8 *csum)
				2532	{
				2533	struct scrub_ctx *sctx = sparity->sctx;
				2534	struct scrub_block *sblock;
				2535	int index;
				2536
				2537	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
				2538	if (!sblock) {
				2539	spin_lock(&sctx->stat_lock);
				2540	sctx->stat.malloc_errors++;
				2541	spin_unlock(&sctx->stat_lock);
				2542	return -ENOMEM;
				2543	}
				2544
				2545	/* one ref inside this function, plus one for each page added to
				2546	* a bio later on */
				2547	refcount_set(&sblock->refs, 1);
				2548	sblock->sctx = sctx;
				2549	sblock->no_io_error_seen = 1;
				2550	sblock->sparity = sparity;
				2551	scrub_parity_get(sparity);
				2552
				2553	for (index = 0; len > 0; index++) {
				2554	struct scrub_page *spage;
				2555	u64 l = min_t(u64, len, PAGE_SIZE);
				2556
				2557	spage = kzalloc(sizeof(*spage), GFP_KERNEL);
				2558	if (!spage) {
				2559	leave_nomem:
				2560	spin_lock(&sctx->stat_lock);
				2561	sctx->stat.malloc_errors++;
				2562	spin_unlock(&sctx->stat_lock);
				2563	scrub_block_put(sblock);
				2564	return -ENOMEM;
				2565	}
				2566	BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
				2567	/* For scrub block */
				2568	scrub_page_get(spage);
				2569	sblock->pagev[index] = spage;
				2570	/* For scrub parity */
				2571	scrub_page_get(spage);
				2572	list_add_tail(&spage->list, &sparity->spages);
				2573	spage->sblock = sblock;
				2574	spage->dev = dev;
				2575	spage->flags = flags;
				2576	spage->generation = gen;
				2577	spage->logical = logical;
				2578	spage->physical = physical;
				2579	spage->mirror_num = mirror_num;
				2580	if (csum) {
				2581	spage->have_csum = 1;
				2582	memcpy(spage->csum, csum, sctx->csum_size);
				2583	} else {
				2584	spage->have_csum = 0;
				2585	}
				2586	sblock->page_count++;
				2587	spage->page = alloc_page(GFP_KERNEL);
				2588	if (!spage->page)
				2589	goto leave_nomem;
				2590	len -= l;
				2591	logical += l;
				2592	physical += l;
				2593	}
				2594
				2595	WARN_ON(sblock->page_count == 0);
				2596	for (index = 0; index < sblock->page_count; index++) {
				2597	struct scrub_page *spage = sblock->pagev[index];
				2598	int ret;
				2599
				2600	ret = scrub_add_page_to_rd_bio(sctx, spage);
				2601	if (ret) {
				2602	scrub_block_put(sblock);
				2603	return ret;
				2604	}
				2605	}
				2606
				2607	/* last one frees, either here or in bio completion for last page */
				2608	scrub_block_put(sblock);
				2609	return 0;
				2610	}
				2611
				2612	static int scrub_extent_for_parity(struct scrub_parity *sparity,
				2613	u64 logical, u64 len,
				2614	u64 physical, struct btrfs_device *dev,
				2615	u64 flags, u64 gen, int mirror_num)
				2616	{
				2617	struct scrub_ctx *sctx = sparity->sctx;
				2618	int ret;
				2619	u8 csum[BTRFS_CSUM_SIZE];
				2620	u32 blocksize;
				2621
				2622	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
				2623	scrub_parity_mark_sectors_error(sparity, logical, len);
				2624	return 0;
				2625	}
				2626
				2627	if (flags & BTRFS_EXTENT_FLAG_DATA) {
				2628	blocksize = sparity->stripe_len;
				2629	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
				2630	blocksize = sparity->stripe_len;
				2631	} else {
				2632	blocksize = sctx->fs_info->sectorsize;
				2633	WARN_ON(1);
				2634	}
				2635
				2636	while (len) {
				2637	u64 l = min_t(u64, len, blocksize);
				2638	int have_csum = 0;
				2639
				2640	if (flags & BTRFS_EXTENT_FLAG_DATA) {
				2641	/* push csums to sbio */
				2642	have_csum = scrub_find_csum(sctx, logical, csum);
				2643	if (have_csum == 0)
				2644	goto skip;
				2645	}
				2646	ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
				2647	flags, gen, mirror_num,
				2648	have_csum ? csum : NULL);
				2649	if (ret)
				2650	return ret;
				2651	skip:
				2652	len -= l;
				2653	logical += l;
				2654	physical += l;
				2655	}
				2656	return 0;
				2657	}
				2658
				2659	/*
				2660	* Given a physical address, this will calculate it's
				2661	* logical offset. if this is a parity stripe, it will return
				2662	* the most left data stripe's logical offset.
				2663	*
				2664	* return 0 if it is a data stripe, 1 means parity stripe.
				2665	*/
				2666	static int get_raid56_logic_offset(u64 physical, int num,
				2667	struct map_lookup map, u64 offset,
				2668	u64 *stripe_start)
				2669	{
				2670	int i;
				2671	int j = 0;
				2672	u64 stripe_nr;
				2673	u64 last_offset;
				2674	u32 stripe_index;
				2675	u32 rot;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	2676	const int data_stripes = nr_data_stripes(map);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2677
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	2678	last_offset = (physical - map->stripes[num].physical) * data_stripes;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2679	if (stripe_start)
				2680	*stripe_start = last_offset;
				2681
				2682	*offset = last_offset;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	2683	for (i = 0; i < data_stripes; i++) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2684	offset = last_offset + i map->stripe_len;
				2685
				2686	stripe_nr = div64_u64(*offset, map->stripe_len);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	2687	stripe_nr = div_u64(stripe_nr, data_stripes);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2688
				2689	/* Work out the disk rotation on this stripe-set */
				2690	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
				2691	/* calculate which stripe this data locates */
				2692	rot += i;
				2693	stripe_index = rot % map->num_stripes;
				2694	if (stripe_index == num)
				2695	return 0;
				2696	if (stripe_index < num)
				2697	j++;
				2698	}
				2699	offset = last_offset + j map->stripe_len;
				2700	return 1;
				2701	}
				2702
				2703	static void scrub_free_parity(struct scrub_parity *sparity)
				2704	{
				2705	struct scrub_ctx *sctx = sparity->sctx;
				2706	struct scrub_page curr, next;
				2707	int nbits;
				2708
				2709	nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
				2710	if (nbits) {
				2711	spin_lock(&sctx->stat_lock);
				2712	sctx->stat.read_errors += nbits;
				2713	sctx->stat.uncorrectable_errors += nbits;
				2714	spin_unlock(&sctx->stat_lock);
				2715	}
				2716
				2717	list_for_each_entry_safe(curr, next, &sparity->spages, list) {
				2718	list_del_init(&curr->list);
				2719	scrub_page_put(curr);
				2720	}
				2721
				2722	kfree(sparity);
				2723	}
				2724
				2725	static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
				2726	{
				2727	struct scrub_parity *sparity = container_of(work, struct scrub_parity,
				2728	work);
				2729	struct scrub_ctx *sctx = sparity->sctx;
				2730
				2731	scrub_free_parity(sparity);
				2732	scrub_pending_bio_dec(sctx);
				2733	}
				2734
				2735	static void scrub_parity_bio_endio(struct bio *bio)
				2736	{
				2737	struct scrub_parity sparity = (struct scrub_parity )bio->bi_private;
				2738	struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
				2739
				2740	if (bio->bi_status)
				2741	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
				2742	sparity->nsectors);
				2743
				2744	bio_put(bio);
				2745
				2746	btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
				2747	scrub_parity_bio_endio_worker, NULL, NULL);
				2748	btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
				2749	}
				2750
				2751	static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
				2752	{
				2753	struct scrub_ctx *sctx = sparity->sctx;
				2754	struct btrfs_fs_info *fs_info = sctx->fs_info;
				2755	struct bio *bio;
				2756	struct btrfs_raid_bio *rbio;
				2757	struct btrfs_bio *bbio = NULL;
				2758	u64 length;
				2759	int ret;
				2760
				2761	if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
				2762	sparity->nsectors))
				2763	goto out;
				2764
				2765	length = sparity->logic_end - sparity->logic_start;
				2766
				2767	btrfs_bio_counter_inc_blocked(fs_info);
				2768	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
				2769	&length, &bbio);
				2770	if (ret \|\| !bbio \|\| !bbio->raid_map)
				2771	goto bbio_out;
				2772
				2773	bio = btrfs_io_bio_alloc(0);
				2774	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
				2775	bio->bi_private = sparity;
				2776	bio->bi_end_io = scrub_parity_bio_endio;
				2777
				2778	rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
				2779	length, sparity->scrub_dev,
				2780	sparity->dbitmap,
				2781	sparity->nsectors);
				2782	if (!rbio)
				2783	goto rbio_out;
				2784
				2785	scrub_pending_bio_inc(sctx);
				2786	raid56_parity_submit_scrub_rbio(rbio);
				2787	return;
				2788
				2789	rbio_out:
				2790	bio_put(bio);
				2791	bbio_out:
				2792	btrfs_bio_counter_dec(fs_info);
				2793	btrfs_put_bbio(bbio);
				2794	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
				2795	sparity->nsectors);
				2796	spin_lock(&sctx->stat_lock);
				2797	sctx->stat.malloc_errors++;
				2798	spin_unlock(&sctx->stat_lock);
				2799	out:
				2800	scrub_free_parity(sparity);
				2801	}
				2802
				2803	static inline int scrub_calc_parity_bitmap_len(int nsectors)
				2804	{
				2805	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
				2806	}
				2807
				2808	static void scrub_parity_get(struct scrub_parity *sparity)
				2809	{
				2810	refcount_inc(&sparity->refs);
				2811	}
				2812
				2813	static void scrub_parity_put(struct scrub_parity *sparity)
				2814	{
				2815	if (!refcount_dec_and_test(&sparity->refs))
				2816	return;
				2817
				2818	scrub_parity_check_and_repair(sparity);
				2819	}
				2820
				2821	static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
				2822	struct map_lookup *map,
				2823	struct btrfs_device *sdev,
				2824	struct btrfs_path *path,
				2825	u64 logic_start,
				2826	u64 logic_end)
				2827	{
				2828	struct btrfs_fs_info *fs_info = sctx->fs_info;
				2829	struct btrfs_root *root = fs_info->extent_root;
				2830	struct btrfs_root *csum_root = fs_info->csum_root;
				2831	struct btrfs_extent_item *extent;
				2832	struct btrfs_bio *bbio = NULL;
				2833	u64 flags;
				2834	int ret;
				2835	int slot;
				2836	struct extent_buffer *l;
				2837	struct btrfs_key key;
				2838	u64 generation;
				2839	u64 extent_logical;
				2840	u64 extent_physical;
				2841	u64 extent_len;
				2842	u64 mapped_length;
				2843	struct btrfs_device *extent_dev;
				2844	struct scrub_parity *sparity;
				2845	int nsectors;
				2846	int bitmap_len;
				2847	int extent_mirror_num;
				2848	int stop_loop = 0;
				2849
				2850	nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
				2851	bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
				2852	sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
				2853	GFP_NOFS);
				2854	if (!sparity) {
				2855	spin_lock(&sctx->stat_lock);
				2856	sctx->stat.malloc_errors++;
				2857	spin_unlock(&sctx->stat_lock);
				2858	return -ENOMEM;
				2859	}
				2860
				2861	sparity->stripe_len = map->stripe_len;
				2862	sparity->nsectors = nsectors;
				2863	sparity->sctx = sctx;
				2864	sparity->scrub_dev = sdev;
				2865	sparity->logic_start = logic_start;
				2866	sparity->logic_end = logic_end;
				2867	refcount_set(&sparity->refs, 1);
				2868	INIT_LIST_HEAD(&sparity->spages);
				2869	sparity->dbitmap = sparity->bitmap;
				2870	sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
				2871
				2872	ret = 0;
				2873	while (logic_start < logic_end) {
				2874	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
				2875	key.type = BTRFS_METADATA_ITEM_KEY;
				2876	else
				2877	key.type = BTRFS_EXTENT_ITEM_KEY;
				2878	key.objectid = logic_start;
				2879	key.offset = (u64)-1;
				2880
				2881	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				2882	if (ret < 0)
				2883	goto out;
				2884
				2885	if (ret > 0) {
				2886	ret = btrfs_previous_extent_item(root, path, 0);
				2887	if (ret < 0)
				2888	goto out;
				2889	if (ret > 0) {
				2890	btrfs_release_path(path);
				2891	ret = btrfs_search_slot(NULL, root, &key,
				2892	path, 0, 0);
				2893	if (ret < 0)
				2894	goto out;
				2895	}
				2896	}
				2897
				2898	stop_loop = 0;
				2899	while (1) {
				2900	u64 bytes;
				2901
				2902	l = path->nodes[0];
				2903	slot = path->slots[0];
				2904	if (slot >= btrfs_header_nritems(l)) {
				2905	ret = btrfs_next_leaf(root, path);
				2906	if (ret == 0)
				2907	continue;
				2908	if (ret < 0)
				2909	goto out;
				2910
				2911	stop_loop = 1;
				2912	break;
				2913	}
				2914	btrfs_item_key_to_cpu(l, &key, slot);
				2915
				2916	if (key.type != BTRFS_EXTENT_ITEM_KEY &&
				2917	key.type != BTRFS_METADATA_ITEM_KEY)
				2918	goto next;
				2919
				2920	if (key.type == BTRFS_METADATA_ITEM_KEY)
				2921	bytes = fs_info->nodesize;
				2922	else
				2923	bytes = key.offset;
				2924
				2925	if (key.objectid + bytes <= logic_start)
				2926	goto next;
				2927
				2928	if (key.objectid >= logic_end) {
				2929	stop_loop = 1;
				2930	break;
				2931	}
				2932
				2933	while (key.objectid >= logic_start + map->stripe_len)
				2934	logic_start += map->stripe_len;
				2935
				2936	extent = btrfs_item_ptr(l, slot,
				2937	struct btrfs_extent_item);
				2938	flags = btrfs_extent_flags(l, extent);
				2939	generation = btrfs_extent_generation(l, extent);
				2940
				2941	if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
				2942	(key.objectid < logic_start \|\|
				2943	key.objectid + bytes >
				2944	logic_start + map->stripe_len)) {
				2945	btrfs_err(fs_info,
				2946	"scrub: tree block %llu spanning stripes, ignored. logical=%llu",
				2947	key.objectid, logic_start);
				2948	spin_lock(&sctx->stat_lock);
				2949	sctx->stat.uncorrectable_errors++;
				2950	spin_unlock(&sctx->stat_lock);
				2951	goto next;
				2952	}
				2953	again:
				2954	extent_logical = key.objectid;
				2955	extent_len = bytes;
				2956
				2957	if (extent_logical < logic_start) {
				2958	extent_len -= logic_start - extent_logical;
				2959	extent_logical = logic_start;
				2960	}
				2961
				2962	if (extent_logical + extent_len >
				2963	logic_start + map->stripe_len)
				2964	extent_len = logic_start + map->stripe_len -
				2965	extent_logical;
				2966
				2967	scrub_parity_mark_sectors_data(sparity, extent_logical,
				2968	extent_len);
				2969
				2970	mapped_length = extent_len;
				2971	bbio = NULL;
				2972	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
				2973	extent_logical, &mapped_length, &bbio,
				2974	0);
				2975	if (!ret) {
				2976	if (!bbio \|\| mapped_length < extent_len)
				2977	ret = -EIO;
				2978	}
				2979	if (ret) {
				2980	btrfs_put_bbio(bbio);
				2981	goto out;
				2982	}
				2983	extent_physical = bbio->stripes[0].physical;
				2984	extent_mirror_num = bbio->mirror_num;
				2985	extent_dev = bbio->stripes[0].dev;
				2986	btrfs_put_bbio(bbio);
				2987
				2988	ret = btrfs_lookup_csums_range(csum_root,
				2989	extent_logical,
				2990	extent_logical + extent_len - 1,
				2991	&sctx->csum_list, 1);
				2992	if (ret)
				2993	goto out;
				2994
				2995	ret = scrub_extent_for_parity(sparity, extent_logical,
				2996	extent_len,
				2997	extent_physical,
				2998	extent_dev, flags,
				2999	generation,
				3000	extent_mirror_num);
				3001
				3002	scrub_free_csums(sctx);
				3003
				3004	if (ret)
				3005	goto out;
				3006
				3007	if (extent_logical + extent_len <
				3008	key.objectid + bytes) {
				3009	logic_start += map->stripe_len;
				3010
				3011	if (logic_start >= logic_end) {
				3012	stop_loop = 1;
				3013	break;
				3014	}
				3015
				3016	if (logic_start < key.objectid + bytes) {
				3017	cond_resched();
				3018	goto again;
				3019	}
				3020	}
				3021	next:
				3022	path->slots[0]++;
				3023	}
				3024
				3025	btrfs_release_path(path);
				3026
				3027	if (stop_loop)
				3028	break;
				3029
				3030	logic_start += map->stripe_len;
				3031	}
				3032	out:
				3033	if (ret < 0)
				3034	scrub_parity_mark_sectors_error(sparity, logic_start,
				3035	logic_end - logic_start);
				3036	scrub_parity_put(sparity);
				3037	scrub_submit(sctx);
				3038	mutex_lock(&sctx->wr_lock);
				3039	scrub_wr_submit(sctx);
				3040	mutex_unlock(&sctx->wr_lock);
				3041
				3042	btrfs_release_path(path);
				3043	return ret < 0 ? ret : 0;
				3044	}
				3045
				3046	static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
				3047	struct map_lookup *map,
				3048	struct btrfs_device *scrub_dev,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3049	int num, u64 base, u64 length)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3050	{
				3051	struct btrfs_path path, ppath;
				3052	struct btrfs_fs_info *fs_info = sctx->fs_info;
				3053	struct btrfs_root *root = fs_info->extent_root;
				3054	struct btrfs_root *csum_root = fs_info->csum_root;
				3055	struct btrfs_extent_item *extent;
				3056	struct blk_plug plug;
				3057	u64 flags;
				3058	int ret;
				3059	int slot;
				3060	u64 nstripes;
				3061	struct extent_buffer *l;
				3062	u64 physical;
				3063	u64 logical;
				3064	u64 logic_end;
				3065	u64 physical_end;
				3066	u64 generation;
				3067	int mirror_num;
				3068	struct reada_control *reada1;
				3069	struct reada_control *reada2;
				3070	struct btrfs_key key;
				3071	struct btrfs_key key_end;
				3072	u64 increment = map->stripe_len;
				3073	u64 offset;
				3074	u64 extent_logical;
				3075	u64 extent_physical;
				3076	u64 extent_len;
				3077	u64 stripe_logical;
				3078	u64 stripe_end;
				3079	struct btrfs_device *extent_dev;
				3080	int extent_mirror_num;
				3081	int stop_loop = 0;
				3082
				3083	physical = map->stripes[num].physical;
				3084	offset = 0;
				3085	nstripes = div64_u64(length, map->stripe_len);
				3086	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
				3087	offset = map->stripe_len * num;
				3088	increment = map->stripe_len * map->num_stripes;
				3089	mirror_num = 1;
				3090	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
				3091	int factor = map->num_stripes / map->sub_stripes;
				3092	offset = map->stripe_len * (num / map->sub_stripes);
				3093	increment = map->stripe_len * factor;
				3094	mirror_num = num % map->sub_stripes + 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3095	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3096	increment = map->stripe_len;
				3097	mirror_num = num % map->num_stripes + 1;
				3098	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
				3099	increment = map->stripe_len;
				3100	mirror_num = num % map->num_stripes + 1;
				3101	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				3102	get_raid56_logic_offset(physical, num, map, &offset, NULL);
				3103	increment = map->stripe_len * nr_data_stripes(map);
				3104	mirror_num = 1;
				3105	} else {
				3106	increment = map->stripe_len;
				3107	mirror_num = 1;
				3108	}
				3109
				3110	path = btrfs_alloc_path();
				3111	if (!path)
				3112	return -ENOMEM;
				3113
				3114	ppath = btrfs_alloc_path();
				3115	if (!ppath) {
				3116	btrfs_free_path(path);
				3117	return -ENOMEM;
				3118	}
				3119
				3120	/*
				3121	* work on commit root. The related disk blocks are static as
				3122	* long as COW is applied. This means, it is save to rewrite
				3123	* them to repair disk errors without any race conditions
				3124	*/
				3125	path->search_commit_root = 1;
				3126	path->skip_locking = 1;
				3127
				3128	ppath->search_commit_root = 1;
				3129	ppath->skip_locking = 1;
				3130	/*
				3131	* trigger the readahead for extent tree csum tree and wait for
				3132	* completion. During readahead, the scrub is officially paused
				3133	* to not hold off transaction commits
				3134	*/
				3135	logical = base + offset;
				3136	physical_end = physical + nstripes * map->stripe_len;
				3137	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				3138	get_raid56_logic_offset(physical_end, num,
				3139	map, &logic_end, NULL);
				3140	logic_end += base;
				3141	} else {
				3142	logic_end = logical + increment * nstripes;
				3143	}
				3144	wait_event(sctx->list_wait,
				3145	atomic_read(&sctx->bios_in_flight) == 0);
				3146	scrub_blocked_if_needed(fs_info);
				3147
				3148	/* FIXME it might be better to start readahead at commit root */
				3149	key.objectid = logical;
				3150	key.type = BTRFS_EXTENT_ITEM_KEY;
				3151	key.offset = (u64)0;
				3152	key_end.objectid = logic_end;
				3153	key_end.type = BTRFS_METADATA_ITEM_KEY;
				3154	key_end.offset = (u64)-1;
				3155	reada1 = btrfs_reada_add(root, &key, &key_end);
				3156
				3157	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
				3158	key.type = BTRFS_EXTENT_CSUM_KEY;
				3159	key.offset = logical;
				3160	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
				3161	key_end.type = BTRFS_EXTENT_CSUM_KEY;
				3162	key_end.offset = logic_end;
				3163	reada2 = btrfs_reada_add(csum_root, &key, &key_end);
				3164
				3165	if (!IS_ERR(reada1))
				3166	btrfs_reada_wait(reada1);
				3167	if (!IS_ERR(reada2))
				3168	btrfs_reada_wait(reada2);
				3169
				3170
				3171	/*
				3172	* collect all data csums for the stripe to avoid seeking during
				3173	* the scrub. This might currently (crc32) end up to be about 1MB
				3174	*/
				3175	blk_start_plug(&plug);
				3176
				3177	/*
				3178	* now find all extents for each stripe and scrub them
				3179	*/
				3180	ret = 0;
				3181	while (physical < physical_end) {
				3182	/*
				3183	* canceled?
				3184	*/
				3185	if (atomic_read(&fs_info->scrub_cancel_req) \|\|
				3186	atomic_read(&sctx->cancel_req)) {
				3187	ret = -ECANCELED;
				3188	goto out;
				3189	}
				3190	/*
				3191	* check to see if we have to pause
				3192	*/
				3193	if (atomic_read(&fs_info->scrub_pause_req)) {
				3194	/* push queued extents */
				3195	sctx->flush_all_writes = true;
				3196	scrub_submit(sctx);
				3197	mutex_lock(&sctx->wr_lock);
				3198	scrub_wr_submit(sctx);
				3199	mutex_unlock(&sctx->wr_lock);
				3200	wait_event(sctx->list_wait,
				3201	atomic_read(&sctx->bios_in_flight) == 0);
				3202	sctx->flush_all_writes = false;
				3203	scrub_blocked_if_needed(fs_info);
				3204	}
				3205
				3206	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				3207	ret = get_raid56_logic_offset(physical, num, map,
				3208	&logical,
				3209	&stripe_logical);
				3210	logical += base;
				3211	if (ret) {
				3212	/* it is parity strip */
				3213	stripe_logical += base;
				3214	stripe_end = stripe_logical + increment;
				3215	ret = scrub_raid56_parity(sctx, map, scrub_dev,
				3216	ppath, stripe_logical,
				3217	stripe_end);
				3218	if (ret)
				3219	goto out;
				3220	goto skip;
				3221	}
				3222	}
				3223
				3224	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
				3225	key.type = BTRFS_METADATA_ITEM_KEY;
				3226	else
				3227	key.type = BTRFS_EXTENT_ITEM_KEY;
				3228	key.objectid = logical;
				3229	key.offset = (u64)-1;
				3230
				3231	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				3232	if (ret < 0)
				3233	goto out;
				3234
				3235	if (ret > 0) {
				3236	ret = btrfs_previous_extent_item(root, path, 0);
				3237	if (ret < 0)
				3238	goto out;
				3239	if (ret > 0) {
				3240	/* there's no smaller item, so stick with the
				3241	* larger one */
				3242	btrfs_release_path(path);
				3243	ret = btrfs_search_slot(NULL, root, &key,
				3244	path, 0, 0);
				3245	if (ret < 0)
				3246	goto out;
				3247	}
				3248	}
				3249
				3250	stop_loop = 0;
				3251	while (1) {
				3252	u64 bytes;
				3253
				3254	l = path->nodes[0];
				3255	slot = path->slots[0];
				3256	if (slot >= btrfs_header_nritems(l)) {
				3257	ret = btrfs_next_leaf(root, path);
				3258	if (ret == 0)
				3259	continue;
				3260	if (ret < 0)
				3261	goto out;
				3262
				3263	stop_loop = 1;
				3264	break;
				3265	}
				3266	btrfs_item_key_to_cpu(l, &key, slot);
				3267
				3268	if (key.type != BTRFS_EXTENT_ITEM_KEY &&
				3269	key.type != BTRFS_METADATA_ITEM_KEY)
				3270	goto next;
				3271
				3272	if (key.type == BTRFS_METADATA_ITEM_KEY)
				3273	bytes = fs_info->nodesize;
				3274	else
				3275	bytes = key.offset;
				3276
				3277	if (key.objectid + bytes <= logical)
				3278	goto next;
				3279
				3280	if (key.objectid >= logical + map->stripe_len) {
				3281	/* out of this device extent */
				3282	if (key.objectid >= logic_end)
				3283	stop_loop = 1;
				3284	break;
				3285	}
				3286
				3287	extent = btrfs_item_ptr(l, slot,
				3288	struct btrfs_extent_item);
				3289	flags = btrfs_extent_flags(l, extent);
				3290	generation = btrfs_extent_generation(l, extent);
				3291
				3292	if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
				3293	(key.objectid < logical \|\|
				3294	key.objectid + bytes >
				3295	logical + map->stripe_len)) {
				3296	btrfs_err(fs_info,
				3297	"scrub: tree block %llu spanning stripes, ignored. logical=%llu",
				3298	key.objectid, logical);
				3299	spin_lock(&sctx->stat_lock);
				3300	sctx->stat.uncorrectable_errors++;
				3301	spin_unlock(&sctx->stat_lock);
				3302	goto next;
				3303	}
				3304
				3305	again:
				3306	extent_logical = key.objectid;
				3307	extent_len = bytes;
				3308
				3309	/*
				3310	* trim extent to this stripe
				3311	*/
				3312	if (extent_logical < logical) {
				3313	extent_len -= logical - extent_logical;
				3314	extent_logical = logical;
				3315	}
				3316	if (extent_logical + extent_len >
				3317	logical + map->stripe_len) {
				3318	extent_len = logical + map->stripe_len -
				3319	extent_logical;
				3320	}
				3321
				3322	extent_physical = extent_logical - logical + physical;
				3323	extent_dev = scrub_dev;
				3324	extent_mirror_num = mirror_num;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3325	if (sctx->is_dev_replace)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3326	scrub_remap_extent(fs_info, extent_logical,
				3327	extent_len, &extent_physical,
				3328	&extent_dev,
				3329	&extent_mirror_num);
				3330
				3331	ret = btrfs_lookup_csums_range(csum_root,
				3332	extent_logical,
				3333	extent_logical +
				3334	extent_len - 1,
				3335	&sctx->csum_list, 1);
				3336	if (ret)
				3337	goto out;
				3338
				3339	ret = scrub_extent(sctx, map, extent_logical, extent_len,
				3340	extent_physical, extent_dev, flags,
				3341	generation, extent_mirror_num,
				3342	extent_logical - logical + physical);
				3343
				3344	scrub_free_csums(sctx);
				3345
				3346	if (ret)
				3347	goto out;
				3348
				3349	if (extent_logical + extent_len <
				3350	key.objectid + bytes) {
				3351	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				3352	/*
				3353	* loop until we find next data stripe
				3354	* or we have finished all stripes.
				3355	*/
				3356	loop:
				3357	physical += map->stripe_len;
				3358	ret = get_raid56_logic_offset(physical,
				3359	num, map, &logical,
				3360	&stripe_logical);
				3361	logical += base;
				3362
				3363	if (ret && physical < physical_end) {
				3364	stripe_logical += base;
				3365	stripe_end = stripe_logical +
				3366	increment;
				3367	ret = scrub_raid56_parity(sctx,
				3368	map, scrub_dev, ppath,
				3369	stripe_logical,
				3370	stripe_end);
				3371	if (ret)
				3372	goto out;
				3373	goto loop;
				3374	}
				3375	} else {
				3376	physical += map->stripe_len;
				3377	logical += increment;
				3378	}
				3379	if (logical < key.objectid + bytes) {
				3380	cond_resched();
				3381	goto again;
				3382	}
				3383
				3384	if (physical >= physical_end) {
				3385	stop_loop = 1;
				3386	break;
				3387	}
				3388	}
				3389	next:
				3390	path->slots[0]++;
				3391	}
				3392	btrfs_release_path(path);
				3393	skip:
				3394	logical += increment;
				3395	physical += map->stripe_len;
				3396	spin_lock(&sctx->stat_lock);
				3397	if (stop_loop)
				3398	sctx->stat.last_physical = map->stripes[num].physical +
				3399	length;
				3400	else
				3401	sctx->stat.last_physical = physical;
				3402	spin_unlock(&sctx->stat_lock);
				3403	if (stop_loop)
				3404	break;
				3405	}
				3406	out:
				3407	/* push queued extents */
				3408	scrub_submit(sctx);
				3409	mutex_lock(&sctx->wr_lock);
				3410	scrub_wr_submit(sctx);
				3411	mutex_unlock(&sctx->wr_lock);
				3412
				3413	blk_finish_plug(&plug);
				3414	btrfs_free_path(path);
				3415	btrfs_free_path(ppath);
				3416	return ret < 0 ? ret : 0;
				3417	}
				3418
				3419	static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
				3420	struct btrfs_device *scrub_dev,
				3421	u64 chunk_offset, u64 length,
				3422	u64 dev_offset,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3423	struct btrfs_block_group_cache *cache)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3424	{
				3425	struct btrfs_fs_info *fs_info = sctx->fs_info;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3426	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3427	struct map_lookup *map;
				3428	struct extent_map *em;
				3429	int i;
				3430	int ret = 0;
				3431
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3432	read_lock(&map_tree->lock);
				3433	em = lookup_extent_mapping(map_tree, chunk_offset, 1);
				3434	read_unlock(&map_tree->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3435
				3436	if (!em) {
				3437	/*
				3438	* Might have been an unused block group deleted by the cleaner
				3439	* kthread or relocation.
				3440	*/
				3441	spin_lock(&cache->lock);
				3442	if (!cache->removed)
				3443	ret = -EINVAL;
				3444	spin_unlock(&cache->lock);
				3445
				3446	return ret;
				3447	}
				3448
				3449	map = em->map_lookup;
				3450	if (em->start != chunk_offset)
				3451	goto out;
				3452
				3453	if (em->len < length)
				3454	goto out;
				3455
				3456	for (i = 0; i < map->num_stripes; ++i) {
				3457	if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
				3458	map->stripes[i].physical == dev_offset) {
				3459	ret = scrub_stripe(sctx, map, scrub_dev, i,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3460	chunk_offset, length);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3461	if (ret)
				3462	goto out;
				3463	}
				3464	}
				3465	out:
				3466	free_extent_map(em);
				3467
				3468	return ret;
				3469	}
				3470
				3471	static noinline_for_stack
				3472	int scrub_enumerate_chunks(struct scrub_ctx *sctx,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3473	struct btrfs_device *scrub_dev, u64 start, u64 end)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3474	{
				3475	struct btrfs_dev_extent *dev_extent = NULL;
				3476	struct btrfs_path *path;
				3477	struct btrfs_fs_info *fs_info = sctx->fs_info;
				3478	struct btrfs_root *root = fs_info->dev_root;
				3479	u64 length;
				3480	u64 chunk_offset;
				3481	int ret = 0;
				3482	int ro_set;
				3483	int slot;
				3484	struct extent_buffer *l;
				3485	struct btrfs_key key;
				3486	struct btrfs_key found_key;
				3487	struct btrfs_block_group_cache *cache;
				3488	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
				3489
				3490	path = btrfs_alloc_path();
				3491	if (!path)
				3492	return -ENOMEM;
				3493
				3494	path->reada = READA_FORWARD;
				3495	path->search_commit_root = 1;
				3496	path->skip_locking = 1;
				3497
				3498	key.objectid = scrub_dev->devid;
				3499	key.offset = 0ull;
				3500	key.type = BTRFS_DEV_EXTENT_KEY;
				3501
				3502	while (1) {
				3503	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				3504	if (ret < 0)
				3505	break;
				3506	if (ret > 0) {
				3507	if (path->slots[0] >=
				3508	btrfs_header_nritems(path->nodes[0])) {
				3509	ret = btrfs_next_leaf(root, path);
				3510	if (ret < 0)
				3511	break;
				3512	if (ret > 0) {
				3513	ret = 0;
				3514	break;
				3515	}
				3516	} else {
				3517	ret = 0;
				3518	}
				3519	}
				3520
				3521	l = path->nodes[0];
				3522	slot = path->slots[0];
				3523
				3524	btrfs_item_key_to_cpu(l, &found_key, slot);
				3525
				3526	if (found_key.objectid != scrub_dev->devid)
				3527	break;
				3528
				3529	if (found_key.type != BTRFS_DEV_EXTENT_KEY)
				3530	break;
				3531
				3532	if (found_key.offset >= end)
				3533	break;
				3534
				3535	if (found_key.offset < key.offset)
				3536	break;
				3537
				3538	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
				3539	length = btrfs_dev_extent_length(l, dev_extent);
				3540
				3541	if (found_key.offset + length <= start)
				3542	goto skip;
				3543
				3544	chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
				3545
				3546	/*
				3547	* get a reference on the corresponding block group to prevent
				3548	* the chunk from going away while we scrub it
				3549	*/
				3550	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				3551
				3552	/* some chunks are removed but not committed to disk yet,
				3553	* continue scrubbing */
				3554	if (!cache)
				3555	goto skip;
				3556
				3557	/*
				3558	* we need call btrfs_inc_block_group_ro() with scrubs_paused,
				3559	* to avoid deadlock caused by:
				3560	* btrfs_inc_block_group_ro()
				3561	* -> btrfs_wait_for_commit()
				3562	* -> btrfs_commit_transaction()
				3563	* -> btrfs_scrub_pause()
				3564	*/
				3565	scrub_pause_on(fs_info);
				3566	ret = btrfs_inc_block_group_ro(cache);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3567	if (!ret && sctx->is_dev_replace) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3568	/*
				3569	* If we are doing a device replace wait for any tasks
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3570	* that started delalloc right before we set the block
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3571	* group to RO mode, as they might have just allocated
				3572	* an extent from it or decided they could do a nocow
				3573	* write. And if any such tasks did that, wait for their
				3574	* ordered extents to complete and then commit the
				3575	* current transaction, so that we can later see the new
				3576	* extent items in the extent tree - the ordered extents
				3577	* create delayed data references (for cow writes) when
				3578	* they complete, which will be run and insert the
				3579	* corresponding extent items into the extent tree when
				3580	* we commit the transaction they used when running
				3581	* inode.c:btrfs_finish_ordered_io(). We later use
				3582	* the commit root of the extent tree to find extents
				3583	* to copy from the srcdev into the tgtdev, and we don't
				3584	* want to miss any new extents.
				3585	*/
				3586	btrfs_wait_block_group_reservations(cache);
				3587	btrfs_wait_nocow_writers(cache);
				3588	ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
				3589	cache->key.objectid,
				3590	cache->key.offset);
				3591	if (ret > 0) {
				3592	struct btrfs_trans_handle *trans;
				3593
				3594	trans = btrfs_join_transaction(root);
				3595	if (IS_ERR(trans))
				3596	ret = PTR_ERR(trans);
				3597	else
				3598	ret = btrfs_commit_transaction(trans);
				3599	if (ret) {
				3600	scrub_pause_off(fs_info);
				3601	btrfs_put_block_group(cache);
				3602	break;
				3603	}
				3604	}
				3605	}
				3606	scrub_pause_off(fs_info);
				3607
				3608	if (ret == 0) {
				3609	ro_set = 1;
				3610	} else if (ret == -ENOSPC) {
				3611	/*
				3612	* btrfs_inc_block_group_ro return -ENOSPC when it
				3613	* failed in creating new chunk for metadata.
				3614	* It is not a problem for scrub/replace, because
				3615	* metadata are always cowed, and our scrub paused
				3616	* commit_transactions.
				3617	*/
				3618	ro_set = 0;
				3619	} else {
				3620	btrfs_warn(fs_info,
				3621	"failed setting block group ro: %d", ret);
				3622	btrfs_put_block_group(cache);
				3623	break;
				3624	}
				3625
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3626	down_write(&fs_info->dev_replace.rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3627	dev_replace->cursor_right = found_key.offset + length;
				3628	dev_replace->cursor_left = found_key.offset;
				3629	dev_replace->item_needs_writeback = 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3630	up_write(&dev_replace->rwsem);
				3631
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3632	ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3633	found_key.offset, cache);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3634
				3635	/*
				3636	* flush, submit all pending read and write bios, afterwards
				3637	* wait for them.
				3638	* Note that in the dev replace case, a read request causes
				3639	* write requests that are submitted in the read completion
				3640	* worker. Therefore in the current situation, it is required
				3641	* that all write requests are flushed, so that all read and
				3642	* write requests are really completed when bios_in_flight
				3643	* changes to 0.
				3644	*/
				3645	sctx->flush_all_writes = true;
				3646	scrub_submit(sctx);
				3647	mutex_lock(&sctx->wr_lock);
				3648	scrub_wr_submit(sctx);
				3649	mutex_unlock(&sctx->wr_lock);
				3650
				3651	wait_event(sctx->list_wait,
				3652	atomic_read(&sctx->bios_in_flight) == 0);
				3653
				3654	scrub_pause_on(fs_info);
				3655
				3656	/*
				3657	* must be called before we decrease @scrub_paused.
				3658	* make sure we don't block transaction commit while
				3659	* we are waiting pending workers finished.
				3660	*/
				3661	wait_event(sctx->list_wait,
				3662	atomic_read(&sctx->workers_pending) == 0);
				3663	sctx->flush_all_writes = false;
				3664
				3665	scrub_pause_off(fs_info);
				3666
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3667	down_write(&fs_info->dev_replace.rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3668	dev_replace->cursor_left = dev_replace->cursor_right;
				3669	dev_replace->item_needs_writeback = 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3670	up_write(&fs_info->dev_replace.rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3671
				3672	if (ro_set)
				3673	btrfs_dec_block_group_ro(cache);
				3674
				3675	/*
				3676	* We might have prevented the cleaner kthread from deleting
				3677	* this block group if it was already unused because we raced
				3678	* and set it to RO mode first. So add it back to the unused
				3679	* list, otherwise it might not ever be deleted unless a manual
				3680	* balance is triggered or it becomes used and unused again.
				3681	*/
				3682	spin_lock(&cache->lock);
				3683	if (!cache->removed && !cache->ro && cache->reserved == 0 &&
				3684	btrfs_block_group_used(&cache->item) == 0) {
				3685	spin_unlock(&cache->lock);
				3686	btrfs_mark_bg_unused(cache);
				3687	} else {
				3688	spin_unlock(&cache->lock);
				3689	}
				3690
				3691	btrfs_put_block_group(cache);
				3692	if (ret)
				3693	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3694	if (sctx->is_dev_replace &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3695	atomic64_read(&dev_replace->num_write_errors) > 0) {
				3696	ret = -EIO;
				3697	break;
				3698	}
				3699	if (sctx->stat.malloc_errors > 0) {
				3700	ret = -ENOMEM;
				3701	break;
				3702	}
				3703	skip:
				3704	key.offset = found_key.offset + length;
				3705	btrfs_release_path(path);
				3706	}
				3707
				3708	btrfs_free_path(path);
				3709
				3710	return ret;
				3711	}
				3712
				3713	static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
				3714	struct btrfs_device *scrub_dev)
				3715	{
				3716	int i;
				3717	u64 bytenr;
				3718	u64 gen;
				3719	int ret;
				3720	struct btrfs_fs_info *fs_info = sctx->fs_info;
				3721
				3722	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
				3723	return -EIO;
				3724
				3725	/* Seed devices of a new filesystem has their own generation. */
				3726	if (scrub_dev->fs_devices != fs_info->fs_devices)
				3727	gen = scrub_dev->generation;
				3728	else
				3729	gen = fs_info->last_trans_committed;
				3730
				3731	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
				3732	bytenr = btrfs_sb_offset(i);
				3733	if (bytenr + BTRFS_SUPER_INFO_SIZE >
				3734	scrub_dev->commit_total_bytes)
				3735	break;
				3736
				3737	ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
				3738	scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
				3739	NULL, 1, bytenr);
				3740	if (ret)
				3741	return ret;
				3742	}
				3743	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
				3744
				3745	return 0;
				3746	}
				3747
				3748	/*
				3749	* get a reference count on fs_info->scrub_workers. start worker if necessary
				3750	*/
				3751	static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
				3752	int is_dev_replace)
				3753	{
				3754	unsigned int flags = WQ_FREEZABLE \| WQ_UNBOUND;
				3755	int max_active = fs_info->thread_pool_size;
				3756
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3757	lockdep_assert_held(&fs_info->scrub_lock);
				3758
				3759	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
				3760	ASSERT(fs_info->scrub_workers == NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3761	fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
				3762	flags, is_dev_replace ? 1 : max_active, 4);
				3763	if (!fs_info->scrub_workers)
				3764	goto fail_scrub_workers;
				3765
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3766	ASSERT(fs_info->scrub_wr_completion_workers == NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3767	fs_info->scrub_wr_completion_workers =
				3768	btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
				3769	max_active, 2);
				3770	if (!fs_info->scrub_wr_completion_workers)
				3771	goto fail_scrub_wr_completion_workers;
				3772
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3773	ASSERT(fs_info->scrub_parity_workers == NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3774	fs_info->scrub_parity_workers =
				3775	btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
				3776	max_active, 2);
				3777	if (!fs_info->scrub_parity_workers)
				3778	goto fail_scrub_parity_workers;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3779
				3780	refcount_set(&fs_info->scrub_workers_refcnt, 1);
				3781	} else {
				3782	refcount_inc(&fs_info->scrub_workers_refcnt);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3783	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3784	return 0;
				3785
				3786	fail_scrub_parity_workers:
				3787	btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
				3788	fail_scrub_wr_completion_workers:
				3789	btrfs_destroy_workqueue(fs_info->scrub_workers);
				3790	fail_scrub_workers:
				3791	return -ENOMEM;
				3792	}
				3793
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3794	int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
				3795	u64 end, struct btrfs_scrub_progress *progress,
				3796	int readonly, int is_dev_replace)
				3797	{
				3798	struct scrub_ctx *sctx;
				3799	int ret;
				3800	struct btrfs_device *dev;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3801	unsigned int nofs_flag;
				3802	struct btrfs_workqueue *scrub_workers = NULL;
				3803	struct btrfs_workqueue *scrub_wr_comp = NULL;
				3804	struct btrfs_workqueue *scrub_parity = NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3805
				3806	if (btrfs_fs_closing(fs_info))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3807	return -EAGAIN;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3808
				3809	if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
				3810	/*
				3811	* in this case scrub is unable to calculate the checksum
				3812	* the way scrub is implemented. Do not handle this
				3813	* situation at all because it won't ever happen.
				3814	*/
				3815	btrfs_err(fs_info,
				3816	"scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
				3817	fs_info->nodesize,
				3818	BTRFS_STRIPE_LEN);
				3819	return -EINVAL;
				3820	}
				3821
				3822	if (fs_info->sectorsize != PAGE_SIZE) {
				3823	/* not supported for data w/o checksums */
				3824	btrfs_err_rl(fs_info,
				3825	"scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
				3826	fs_info->sectorsize, PAGE_SIZE);
				3827	return -EINVAL;
				3828	}
				3829
				3830	if (fs_info->nodesize >
				3831	PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK \|\|
				3832	fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
				3833	/*
				3834	* would exhaust the array bounds of pagev member in
				3835	* struct scrub_block
				3836	*/
				3837	btrfs_err(fs_info,
				3838	"scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
				3839	fs_info->nodesize,
				3840	SCRUB_MAX_PAGES_PER_BLOCK,
				3841	fs_info->sectorsize,
				3842	SCRUB_MAX_PAGES_PER_BLOCK);
				3843	return -EINVAL;
				3844	}
				3845
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3846	/* Allocate outside of device_list_mutex */
				3847	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
				3848	if (IS_ERR(sctx))
				3849	return PTR_ERR(sctx);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3850
				3851	mutex_lock(&fs_info->fs_devices->device_list_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3852	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3853	if (!dev \|\| (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
				3854	!is_dev_replace)) {
				3855	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3856	ret = -ENODEV;
				3857	goto out_free_ctx;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3858	}
				3859
				3860	if (!is_dev_replace && !readonly &&
				3861	!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
				3862	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				3863	btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
				3864	rcu_str_deref(dev->name));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3865	ret = -EROFS;
				3866	goto out_free_ctx;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3867	}
				3868
				3869	mutex_lock(&fs_info->scrub_lock);
				3870	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) \|\|
				3871	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
				3872	mutex_unlock(&fs_info->scrub_lock);
				3873	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3874	ret = -EIO;
				3875	goto out_free_ctx;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3876	}
				3877
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3878	down_read(&fs_info->dev_replace.rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3879	if (dev->scrub_ctx \|\|
				3880	(!is_dev_replace &&
				3881	btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3882	up_read(&fs_info->dev_replace.rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3883	mutex_unlock(&fs_info->scrub_lock);
				3884	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3885	ret = -EINPROGRESS;
				3886	goto out_free_ctx;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3887	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3888	up_read(&fs_info->dev_replace.rwsem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3889
				3890	ret = scrub_workers_get(fs_info, is_dev_replace);
				3891	if (ret) {
				3892	mutex_unlock(&fs_info->scrub_lock);
				3893	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3894	goto out_free_ctx;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3895	}
				3896
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3897	sctx->readonly = readonly;
				3898	dev->scrub_ctx = sctx;
				3899	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				3900
				3901	/*
				3902	* checking @scrub_pause_req here, we can avoid
				3903	* race between committing transaction and scrubbing.
				3904	*/
				3905	__scrub_blocked_if_needed(fs_info);
				3906	atomic_inc(&fs_info->scrubs_running);
				3907	mutex_unlock(&fs_info->scrub_lock);
				3908
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3909	/*
				3910	* In order to avoid deadlock with reclaim when there is a transaction
				3911	* trying to pause scrub, make sure we use GFP_NOFS for all the
				3912	* allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
				3913	* invoked by our callees. The pausing request is done when the
				3914	* transaction commit starts, and it blocks the transaction until scrub
				3915	* is paused (done at specific points at scrub_stripe() or right above
				3916	* before incrementing fs_info->scrubs_running).
				3917	*/
				3918	nofs_flag = memalloc_nofs_save();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3919	if (!is_dev_replace) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3920	btrfs_info(fs_info, "scrub: started on devid %llu", devid);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3921	/*
				3922	* by holding device list mutex, we can
				3923	* kick off writing super in log tree sync.
				3924	*/
				3925	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				3926	ret = scrub_supers(sctx, dev);
				3927	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				3928	}
				3929
				3930	if (!ret)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3931	ret = scrub_enumerate_chunks(sctx, dev, start, end);
				3932	memalloc_nofs_restore(nofs_flag);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3933
				3934	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
				3935	atomic_dec(&fs_info->scrubs_running);
				3936	wake_up(&fs_info->scrub_pause_wait);
				3937
				3938	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
				3939
				3940	if (progress)
				3941	memcpy(progress, &sctx->stat, sizeof(*progress));
				3942
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3943	if (!is_dev_replace)
				3944	btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
				3945	ret ? "not finished" : "finished", devid, ret);
				3946
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3947	mutex_lock(&fs_info->scrub_lock);
				3948	dev->scrub_ctx = NULL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3949	if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
				3950	scrub_workers = fs_info->scrub_workers;
				3951	scrub_wr_comp = fs_info->scrub_wr_completion_workers;
				3952	scrub_parity = fs_info->scrub_parity_workers;
				3953
				3954	fs_info->scrub_workers = NULL;
				3955	fs_info->scrub_wr_completion_workers = NULL;
				3956	fs_info->scrub_parity_workers = NULL;
				3957	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3958	mutex_unlock(&fs_info->scrub_lock);
				3959
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3960	btrfs_destroy_workqueue(scrub_workers);
				3961	btrfs_destroy_workqueue(scrub_wr_comp);
				3962	btrfs_destroy_workqueue(scrub_parity);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3963	scrub_put_ctx(sctx);
				3964
				3965	return ret;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	3966
				3967	out_free_ctx:
				3968	scrub_free_ctx(sctx);
				3969
				3970	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3971	}
				3972
				3973	void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
				3974	{
				3975	mutex_lock(&fs_info->scrub_lock);
				3976	atomic_inc(&fs_info->scrub_pause_req);
				3977	while (atomic_read(&fs_info->scrubs_paused) !=
				3978	atomic_read(&fs_info->scrubs_running)) {
				3979	mutex_unlock(&fs_info->scrub_lock);
				3980	wait_event(fs_info->scrub_pause_wait,
				3981	atomic_read(&fs_info->scrubs_paused) ==
				3982	atomic_read(&fs_info->scrubs_running));
				3983	mutex_lock(&fs_info->scrub_lock);
				3984	}
				3985	mutex_unlock(&fs_info->scrub_lock);
				3986	}
				3987
				3988	void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
				3989	{
				3990	atomic_dec(&fs_info->scrub_pause_req);
				3991	wake_up(&fs_info->scrub_pause_wait);
				3992	}
				3993
				3994	int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
				3995	{
				3996	mutex_lock(&fs_info->scrub_lock);
				3997	if (!atomic_read(&fs_info->scrubs_running)) {
				3998	mutex_unlock(&fs_info->scrub_lock);
				3999	return -ENOTCONN;
				4000	}
				4001
				4002	atomic_inc(&fs_info->scrub_cancel_req);
				4003	while (atomic_read(&fs_info->scrubs_running)) {
				4004	mutex_unlock(&fs_info->scrub_lock);
				4005	wait_event(fs_info->scrub_pause_wait,
				4006	atomic_read(&fs_info->scrubs_running) == 0);
				4007	mutex_lock(&fs_info->scrub_lock);
				4008	}
				4009	atomic_dec(&fs_info->scrub_cancel_req);
				4010	mutex_unlock(&fs_info->scrub_lock);
				4011
				4012	return 0;
				4013	}
				4014
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	4015	int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4016	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	4017	struct btrfs_fs_info *fs_info = dev->fs_info;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4018	struct scrub_ctx *sctx;
				4019
				4020	mutex_lock(&fs_info->scrub_lock);
				4021	sctx = dev->scrub_ctx;
				4022	if (!sctx) {
				4023	mutex_unlock(&fs_info->scrub_lock);
				4024	return -ENOTCONN;
				4025	}
				4026	atomic_inc(&sctx->cancel_req);
				4027	while (dev->scrub_ctx) {
				4028	mutex_unlock(&fs_info->scrub_lock);
				4029	wait_event(fs_info->scrub_pause_wait,
				4030	dev->scrub_ctx == NULL);
				4031	mutex_lock(&fs_info->scrub_lock);
				4032	}
				4033	mutex_unlock(&fs_info->scrub_lock);
				4034
				4035	return 0;
				4036	}
				4037
				4038	int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
				4039	struct btrfs_scrub_progress *progress)
				4040	{
				4041	struct btrfs_device *dev;
				4042	struct scrub_ctx *sctx = NULL;
				4043
				4044	mutex_lock(&fs_info->fs_devices->device_list_mutex);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	4045	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4046	if (dev)
				4047	sctx = dev->scrub_ctx;
				4048	if (sctx)
				4049	memcpy(progress, &sctx->stat, sizeof(*progress));
				4050	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				4051
				4052	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
				4053	}
				4054
				4055	static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
				4056	u64 extent_logical, u64 extent_len,
				4057	u64 *extent_physical,
				4058	struct btrfs_device **extent_dev,
				4059	int *extent_mirror_num)
				4060	{
				4061	u64 mapped_length;
				4062	struct btrfs_bio *bbio = NULL;
				4063	int ret;
				4064
				4065	mapped_length = extent_len;
				4066	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
				4067	&mapped_length, &bbio, 0);
				4068	if (ret \|\| !bbio \|\| mapped_length < extent_len \|\|
				4069	!bbio->stripes[0].dev->bdev) {
				4070	btrfs_put_bbio(bbio);
				4071	return;
				4072	}
				4073
				4074	*extent_physical = bbio->stripes[0].physical;
				4075	*extent_mirror_num = bbio->mirror_num;
				4076	*extent_dev = bbio->stripes[0].dev;
				4077	btrfs_put_bbio(bbio);
				4078	}