Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 811427e..8a62c92 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* raid10.c : Multiple Devices driver for Linux
*
@@ -6,16 +7,6 @@
* RAID-10 support for md.
*
* Base on code in raid1.c. See raid1.c for further copyright information.
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
@@ -25,6 +16,7 @@
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
+#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
@@ -72,31 +64,6 @@
* [B A] [D C] [B A] [E C D]
*/
-/*
- * Number of guaranteed r10bios in case of extreme VM load:
- */
-#define NR_RAID10_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queued to be written by
- * the raid10 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
@@ -131,11 +98,6 @@
return kzalloc(size, gfp_flags);
}
-static void r10bio_pool_free(void *r10_bio, void *data)
-{
- kfree(r10_bio);
-}
-
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
@@ -229,7 +191,7 @@
out_free_pages:
while (--j >= 0)
- resync_free_pages(&rps[j * 2]);
+ resync_free_pages(&rps[j]);
j = 0;
out_free_bio:
@@ -241,7 +203,7 @@
}
kfree(rps);
out_free_r10bio:
- r10bio_pool_free(r10_bio, conf);
+ rbio_pool_free(r10_bio, conf);
return NULL;
}
@@ -269,7 +231,7 @@
/* resync pages array stored in the 1st bio's .bi_private */
kfree(rp);
- r10bio_pool_free(r10bio, conf);
+ rbio_pool_free(r10bio, conf);
}
static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
@@ -503,19 +465,21 @@
if (test_bit(FailFast, &rdev->flags) &&
(bio->bi_opf & MD_FAILFAST)) {
md_error(rdev->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R10BIO_WriteError, &r10_bio->state);
- else {
- r10_bio->devs[slot].bio = NULL;
- to_put = bio;
- dec_rdev = 1;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, it is not necessary to
+ * handle write error.
+ * For failfast, this is the only remaining device,
+ * We need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_WriteError, &r10_bio->state);
+ else {
+ r10_bio->devs[slot].bio = NULL;
+ to_put = bio;
+ dec_rdev = 1;
+ }
}
} else {
/*
@@ -745,15 +709,19 @@
int sectors = r10_bio->sectors;
int best_good_sectors;
sector_t new_distance, best_dist;
- struct md_rdev *best_rdev, *rdev = NULL;
+ struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
int do_balance;
- int best_slot;
+ int best_dist_slot, best_pending_slot;
+ bool has_nonrot_disk = false;
+ unsigned int min_pending;
struct geom *geo = &conf->geo;
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
- best_slot = -1;
- best_rdev = NULL;
+ best_dist_slot = -1;
+ min_pending = UINT_MAX;
+ best_dist_rdev = NULL;
+ best_pending_rdev = NULL;
best_dist = MaxSector;
best_good_sectors = 0;
do_balance = 1;
@@ -775,6 +743,8 @@
sector_t first_bad;
int bad_sectors;
sector_t dev_sector;
+ unsigned int pending;
+ bool nonrot;
if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue;
@@ -811,8 +781,8 @@
first_bad - dev_sector;
if (good_sectors > best_good_sectors) {
best_good_sectors = good_sectors;
- best_slot = slot;
- best_rdev = rdev;
+ best_dist_slot = slot;
+ best_dist_rdev = rdev;
}
if (!do_balance)
/* Must read from here */
@@ -825,14 +795,23 @@
if (!do_balance)
break;
- if (best_slot >= 0)
+ nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+ has_nonrot_disk |= nonrot;
+ pending = atomic_read(&rdev->nr_pending);
+ if (min_pending > pending && nonrot) {
+ min_pending = pending;
+ best_pending_slot = slot;
+ best_pending_rdev = rdev;
+ }
+
+ if (best_dist_slot >= 0)
/* At least 2 disks to choose from so failfast is OK */
set_bit(R10BIO_FailFast, &r10_bio->state);
/* This optimisation is debatable, and completely destroys
* sequential read speed for 'far copies' arrays. So only
* keep it for 'near' arrays, and review those later.
*/
- if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+ if (geo->near_copies > 1 && !pending)
new_distance = 0;
/* for far > 1 always use the lowest address */
@@ -841,15 +820,21 @@
else
new_distance = abs(r10_bio->devs[slot].addr -
conf->mirrors[disk].head_position);
+
if (new_distance < best_dist) {
best_dist = new_distance;
- best_slot = slot;
- best_rdev = rdev;
+ best_dist_slot = slot;
+ best_dist_rdev = rdev;
}
}
if (slot >= conf->copies) {
- slot = best_slot;
- rdev = best_rdev;
+ if (has_nonrot_disk) {
+ slot = best_pending_slot;
+ rdev = best_pending_rdev;
+ } else {
+ slot = best_dist_slot;
+ rdev = best_dist_rdev;
+ }
}
if (slot >= 0) {
@@ -1123,6 +1108,29 @@
kfree(plug);
}
+/*
+ * 1. Register the new request and wait if the reconstruction thread has put
+ * up a bar for new requests. Continue immediately if no resync is active
+ * currently.
+ * 2. If IO spans the reshape position. Need to wait for reshape to pass.
+ */
+static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
+ struct bio *bio, sector_t sectors)
+{
+ wait_barrier(conf);
+ while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ bio->bi_iter.bi_sector < conf->reshape_progress &&
+ bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
+ raid10_log(conf->mddev, "wait reshape");
+ allow_barrier(conf);
+ wait_event(conf->wait_barrier,
+ conf->reshape_progress <= bio->bi_iter.bi_sector ||
+ conf->reshape_progress >= bio->bi_iter.bi_sector +
+ sectors);
+ wait_barrier(conf);
+ }
+}
+
static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
{
@@ -1131,7 +1139,6 @@
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
int max_sectors;
- sector_t sectors;
struct md_rdev *rdev;
char b[BDEVNAME_SIZE];
int slot = r10_bio->read_slot;
@@ -1165,30 +1172,8 @@
}
rcu_read_unlock();
}
- /*
- * Register the new request and wait if the reconstruction
- * thread has put up a bar for new requests.
- * Continue immediately if no resync is active currently.
- */
- wait_barrier(conf);
- sectors = r10_bio->sectors;
- while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- bio->bi_iter.bi_sector < conf->reshape_progress &&
- bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
- /*
- * IO spans the reshape position. Need to wait for reshape to
- * pass
- */
- raid10_log(conf->mddev, "wait reshape");
- allow_barrier(conf);
- wait_event(conf->wait_barrier,
- conf->reshape_progress <= bio->bi_iter.bi_sector ||
- conf->reshape_progress >= bio->bi_iter.bi_sector +
- sectors);
- wait_barrier(conf);
- }
-
+ regular_request_wait(mddev, conf, bio, r10_bio->sectors);
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
if (err_rdev) {
@@ -1208,7 +1193,9 @@
struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split);
bio_chain(split, bio);
+ allow_barrier(conf);
generic_make_request(bio);
+ wait_barrier(conf);
bio = split;
r10_bio->master_bio = bio;
r10_bio->sectors = max_sectors;
@@ -1331,30 +1318,8 @@
finish_wait(&conf->wait_barrier, &w);
}
- /*
- * Register the new request and wait if the reconstruction
- * thread has put up a bar for new requests.
- * Continue immediately if no resync is active currently.
- */
- wait_barrier(conf);
-
sectors = r10_bio->sectors;
- while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- bio->bi_iter.bi_sector < conf->reshape_progress &&
- bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
- /*
- * IO spans the reshape position. Need to wait for reshape to
- * pass
- */
- raid10_log(conf->mddev, "wait reshape");
- allow_barrier(conf);
- wait_event(conf->wait_barrier,
- conf->reshape_progress <= bio->bi_iter.bi_sector ||
- conf->reshape_progress >= bio->bi_iter.bi_sector +
- sectors);
- wait_barrier(conf);
- }
-
+ regular_request_wait(mddev, conf, bio, sectors);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
(mddev->reshape_backwards
? (bio->bi_iter.bi_sector < conf->reshape_safe &&
@@ -1513,7 +1478,9 @@
struct bio *split = bio_split(bio, r10_bio->sectors,
GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
+ allow_barrier(conf);
generic_make_request(bio);
+ wait_barrier(conf);
bio = split;
r10_bio->master_bio = bio;
}
@@ -1673,12 +1640,12 @@
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disks with "fail_last_dev == false",
+ * ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& !enough(conf, rdev->raid_disk)) {
/*
* Don't fail the drive, just return an IO error.
@@ -3080,6 +3047,8 @@
sector_t sect;
int must_sync;
int any_working;
+ int need_recover = 0;
+ int need_replace = 0;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
@@ -3087,11 +3056,15 @@
mrdev = rcu_dereference(mirror->rdev);
mreplace = rcu_dereference(mirror->replacement);
- if ((mrdev == NULL ||
- test_bit(Faulty, &mrdev->flags) ||
- test_bit(In_sync, &mrdev->flags)) &&
- (mreplace == NULL ||
- test_bit(Faulty, &mreplace->flags))) {
+ if (mrdev != NULL &&
+ !test_bit(Faulty, &mrdev->flags) &&
+ !test_bit(In_sync, &mrdev->flags))
+ need_recover = 1;
+ if (mreplace != NULL &&
+ !test_bit(Faulty, &mreplace->flags))
+ need_replace = 1;
+
+ if (!need_recover && !need_replace) {
rcu_read_unlock();
continue;
}
@@ -3214,7 +3187,7 @@
r10_bio->devs[1].devnum = i;
r10_bio->devs[1].addr = to_addr;
- if (!test_bit(In_sync, &mrdev->flags)) {
+ if (need_recover) {
bio = r10_bio->devs[1].bio;
bio->bi_next = biolist;
biolist = bio;
@@ -3231,16 +3204,11 @@
bio = r10_bio->devs[1].repl_bio;
if (bio)
bio->bi_end_io = NULL;
- /* Note: if mreplace != NULL, then bio
+ /* Note: if need_replace, then bio
* cannot be NULL as r10buf_pool_alloc will
* have allocated it.
- * So the second test here is pointless.
- * But it keeps semantic-checkers happy, and
- * this comment keeps human reviewers
- * happy.
*/
- if (mreplace == NULL || bio == NULL ||
- test_bit(Faulty, &mreplace->flags))
+ if (!need_replace)
break;
bio->bi_next = biolist;
biolist = bio;
@@ -3700,8 +3668,8 @@
conf->geo = geo;
conf->copies = copies;
- err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
- r10bio_pool_free, conf);
+ err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
+ rbio_pool_free, conf);
if (err)
goto out;
@@ -3955,6 +3923,8 @@
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
+ if (!mddev->sync_thread)
+ goto out_free_conf;
}
return 0;
@@ -4287,12 +4257,46 @@
spin_unlock_irq(&conf->device_lock);
if (mddev->delta_disks && mddev->bitmap) {
- ret = md_bitmap_resize(mddev->bitmap,
- raid10_size(mddev, 0, conf->geo.raid_disks),
- 0, 0);
+ struct mdp_superblock_1 *sb = NULL;
+ sector_t oldsize, newsize;
+
+ oldsize = raid10_size(mddev, 0, 0);
+ newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
+
+ if (!mddev_is_clustered(mddev)) {
+ ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+ if (ret)
+ goto abort;
+ else
+ goto out;
+ }
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk > -1 &&
+ !test_bit(Faulty, &rdev->flags))
+ sb = page_address(rdev->sb_page);
+ }
+
+ /*
+ * some node is already performing reshape, and no need to
+ * call md_bitmap_resize again since it should be called when
+ * receiving BITMAP_RESIZE msg
+ */
+ if ((sb && (le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
+ goto out;
+
+ ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
if (ret)
goto abort;
+
+ ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+ if (ret) {
+ md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+ goto abort;
+ }
}
+out:
if (mddev->delta_disks > 0) {
rdev_for_each(rdev, mddev)
if (rdev->raid_disk < 0 &&
@@ -4569,6 +4573,32 @@
r10_bio->master_bio = read_bio;
r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
+ /*
+ * Broadcast RESYNC message to other nodes, so all nodes would not
+ * write to the region to avoid conflict.
+ */
+ if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
+ struct mdp_superblock_1 *sb = NULL;
+ int sb_reshape_pos = 0;
+
+ conf->cluster_sync_low = sector_nr;
+ conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
+ sb = page_address(rdev->sb_page);
+ if (sb) {
+ sb_reshape_pos = le64_to_cpu(sb->reshape_position);
+ /*
+ * Set cluster_sync_low again if next address for array
+ * reshape is less than cluster_sync_low. Since we can't
+ * update cluster_sync_low until it has finished reshape.
+ */
+ if (sb_reshape_pos < conf->cluster_sync_low)
+ conf->cluster_sync_low = sb_reshape_pos;
+ }
+
+ md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+ conf->cluster_sync_high);
+ }
+
/* Now find the locations in the new layout */
__raid10_find_phys(&conf->geo, r10_bio);
@@ -4626,7 +4656,6 @@
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
generic_make_request(read_bio);
- sector_nr += nr_sectors;
sectors_done += nr_sectors;
if (sector_nr <= last)
goto read_more;
@@ -4720,6 +4749,19 @@
conf->fullsync = 0;
}
+static void raid10_update_reshape_pos(struct mddev *mddev)
+{
+ struct r10conf *conf = mddev->private;
+ sector_t lo, hi;
+
+ md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+ if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
+ || mddev->reshape_position == MaxSector)
+ conf->reshape_progress = mddev->reshape_position;
+ else
+ WARN_ON_ONCE(1);
+}
+
static int handle_reshape_read_error(struct mddev *mddev,
struct r10bio *r10_bio)
{
@@ -4731,8 +4773,7 @@
int idx = 0;
struct page **pages;
- r10b = kmalloc(sizeof(*r10b) +
- sizeof(struct r10dev) * conf->copies, GFP_NOIO);
+ r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
if (!r10b) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return -ENOMEM;
@@ -4888,6 +4929,7 @@
.check_reshape = raid10_check_reshape,
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
+ .update_reshape_pos = raid10_update_reshape_pos,
.congested = raid10_congested,
};