Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8668793..1be7abe 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
md.c : Multiple Devices driver for Linux
Copyright (C) 1998, 1999, 2000 Ingo Molnar
@@ -22,14 +23,6 @@
- persistent bitmap code
Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- You should have received a copy of the GNU General Public License
- (for example /usr/src/linux/COPYING); if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
Errors, Warnings, etc.
Please use:
@@ -44,6 +37,7 @@
*/
+#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
@@ -88,8 +82,7 @@
struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
-struct module *md_cluster_mod;
-EXPORT_SYMBOL(md_cluster_mod);
+static struct module *md_cluster_mod;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
@@ -132,22 +125,75 @@
mddev->sync_speed_max : sysctl_speed_limit_max;
}
-static void * flush_info_alloc(gfp_t gfp_flags, void *data)
+static int rdev_init_wb(struct md_rdev *rdev)
{
- return kzalloc(sizeof(struct flush_info), gfp_flags);
-}
-static void flush_info_free(void *flush_info, void *data)
-{
- kfree(flush_info);
+ if (rdev->bdev->bd_queue->nr_hw_queues == 1)
+ return 0;
+
+ spin_lock_init(&rdev->wb_list_lock);
+ INIT_LIST_HEAD(&rdev->wb_list);
+ init_waitqueue_head(&rdev->wb_io_wait);
+ set_bit(WBCollisionCheck, &rdev->flags);
+
+ return 1;
}
-static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
+/*
+ * Create wb_info_pool if rdev is the first multi-queue device flaged
+ * with writemostly, also write-behind mode is enabled.
+ */
+void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend)
{
- return kzalloc(sizeof(struct flush_bio), gfp_flags);
+ if (mddev->bitmap_info.max_write_behind == 0)
+ return;
+
+ if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
+ return;
+
+ if (mddev->wb_info_pool == NULL) {
+ unsigned int noio_flag;
+
+ if (!is_suspend)
+ mddev_suspend(mddev);
+ noio_flag = memalloc_noio_save();
+ mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
+ sizeof(struct wb_info));
+ memalloc_noio_restore(noio_flag);
+ if (!mddev->wb_info_pool)
+ pr_err("can't alloc memory pool for writemostly\n");
+ if (!is_suspend)
+ mddev_resume(mddev);
+ }
}
-static void flush_bio_free(void *flush_bio, void *data)
+EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
+
+/*
+ * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck.
+ */
+static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
{
- kfree(flush_bio);
+ if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
+ return;
+
+ if (mddev->wb_info_pool) {
+ struct md_rdev *temp;
+ int num = 0;
+
+ /*
+ * Check if other rdevs need wb_info_pool.
+ */
+ rdev_for_each(temp, mddev)
+ if (temp != rdev &&
+ test_bit(WBCollisionCheck, &temp->flags))
+ num++;
+ if (!num) {
+ mddev_suspend(rdev->mddev);
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
+ mddev_resume(rdev->mddev);
+ }
+ }
}
static struct ctl_table_header *raid_table_header;
@@ -207,15 +253,10 @@
struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev)
{
- struct bio *b;
-
if (!mddev || !bioset_initialized(&mddev->bio_set))
return bio_alloc(gfp_mask, nr_iovecs);
- b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
- if (!b)
- return NULL;
- return b;
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);
@@ -334,7 +375,11 @@
const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = q->queuedata;
unsigned int sectors;
- int cpu;
+
+ if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
blk_queue_split(q, &bio);
@@ -359,9 +404,9 @@
md_handle_request(mddev, bio);
- cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
+ part_stat_lock();
+ part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
+ part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();
return BLK_QC_T_NONE;
@@ -429,54 +474,31 @@
/*
* Generic flush handling for md
*/
-static void submit_flushes(struct work_struct *ws)
+
+static void md_end_flush(struct bio *bio)
{
- struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
- struct mddev *mddev = fi->mddev;
- struct bio *bio = fi->bio;
-
- bio->bi_opf &= ~REQ_PREFLUSH;
- md_handle_request(mddev, bio);
-
- mempool_free(fi, mddev->flush_pool);
-}
-
-static void md_end_flush(struct bio *fbio)
-{
- struct flush_bio *fb = fbio->bi_private;
- struct md_rdev *rdev = fb->rdev;
- struct flush_info *fi = fb->fi;
- struct bio *bio = fi->bio;
- struct mddev *mddev = fi->mddev;
+ struct md_rdev *rdev = bio->bi_private;
+ struct mddev *mddev = rdev->mddev;
rdev_dec_pending(rdev, mddev);
- if (atomic_dec_and_test(&fi->flush_pending)) {
- if (bio->bi_iter.bi_size == 0) {
- /* an empty barrier - all done */
- bio_endio(bio);
- mempool_free(fi, mddev->flush_pool);
- } else {
- INIT_WORK(&fi->flush_work, submit_flushes);
- queue_work(md_wq, &fi->flush_work);
- }
+ if (atomic_dec_and_test(&mddev->flush_pending)) {
+ /* The pre-request flush has finished */
+ queue_work(md_wq, &mddev->flush_work);
}
-
- mempool_free(fb, mddev->flush_bio_pool);
- bio_put(fbio);
+ bio_put(bio);
}
-void md_flush_request(struct mddev *mddev, struct bio *bio)
+static void md_submit_flush_data(struct work_struct *ws);
+
+static void submit_flushes(struct work_struct *ws)
{
+ struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;
- struct flush_info *fi;
- fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
-
- fi->bio = bio;
- fi->mddev = mddev;
- atomic_set(&fi->flush_pending, 1);
-
+ mddev->start_flush = ktime_get_boottime();
+ INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+ atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
@@ -486,37 +508,74 @@
* we reclaim rcu_read_lock
*/
struct bio *bi;
- struct flush_bio *fb;
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
-
- fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
- fb->fi = fi;
- fb->rdev = rdev;
-
bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
- bio_set_dev(bi, rdev->bdev);
bi->bi_end_io = md_end_flush;
- bi->bi_private = fb;
+ bi->bi_private = rdev;
+ bio_set_dev(bi, rdev->bdev);
bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
- atomic_inc(&fi->flush_pending);
+ atomic_inc(&mddev->flush_pending);
submit_bio(bi);
-
rcu_read_lock();
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
+ if (atomic_dec_and_test(&mddev->flush_pending))
+ queue_work(md_wq, &mddev->flush_work);
+}
- if (atomic_dec_and_test(&fi->flush_pending)) {
- if (bio->bi_iter.bi_size == 0) {
+static void md_submit_flush_data(struct work_struct *ws)
+{
+ struct mddev *mddev = container_of(ws, struct mddev, flush_work);
+ struct bio *bio = mddev->flush_bio;
+
+ /*
+ * must reset flush_bio before calling into md_handle_request to avoid a
+ * deadlock, because other bios passed md_handle_request suspend check
+ * could wait for this and below md_handle_request could wait for those
+ * bios because of suspend check
+ */
+ mddev->last_flush = mddev->start_flush;
+ mddev->flush_bio = NULL;
+ wake_up(&mddev->sb_wait);
+
+ if (bio->bi_iter.bi_size == 0) {
+ /* an empty barrier - all done */
+ bio_endio(bio);
+ } else {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ md_handle_request(mddev, bio);
+ }
+}
+
+void md_flush_request(struct mddev *mddev, struct bio *bio)
+{
+ ktime_t start = ktime_get_boottime();
+ spin_lock_irq(&mddev->lock);
+ wait_event_lock_irq(mddev->sb_wait,
+ !mddev->flush_bio ||
+ ktime_after(mddev->last_flush, start),
+ mddev->lock);
+ if (!ktime_after(mddev->last_flush, start)) {
+ WARN_ON(mddev->flush_bio);
+ mddev->flush_bio = bio;
+ bio = NULL;
+ }
+ spin_unlock_irq(&mddev->lock);
+
+ if (!bio) {
+ INIT_WORK(&mddev->flush_work, submit_flushes);
+ queue_work(md_wq, &mddev->flush_work);
+ } else {
+ /* flush was performed for some other bio while we waited. */
+ if (bio->bi_iter.bi_size == 0)
/* an empty barrier - all done */
bio_endio(bio);
- mempool_free(fi, mddev->flush_pool);
- } else {
- INIT_WORK(&fi->flush_work, submit_flushes);
- queue_work(md_wq, &fi->flush_work);
+ else {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ mddev->pers->make_request(mddev, bio);
}
}
}
@@ -566,6 +625,7 @@
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
spin_lock_init(&mddev->lock);
+ atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
@@ -1115,8 +1175,7 @@
* (not needed for Linear and RAID0 as metadata doesn't
* record this size)
*/
- if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
- sb->level >= 1)
+ if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
rdev->sectors = (sector_t)(2ULL << 32) - 2;
if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
@@ -1178,6 +1237,8 @@
mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0)
+ mddev->layout = -1;
if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;
@@ -1414,8 +1475,7 @@
/* Limit to 4TB as metadata cannot record more than that.
* 4TB == 2^32 KB, or 2*2^32 sectors.
*/
- if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
- rdev->mddev->level >= 1)
+ if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
num_sectors = (sector_t)(2ULL << 32) - 2;
do {
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -1559,7 +1619,7 @@
*/
s32 offset;
sector_t bb_sector;
- u64 *bbp;
+ __le64 *bbp;
int i;
int sectors = le16_to_cpu(sb->bblog_size);
if (sectors > (PAGE_SIZE / 512))
@@ -1571,7 +1631,7 @@
if (!sync_page_io(rdev, bb_sector, sectors << 9,
rdev->bb_page, REQ_OP_READ, 0, true))
return -EIO;
- bbp = (u64 *)page_address(rdev->bb_page);
+ bbp = (__le64 *)page_address(rdev->bb_page);
rdev->badblocks.shift = sb->bblog_shift;
for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
u64 bb = le64_to_cpu(*bbp);
@@ -1594,6 +1654,10 @@
rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
}
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
+ sb->level != 0)
+ return -EINVAL;
+
if (!refdev) {
ret = 1;
} else {
@@ -1704,6 +1768,10 @@
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0 &&
+ !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
+ mddev->layout = -1;
+
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
@@ -1773,8 +1841,15 @@
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
- } else
- set_bit(In_sync, &rdev->flags);
+ } else {
+ /*
+ * If the array is FROZEN, then the device can't
+ * be in_sync with rest of array.
+ */
+ if (!test_bit(MD_RECOVERY_FROZEN,
+ &mddev->recovery))
+ set_bit(In_sync, &rdev->flags);
+ }
rdev->raid_disk = role;
break;
}
@@ -1883,7 +1958,7 @@
md_error(mddev, rdev);
else {
struct badblocks *bb = &rdev->badblocks;
- u64 *bbp = (u64 *)page_address(rdev->bb_page);
+ __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
u64 *p = bb->page;
sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
if (bb->changed) {
@@ -2148,14 +2223,12 @@
*/
int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
- struct blk_integrity *bi_rdev;
struct blk_integrity *bi_mddev;
char name[BDEVNAME_SIZE];
if (!mddev->gendisk)
return 0;
- bi_rdev = bdev_get_integrity(rdev->bdev);
bi_mddev = blk_get_integrity(mddev->gendisk);
if (!bi_mddev) /* nothing to do */
@@ -2231,6 +2304,9 @@
rdev->mddev = mddev;
pr_debug("md: bind<%s>\n", b);
+ if (mddev->raid_disks)
+ mddev_create_wb_pool(mddev, rdev, false);
+
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2267,6 +2343,7 @@
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
+ mddev_destroy_wb_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2779,8 +2856,10 @@
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
+ mddev_create_wb_pool(rdev->mddev, rdev, false);
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
+ mddev_destroy_wb_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "blocked")) {
@@ -2863,8 +2942,10 @@
err = 0;
}
} else if (cmd_match(buf, "re-add")) {
- if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
- rdev->saved_raid_disk >= 0) {
+ if (!rdev->mddev->pers)
+ err = -EINVAL;
+ else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
+ rdev->saved_raid_disk >= 0) {
/* clear_bit is performed _after_ all the devices
* have their local Faulty bit cleared. If any writes
* happen in the meantime in the local node, they
@@ -3375,7 +3456,7 @@
if (!entry->show)
return -EIO;
if (!rdev->mddev)
- return -EBUSY;
+ return -ENODEV;
return entry->show(rdev, page);
}
@@ -3392,10 +3473,10 @@
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- rv = mddev ? mddev_lock(mddev): -EBUSY;
+ rv = mddev ? mddev_lock(mddev) : -ENODEV;
if (!rv) {
if (rdev->mddev == NULL)
- rv = -EBUSY;
+ rv = -ENODEV;
else
rv = entry->store(rdev, page, length);
mddev_unlock(mddev);
@@ -3605,11 +3686,7 @@
return -EINVAL;
if (decimals < 0)
decimals = 0;
- while (decimals < scale) {
- result *= 10;
- decimals ++;
- }
- *res = result;
+ *res = result * int_pow(10, scale - decimals);
return 0;
}
@@ -4096,12 +4173,17 @@
* active-idle
* like active, but no writes have been seen for a while (100msec).
*
+ * broken
+ * RAID0/LINEAR-only: same as clean, but array is missing a member.
+ * It's useful because RAID0/LINEAR mounted-arrays aren't stopped
+ * when a member is gone, so this state will at least alert the
+ * user that something is wrong.
*/
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
- write_pending, active_idle, bad_word};
+ write_pending, active_idle, broken, bad_word};
static char *array_states[] = {
"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
- "write-pending", "active-idle", NULL };
+ "write-pending", "active-idle", "broken", NULL };
static int match_word(const char *word, char **list)
{
@@ -4117,7 +4199,7 @@
{
enum array_state st = inactive;
- if (mddev->pers)
+ if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
switch(mddev->ro) {
case 1:
st = readonly;
@@ -4137,7 +4219,10 @@
st = active;
spin_unlock(&mddev->lock);
}
- else {
+
+ if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
+ st = broken;
+ } else {
if (list_empty(&mddev->disks) &&
mddev->raid_disks == 0 &&
mddev->dev_sectors == 0)
@@ -4251,6 +4336,7 @@
break;
case write_pending:
case active_idle:
+ case broken:
/* these cannot be set */
break;
}
@@ -5123,6 +5209,34 @@
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
consistency_policy_store);
+static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->fail_last_dev);
+}
+
+/*
+ * Setting fail_last_dev to true to allow last device to be forcibly removed
+ * from RAID1/RAID10.
+ */
+static ssize_t
+fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int ret;
+ bool value;
+
+ ret = kstrtobool(buf, &value);
+ if (ret)
+ return ret;
+
+ if (value != mddev->fail_last_dev)
+ mddev->fail_last_dev = value;
+
+ return len;
+}
+static struct md_sysfs_entry md_fail_last_dev =
+__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
+ fail_last_dev_store);
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
@@ -5139,6 +5253,7 @@
&md_array_size.attr,
&max_corr_read_errors.attr,
&md_consistency_policy.attr,
+ &md_fail_last_dev.attr,
NULL,
};
@@ -5257,7 +5372,8 @@
{
if (mddev->writes_pending.percpu_count_ptr)
return 0;
- if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+ if (percpu_ref_init(&mddev->writes_pending, no_op,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
return -ENOMEM;
/* We want to start with the refcount at zero */
percpu_ref_put(&mddev->writes_pending);
@@ -5519,22 +5635,6 @@
if (err)
return err;
}
- if (mddev->flush_pool == NULL) {
- mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
- flush_info_free, mddev);
- if (!mddev->flush_pool) {
- err = -ENOMEM;
- goto abort;
- }
- }
- if (mddev->flush_bio_pool == NULL) {
- mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
- flush_bio_free, mddev);
- if (!mddev->flush_bio_pool) {
- err = -ENOMEM;
- goto abort;
- }
- }
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
@@ -5623,15 +5723,28 @@
mddev->bitmap = bitmap;
}
- if (err) {
- mddev_detach(mddev);
- if (mddev->private)
- pers->free(mddev, mddev->private);
- mddev->private = NULL;
- module_put(pers->owner);
- md_bitmap_destroy(mddev);
- goto abort;
+ if (err)
+ goto bitmap_abort;
+
+ if (mddev->bitmap_info.max_write_behind > 0) {
+ bool creat_pool = false;
+
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(WriteMostly, &rdev->flags) &&
+ rdev_init_wb(rdev))
+ creat_pool = true;
+ }
+ if (creat_pool && mddev->wb_info_pool == NULL) {
+ mddev->wb_info_pool =
+ mempool_create_kmalloc_pool(NR_WB_INFOS,
+ sizeof(struct wb_info));
+ if (!mddev->wb_info_pool) {
+ err = -ENOMEM;
+ goto bitmap_abort;
+ }
+ }
}
+
if (mddev->queue) {
bool nonrot = true;
@@ -5674,8 +5787,7 @@
spin_unlock(&mddev->lock);
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
- if (sysfs_link_rdev(mddev, rdev))
- /* failure here is OK */;
+ sysfs_link_rdev(mddev, rdev); /* failure here is OK */
if (mddev->degraded && !mddev->ro)
/* This ensures that recovering status is reported immediately
@@ -5688,21 +5800,18 @@
md_update_sb(mddev, 0);
md_new_event(mddev);
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
+bitmap_abort:
+ mddev_detach(mddev);
+ if (mddev->private)
+ pers->free(mddev, mddev->private);
+ mddev->private = NULL;
+ module_put(pers->owner);
+ md_bitmap_destroy(mddev);
abort:
- if (mddev->flush_bio_pool) {
- mempool_destroy(mddev->flush_bio_pool);
- mddev->flush_bio_pool = NULL;
- }
- if (mddev->flush_pool){
- mempool_destroy(mddev->flush_pool);
- mddev->flush_pool = NULL;
- }
-
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -5711,6 +5820,7 @@
{
int err;
+ set_bit(MD_NOT_READY, &mddev->flags);
err = md_run(mddev);
if (err)
goto out;
@@ -5731,9 +5841,14 @@
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
+ clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
out:
+ clear_bit(MD_NOT_READY, &mddev->flags);
return err;
}
@@ -5868,6 +5983,8 @@
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
}
void md_stop_writes(struct mddev *mddev)
@@ -5906,14 +6023,6 @@
mddev->to_remove = &md_redundancy_group;
module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- if (mddev->flush_bio_pool) {
- mempool_destroy(mddev->flush_bio_pool);
- mddev->flush_bio_pool = NULL;
- }
- if (mddev->flush_pool) {
- mempool_destroy(mddev->flush_pool);
- mddev->flush_pool = NULL;
- }
}
void md_stop(struct mddev *mddev)
@@ -6799,6 +6908,9 @@
mddev->external = 0;
mddev->layout = info->layout;
+ if (mddev->level == 0)
+ /* Cannot trust RAID0 layout info here */
+ mddev->layout = -1;
mddev->chunk_sectors = info->chunk_size >> 9;
if (mddev->persistent) {
@@ -7657,9 +7769,9 @@
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
- unsigned long dt, db;
- sector_t rt;
- int scale;
+ unsigned long dt, db = 0;
+ sector_t rt, curr_mark_cnt, resync_mark_cnt;
+ int scale, recovery_active;
unsigned int per_milli;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -7748,22 +7860,30 @@
* db: blocks written from mark until now
* rt: remaining time
*
- * rt is a sector_t, so could be 32bit or 64bit.
- * So we divide before multiply in case it is 32bit and close
- * to the limit.
- * We scale the divisor (db) by 32 to avoid losing precision
- * near the end of resync when the number of remaining sectors
- * is close to 'db'.
- * We then divide rt by 32 after multiplying by db to compensate.
- * The '+1' avoids division by zero if db is very small.
+ * rt is a sector_t, which is always 64bit now. We are keeping
+ * the original algorithm, but it is not really necessary.
+ *
+ * Original algorithm:
+ * So we divide before multiply in case it is 32bit and close
+ * to the limit.
+ * We scale the divisor (db) by 32 to avoid losing precision
+ * near the end of resync when the number of remaining sectors
+ * is close to 'db'.
+ * We then divide rt by 32 after multiplying by db to compensate.
+ * The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
- db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
- - mddev->resync_mark_cnt;
+
+ curr_mark_cnt = mddev->curr_mark_cnt;
+ recovery_active = atomic_read(&mddev->recovery_active);
+ resync_mark_cnt = mddev->resync_mark_cnt;
+
+ if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
+ db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
rt = max_sectors - resync; /* number of remaining sectors */
- sector_div(rt, db/32+1);
+ rt = div64_u64(rt, db/32+1);
rt *= dt;
rt >>= 5;
@@ -8240,8 +8360,7 @@
{
struct mddev *mddev = thread->mddev;
struct mddev *mddev2;
- unsigned int currspeed = 0,
- window;
+ unsigned int currspeed = 0, window;
sector_t max_sectors,j, io_sectors, recovery_done;
unsigned long mark[SYNC_MARKS];
unsigned long update_time;
@@ -8298,7 +8417,7 @@
* 0 == not engaged in resync at all
* 2 == checking that there is no conflict with another sync
* 1 == like 2, but have yielded to allow conflicting resync to
- * commense
+ * commence
* other == active in resync - this many blocks
*
* Before starting a resync we must have set curr_resync to
@@ -8372,9 +8491,17 @@
else if (!mddev->bitmap)
j = mddev->recovery_cp;
- } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
max_sectors = mddev->resync_max_sectors;
- else {
+ /*
+ * If the original node aborts reshaping then we continue the
+ * reshaping, so set j again to avoid restart reshape from the
+ * first beginning
+ */
+ if (mddev_is_clustered(mddev) &&
+ mddev->reshape_position != MaxSector)
+ j = mddev->reshape_position;
+ } else {
/* recovery follows the physical size of devices */
max_sectors = mddev->dev_sectors;
j = MaxSector;
@@ -8421,7 +8548,7 @@
/*
* Tune reconstruction:
*/
- window = 32*(PAGE_SIZE/512);
+ window = 32 * (PAGE_SIZE / 512);
pr_debug("md: using %dk window, over a total of %lluk.\n",
window/2, (unsigned long long)max_sectors/2);
@@ -8625,8 +8752,10 @@
mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev);
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ if (!mddev_is_clustered(mddev)) {
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ }
}
spin_lock(&mddev->lock);
@@ -8792,6 +8921,18 @@
*/
void md_check_recovery(struct mddev *mddev)
{
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
+ /* Write superblock - thread that called mddev_suspend()
+ * holds reconfig_mutex for us.
+ */
+ set_bit(MD_UPDATING_SB, &mddev->flags);
+ smp_mb__after_atomic();
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
+ md_update_sb(mddev, 0);
+ clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ }
+
if (mddev->suspended)
return;
@@ -8821,6 +8962,7 @@
if (mddev_trylock(mddev)) {
int spares = 0;
+ bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
@@ -8866,7 +9008,7 @@
}
}
- if (!mddev->external && !mddev->in_sync) {
+ if (try_set_sync && !mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
set_in_sync(mddev);
spin_unlock(&mddev->lock);
@@ -8951,16 +9093,6 @@
unlock:
wake_up(&mddev->sb_wait);
mddev_unlock(mddev);
- } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
- /* Write superblock - thread that called mddev_suspend()
- * holds reconfig_mutex for us.
- */
- set_bit(MD_UPDATING_SB, &mddev->flags);
- smp_mb__after_atomic();
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
- md_update_sb(mddev, 0);
- clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
- wake_up(&mddev->sb_wait);
}
}
EXPORT_SYMBOL(md_check_recovery);
@@ -8968,11 +9100,14 @@
void md_reap_sync_thread(struct mddev *mddev)
{
struct md_rdev *rdev;
+ sector_t old_dev_sectors = mddev->dev_sectors;
+ bool is_reshaped = false;
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ mddev->degraded != mddev->raid_disks) {
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
@@ -8982,8 +9117,11 @@
}
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape) {
mddev->pers->finish_reshape(mddev);
+ if (mddev_is_clustered(mddev))
+ is_reshaped = true;
+ }
/* If array is no-longer degraded, then any saved_raid_disk
* information must be scrapped.
@@ -9004,6 +9142,14 @@
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ /*
+ * We call md_cluster_ops->update_size here because sync_size could
+ * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
+ * so it is time to update size across cluster.
+ */
+ if (mddev_is_clustered(mddev) && is_reshaped
+ && !test_bit(MD_CLOSING, &mddev->flags))
+ md_cluster_ops->update_size(mddev, old_dev_sectors);
wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -9203,8 +9349,12 @@
}
if (role != rdev2->raid_disk) {
- /* got activated */
- if (rdev2->raid_disk == -1 && role != 0xffff) {
+ /*
+ * got activated except reshape is happening.
+ */
+ if (rdev2->raid_disk == -1 && role != 0xffff &&
+ !(le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RESHAPE_ACTIVE)) {
rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %s\n",
@@ -9213,7 +9363,6 @@
* perform resync with the new activated disk */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
-
}
/* device faulty
* We just want to do the minimum to mark the disk
@@ -9230,6 +9379,30 @@
if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+ /*
+ * Since mddev->delta_disks has already updated in update_raid_disks,
+ * so it is time to check reshape.
+ */
+ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+ (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+ /*
+ * reshape is happening in the remote node, we need to
+ * update reshape_position and call start_reshape.
+ */
+ mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+ if (mddev->pers->update_reshape_pos)
+ mddev->pers->update_reshape_pos(mddev);
+ if (mddev->pers->start_reshape)
+ mddev->pers->start_reshape(mddev);
+ } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+ mddev->reshape_position != MaxSector &&
+ !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+ /* reshape is just done in another node. */
+ mddev->reshape_position = MaxSector;
+ if (mddev->pers->update_reshape_pos)
+ mddev->pers->update_reshape_pos(mddev);
+ }
+
/* Finally set the event to be up to date */
mddev->events = le64_to_cpu(sb->events);
}