Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index aa98953..7dd6e98 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -565,6 +565,7 @@
select BLK_DEV_INTEGRITY
select DM_BUFIO
select CRYPTO
+ select CRYPTO_SKCIPHER
select ASYNC_XOR
---help---
This device-mapper target emulates a block device that has
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 6f77682..a1df0d9 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -377,7 +377,10 @@
if (!fifo_full(&ca->free_inc))
goto retry_invalidate;
- bch_prio_write(ca);
+ if (bch_prio_write(ca, false) < 0) {
+ ca->invalidate_needs_gc = 1;
+ wake_up_gc(ca->set);
+ }
}
}
out:
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 013e35a..36de6f7 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -264,7 +264,7 @@
#define BCACHE_DEV_UNLINK_DONE 2
#define BCACHE_DEV_WB_RUNNING 3
#define BCACHE_DEV_RATE_DW_RUNNING 4
- unsigned int nr_stripes;
+ int nr_stripes;
unsigned int stripe_size;
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
@@ -329,6 +329,9 @@
*/
atomic_t has_dirty;
+#define BCH_CACHE_READA_ALL 0
+#define BCH_CACHE_READA_META_ONLY 1
+ unsigned int cache_readahead_policy;
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
@@ -582,6 +585,7 @@
*/
wait_queue_head_t btree_cache_wait;
struct task_struct *btree_cache_alloc_lock;
+ spinlock_t btree_cannibalize_lock;
/*
* When we free a btree node, we increment the gen of the bucket the
@@ -977,11 +981,12 @@
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
-void bch_prio_write(struct cache *ca);
+int bch_prio_write(struct cache *ca, bool wait);
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
extern struct workqueue_struct *bcache_wq;
extern struct workqueue_struct *bch_journal_wq;
+extern struct workqueue_struct *bch_flush_wq;
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
@@ -1023,5 +1028,7 @@
void bch_debug_init(void);
void bch_request_exit(void);
int bch_request_init(void);
+void bch_btree_exit(void);
+int bch_btree_init(void);
#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 0876879..fda68c0 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -321,7 +321,7 @@
b->page_order = page_order;
- t->data = (void *) __get_free_pages(gfp, b->page_order);
+ t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order);
if (!t->data)
goto err;
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index c71365e..a50dcfd 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -397,7 +397,8 @@
/* Bkey utility code */
-#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
+#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, \
+ (unsigned int)(i)->keys)
static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx)
{
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ba434d9..5a33910 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -99,6 +99,8 @@
#define PTR_HASH(c, k) \
(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
+static struct workqueue_struct *btree_io_wq;
+
#define insert_lock(s, b) ((b)->level <= (s)->lock)
/*
@@ -366,7 +368,7 @@
btree_complete_write(b, w);
if (btree_node_dirty(b))
- schedule_delayed_work(&b->work, 30 * HZ);
+ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
closure_return_with_destructor(cl, btree_node_write_unlock);
}
@@ -539,7 +541,7 @@
BUG_ON(!i->keys);
if (!btree_node_dirty(b))
- schedule_delayed_work(&b->work, 30 * HZ);
+ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
set_btree_node_dirty(b);
@@ -723,6 +725,8 @@
* IO can always make forward progress:
*/
nr /= c->btree_pages;
+ if (nr == 0)
+ nr = 1;
nr = min_t(unsigned long, nr, mca_can_free(c));
i = 0;
@@ -838,7 +842,7 @@
mutex_init(&c->verify_lock);
c->verify_ondisk = (void *)
- __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
+ __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(bucket_pages(c)));
c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
@@ -884,15 +888,17 @@
static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
- struct task_struct *old;
-
- old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
- if (old && old != current) {
+ spin_lock(&c->btree_cannibalize_lock);
+ if (likely(c->btree_cache_alloc_lock == NULL)) {
+ c->btree_cache_alloc_lock = current;
+ } else if (c->btree_cache_alloc_lock != current) {
if (op)
prepare_to_wait(&c->btree_cache_wait, &op->wait,
TASK_UNINTERRUPTIBLE);
+ spin_unlock(&c->btree_cannibalize_lock);
return -EINTR;
}
+ spin_unlock(&c->btree_cannibalize_lock);
return 0;
}
@@ -927,10 +933,12 @@
*/
static void bch_cannibalize_unlock(struct cache_set *c)
{
+ spin_lock(&c->btree_cannibalize_lock);
if (c->btree_cache_alloc_lock == current) {
c->btree_cache_alloc_lock = NULL;
wake_up(&c->btree_cache_wait);
}
+ spin_unlock(&c->btree_cannibalize_lock);
}
static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
@@ -1440,7 +1448,7 @@
if (__set_blocks(n1, n1->keys + n2->keys,
block_bytes(b->c)) >
btree_blocks(new_nodes[i]))
- goto out_nocoalesce;
+ goto out_unlock_nocoalesce;
keys = n2->keys;
/* Take the key of the node we're getting rid of */
@@ -1469,7 +1477,7 @@
if (__bch_keylist_realloc(&keylist,
bkey_u64s(&new_nodes[i]->key)))
- goto out_nocoalesce;
+ goto out_unlock_nocoalesce;
bch_btree_node_write(new_nodes[i], &cl);
bch_keylist_add(&keylist, &new_nodes[i]->key);
@@ -1515,6 +1523,10 @@
/* Invalidated our iterator */
return -EINTR;
+out_unlock_nocoalesce:
+ for (i = 0; i < nodes; i++)
+ mutex_unlock(&new_nodes[i]->write_lock);
+
out_nocoalesce:
closure_sync(&cl);
@@ -2649,3 +2661,18 @@
spin_lock_init(&buf->lock);
array_allocator_init(&buf->freelist);
}
+
+void bch_btree_exit(void)
+{
+ if (btree_io_wq)
+ destroy_workqueue(btree_io_wq);
+}
+
+int __init bch_btree_init(void)
+{
+ btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
+ if (!btree_io_wq)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index be2a2a2..b4fd923 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -417,10 +417,15 @@
/* Journalling */
+#define nr_to_fifo_front(p, front_p, mask) (((p) - (front_p)) & (mask))
+
static void btree_flush_write(struct cache_set *c)
{
struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
- unsigned int i, n;
+ unsigned int i, nr;
+ int ref_nr;
+ atomic_t *fifo_front_p, *now_fifo_front_p;
+ size_t mask;
if (c->journal.btree_flushing)
return;
@@ -433,12 +438,50 @@
c->journal.btree_flushing = true;
spin_unlock(&c->journal.flush_write_lock);
+ /* get the oldest journal entry and check its refcount */
+ spin_lock(&c->journal.lock);
+ fifo_front_p = &fifo_front(&c->journal.pin);
+ ref_nr = atomic_read(fifo_front_p);
+ if (ref_nr <= 0) {
+ /*
+ * do nothing if no btree node references
+ * the oldest journal entry
+ */
+ spin_unlock(&c->journal.lock);
+ goto out;
+ }
+ spin_unlock(&c->journal.lock);
+
+ mask = c->journal.pin.mask;
+ nr = 0;
atomic_long_inc(&c->flush_write);
memset(btree_nodes, 0, sizeof(btree_nodes));
- n = 0;
mutex_lock(&c->bucket_lock);
list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+ /*
+ * It is safe to get now_fifo_front_p without holding
+ * c->journal.lock here, because we don't need to know
+ * the exactly accurate value, just check whether the
+ * front pointer of c->journal.pin is changed.
+ */
+ now_fifo_front_p = &fifo_front(&c->journal.pin);
+ /*
+ * If the oldest journal entry is reclaimed and front
+ * pointer of c->journal.pin changes, it is unnecessary
+ * to scan c->btree_cache anymore, just quit the loop and
+ * flush out what we have already.
+ */
+ if (now_fifo_front_p != fifo_front_p)
+ break;
+ /*
+ * quit this loop if all matching btree nodes are
+ * scanned and record in btree_nodes[] already.
+ */
+ ref_nr = atomic_read(fifo_front_p);
+ if (nr >= ref_nr)
+ break;
+
if (btree_node_journal_flush(b))
pr_err("BUG: flush_write bit should not be set here!");
@@ -454,17 +497,44 @@
continue;
}
+ /*
+ * Only select the btree node which exactly references
+ * the oldest journal entry.
+ *
+ * If the journal entry pointed by fifo_front_p is
+ * reclaimed in parallel, don't worry:
+ * - the list_for_each_xxx loop will quit when checking
+ * next now_fifo_front_p.
+ * - If there are matched nodes recorded in btree_nodes[],
+ * they are clean now (this is why and how the oldest
+ * journal entry can be reclaimed). These selected nodes
+ * will be ignored and skipped in the folowing for-loop.
+ */
+ if (nr_to_fifo_front(btree_current_write(b)->journal,
+ fifo_front_p,
+ mask) != 0) {
+ mutex_unlock(&b->write_lock);
+ continue;
+ }
+
set_btree_node_journal_flush(b);
mutex_unlock(&b->write_lock);
- btree_nodes[n++] = b;
- if (n == BTREE_FLUSH_NR)
+ btree_nodes[nr++] = b;
+ /*
+ * To avoid holding c->bucket_lock too long time,
+ * only scan for BTREE_FLUSH_NR matched btree nodes
+ * at most. If there are more btree nodes reference
+ * the oldest journal entry, try to flush them next
+ * time when btree_flush_write() is called.
+ */
+ if (nr == BTREE_FLUSH_NR)
break;
}
mutex_unlock(&c->bucket_lock);
- for (i = 0; i < n; i++) {
+ for (i = 0; i < nr; i++) {
b = btree_nodes[i];
if (!b) {
pr_err("BUG: btree_nodes[%d] is NULL", i);
@@ -497,6 +567,7 @@
mutex_unlock(&b->write_lock);
}
+out:
spin_lock(&c->journal.flush_write_lock);
c->journal.btree_flushing = false;
spin_unlock(&c->journal.flush_write_lock);
@@ -887,8 +958,8 @@
journal_try_write(c);
} else if (!w->dirty) {
w->dirty = true;
- schedule_delayed_work(&c->journal.work,
- msecs_to_jiffies(c->journal_delay_ms));
+ queue_delayed_work(bch_flush_wq, &c->journal.work,
+ msecs_to_jiffies(c->journal_delay_ms));
spin_unlock(&c->journal.lock);
} else {
spin_unlock(&c->journal.lock);
@@ -931,8 +1002,8 @@
j->w[1].c = c;
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
- !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
+ !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) ||
+ !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)))
return -ENOMEM;
return 0;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 41adcd1..4045ae7 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -391,13 +391,20 @@
goto skip;
/*
- * Flag for bypass if the IO is for read-ahead or background,
- * unless the read-ahead request is for metadata
+ * If the bio is for read-ahead or background IO, bypass it or
+ * not depends on the following situations,
+ * - If the IO is for meta data, always cache it and no bypass
+ * - If the IO is not meta data, check dc->cache_reada_policy,
+ * BCH_CACHE_READA_ALL: cache it and not bypass
+ * BCH_CACHE_READA_META_ONLY: not cache it and bypass
+ * That is, read-ahead request for metadata always get cached
* (eg, for gfs2 or xfs).
*/
- if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
- !(bio->bi_opf & (REQ_META|REQ_PRIO)))
- goto skip;
+ if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) {
+ if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
+ (dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
+ goto skip;
+ }
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
bio_sectors(bio) & (c->sb.block_size - 1)) {
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index ba1c937..503aafe 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -109,9 +109,13 @@
void bch_cache_accounting_clear(struct cache_accounting *acc)
{
- memset(&acc->total.cache_hits,
- 0,
- sizeof(struct cache_stats));
+ acc->total.cache_hits = 0;
+ acc->total.cache_misses = 0;
+ acc->total.cache_bypass_hits = 0;
+ acc->total.cache_bypass_misses = 0;
+ acc->total.cache_readaheads = 0;
+ acc->total.cache_miss_collisions = 0;
+ acc->total.sectors_bypassed = 0;
}
void bch_cache_accounting_destroy(struct cache_accounting *acc)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 20ed838..efdf6ce 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -48,6 +48,7 @@
static DEFINE_IDA(bcache_device_idx);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
+struct workqueue_struct *bch_flush_wq;
struct workqueue_struct *bch_journal_wq;
@@ -529,12 +530,29 @@
closure_sync(cl);
}
-void bch_prio_write(struct cache *ca)
+int bch_prio_write(struct cache *ca, bool wait)
{
int i;
struct bucket *b;
struct closure cl;
+ pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
+ fifo_used(&ca->free[RESERVE_PRIO]),
+ fifo_used(&ca->free[RESERVE_NONE]),
+ fifo_used(&ca->free_inc));
+
+ /*
+ * Pre-check if there are enough free buckets. In the non-blocking
+ * scenario it's better to fail early rather than starting to allocate
+ * buckets and do a cleanup later in case of failure.
+ */
+ if (!wait) {
+ size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
+ fifo_used(&ca->free[RESERVE_NONE]);
+ if (prio_buckets(ca) > avail)
+ return -ENOMEM;
+ }
+
closure_init_stack(&cl);
lockdep_assert_held(&ca->set->bucket_lock);
@@ -544,9 +562,6 @@
atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
&ca->meta_sectors_written);
- //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
- // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
-
for (i = prio_buckets(ca) - 1; i >= 0; --i) {
long bucket;
struct prio_set *p = ca->disk_buckets;
@@ -564,7 +579,7 @@
p->magic = pset_magic(&ca->sb);
p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
- bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
+ bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
BUG_ON(bucket == -1);
mutex_unlock(&ca->set->bucket_lock);
@@ -593,6 +608,7 @@
ca->prio_last_buckets[i] = ca->prio_buckets[i];
}
+ return 0;
}
static void prio_read(struct cache *ca, uint64_t bucket)
@@ -761,20 +777,31 @@
static void bcache_device_free(struct bcache_device *d)
{
+ struct gendisk *disk = d->disk;
+
lockdep_assert_held(&bch_register_lock);
- pr_info("%s stopped", d->disk->disk_name);
+ if (disk)
+ pr_info("%s stopped", disk->disk_name);
+ else
+ pr_err("bcache device (NULL gendisk) stopped");
if (d->c)
bcache_device_detach(d);
- if (d->disk && d->disk->flags & GENHD_FL_UP)
- del_gendisk(d->disk);
- if (d->disk && d->disk->queue)
- blk_cleanup_queue(d->disk->queue);
- if (d->disk) {
+
+ if (disk) {
+ bool disk_added = (disk->flags & GENHD_FL_UP) != 0;
+
+ if (disk_added)
+ del_gendisk(disk);
+
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+
ida_simple_remove(&bcache_device_idx,
- first_minor_to_idx(d->disk->first_minor));
- put_disk(d->disk);
+ first_minor_to_idx(disk->first_minor));
+ if (disk_added)
+ put_disk(disk);
}
bioset_exit(&d->bio_split);
@@ -790,19 +817,19 @@
struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
- size_t n;
+ uint64_t n;
int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
- d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
-
- if (!d->nr_stripes || d->nr_stripes > max_stripes) {
- pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
- (unsigned int)d->nr_stripes);
+ n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+ if (!n || n > max_stripes) {
+ pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
+ n);
return -ENOMEM;
}
+ d->nr_stripes = n;
n = d->nr_stripes * sizeof(atomic_t);
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
@@ -812,20 +839,20 @@
n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
if (!d->full_dirty_stripes)
- return -ENOMEM;
+ goto out_free_stripe_sectors_dirty;
idx = ida_simple_get(&bcache_device_idx, 0,
BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
if (idx < 0)
- return idx;
+ goto out_free_full_dirty_stripes;
if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
- goto err;
+ goto out_ida_remove;
d->disk = alloc_disk(BCACHE_MINORS);
if (!d->disk)
- goto err;
+ goto out_bioset_exit;
set_capacity(d->disk, sectors);
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
@@ -860,8 +887,14 @@
return 0;
-err:
+out_bioset_exit:
+ bioset_exit(&d->bio_split);
+out_ida_remove:
ida_simple_remove(&bcache_device_idx, idx);
+out_free_full_dirty_stripes:
+ kvfree(d->full_dirty_stripes);
+out_free_stripe_sectors_dirty:
+ kvfree(d->stripe_sectors_dirty);
return -ENOMEM;
}
@@ -1251,6 +1284,9 @@
mutex_unlock(&bch_register_lock);
+ if (dc->sb_bio.bi_inline_vecs[0].bv_page)
+ put_page(bio_first_page_all(&dc->sb_bio));
+
if (!IS_ERR_OR_NULL(dc->bdev))
blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1725,7 +1761,7 @@
}
#define alloc_bucket_pages(gfp, c) \
- ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
+ ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c))))
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
@@ -1769,6 +1805,7 @@
sema_init(&c->sb_write_mutex, 1);
mutex_init(&c->bucket_lock);
init_waitqueue_head(&c->btree_cache_wait);
+ spin_lock_init(&c->btree_cannibalize_lock);
init_waitqueue_head(&c->bucket_wait);
init_waitqueue_head(&c->gc_wait);
sema_init(&c->uuid_write_mutex, 1);
@@ -1954,7 +1991,7 @@
mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i)
- bch_prio_write(ca);
+ bch_prio_write(ca, true);
mutex_unlock(&c->bucket_lock);
err = "cannot allocate new UUID bucket";
@@ -2062,7 +2099,14 @@
sysfs_create_link(&c->kobj, &ca->kobj, buf))
goto err;
- if (ca->sb.seq > c->sb.seq) {
+ /*
+ * A special case is both ca->sb.seq and c->sb.seq are 0,
+ * such condition happens on a new created cache device whose
+ * super block is never flushed yet. In this case c->sb.version
+ * and other members should be updated too, otherwise we will
+ * have a mistaken super block version in cache set.
+ */
+ if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) {
c->sb.version = ca->sb.version;
memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
c->sb.flags = ca->sb.flags;
@@ -2346,29 +2390,35 @@
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
- ssize_t ret = -EINVAL;
- const char *err = "cannot allocate memory";
+ const char *err;
char *path = NULL;
- struct cache_sb *sb = NULL;
+ struct cache_sb *sb;
struct block_device *bdev = NULL;
- struct page *sb_page = NULL;
+ struct page *sb_page;
+ ssize_t ret;
+ ret = -EBUSY;
+ err = "failed to reference bcache module";
if (!try_module_get(THIS_MODULE))
- return -EBUSY;
+ goto out;
/* For latest state of bcache_is_reboot */
smp_mb();
+ err = "bcache is in reboot";
if (bcache_is_reboot)
- return -EBUSY;
+ goto out_module_put;
+ ret = -ENOMEM;
+ err = "cannot allocate memory";
path = kstrndup(buffer, size, GFP_KERNEL);
if (!path)
- goto err;
+ goto out_module_put;
sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
if (!sb)
- goto err;
+ goto out_free_path;
+ ret = -EINVAL;
err = "failed to open device";
bdev = blkdev_get_by_path(strim(path),
FMODE_READ|FMODE_WRITE|FMODE_EXCL,
@@ -2385,57 +2435,69 @@
if (!IS_ERR(bdev))
bdput(bdev);
if (attr == &ksysfs_register_quiet)
- goto quiet_out;
+ goto done;
}
- goto err;
+ goto out_free_sb;
}
err = "failed to set blocksize";
if (set_blocksize(bdev, 4096))
- goto err_close;
+ goto out_blkdev_put;
err = read_super(sb, bdev, &sb_page);
if (err)
- goto err_close;
+ goto out_blkdev_put;
err = "failed to register device";
if (SB_IS_BDEV(sb)) {
struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
if (!dc)
- goto err_close;
+ goto out_put_sb_page;
mutex_lock(&bch_register_lock);
ret = register_bdev(sb, sb_page, bdev, dc);
mutex_unlock(&bch_register_lock);
/* blkdev_put() will be called in cached_dev_free() */
- if (ret < 0)
- goto err;
+ if (ret < 0) {
+ bdev = NULL;
+ goto out_put_sb_page;
+ }
} else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
- goto err_close;
+ goto out_put_sb_page;
/* blkdev_put() will be called in bch_cache_release() */
- if (register_cache(sb, sb_page, bdev, ca) != 0)
- goto err;
+ if (register_cache(sb, sb_page, bdev, ca) != 0) {
+ bdev = NULL;
+ goto out_put_sb_page;
+ }
}
-quiet_out:
- ret = size;
-out:
- if (sb_page)
- put_page(sb_page);
+
+ put_page(sb_page);
+done:
kfree(sb);
kfree(path);
module_put(THIS_MODULE);
- return ret;
+ return size;
-err_close:
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-err:
- pr_info("error %s: %s", path, err);
- goto out;
+out_put_sb_page:
+ put_page(sb_page);
+out_blkdev_put:
+ if (bdev)
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+out_free_sb:
+ kfree(sb);
+out_free_path:
+ kfree(path);
+ path = NULL;
+out_module_put:
+ module_put(THIS_MODULE);
+out:
+ pr_info("error %s: %s", path?path:"", err);
+ return ret;
}
@@ -2597,6 +2659,9 @@
destroy_workqueue(bcache_wq);
if (bch_journal_wq)
destroy_workqueue(bch_journal_wq);
+ if (bch_flush_wq)
+ destroy_workqueue(bch_flush_wq);
+ bch_btree_exit();
if (bcache_major)
unregister_blkdev(bcache_major, "bcache");
@@ -2652,10 +2717,26 @@
return bcache_major;
}
+ if (bch_btree_init())
+ goto err;
+
bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
if (!bcache_wq)
goto err;
+ /*
+ * Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
+ *
+ * 1. It used `system_wq` before which also does no memory reclaim.
+ * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
+ * reduced throughput can be observed.
+ *
+ * We still want to user our own queue to not congest the `system_wq`.
+ */
+ bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
+ if (!bch_flush_wq)
+ goto err;
+
bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
if (!bch_journal_wq)
goto err;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 627dcea..7f0fb4b 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -27,6 +27,12 @@
NULL
};
+static const char * const bch_reada_cache_policies[] = {
+ "all",
+ "meta-only",
+ NULL
+};
+
/* Default is 0 ("auto") */
static const char * const bch_stop_on_failure_modes[] = {
"auto",
@@ -100,6 +106,7 @@
rw_attribute(sequential_cutoff);
rw_attribute(data_csum);
rw_attribute(cache_mode);
+rw_attribute(readahead_cache_policy);
rw_attribute(stop_when_cache_set_failed);
rw_attribute(writeback_metadata);
rw_attribute(writeback_running);
@@ -167,6 +174,11 @@
bch_cache_modes,
BDEV_CACHE_MODE(&dc->sb));
+ if (attr == &sysfs_readahead_cache_policy)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_reada_cache_policies,
+ dc->cache_readahead_policy);
+
if (attr == &sysfs_stop_when_cache_set_failed)
return bch_snprint_string_list(buf, PAGE_SIZE,
bch_stop_on_failure_modes,
@@ -352,6 +364,15 @@
}
}
+ if (attr == &sysfs_readahead_cache_policy) {
+ v = __sysfs_match_string(bch_reada_cache_policies, -1, buf);
+ if (v < 0)
+ return v;
+
+ if ((unsigned int) v != dc->cache_readahead_policy)
+ dc->cache_readahead_policy = v;
+ }
+
if (attr == &sysfs_stop_when_cache_set_failed) {
v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
if (v < 0)
@@ -466,6 +487,7 @@
&sysfs_data_csum,
#endif
&sysfs_cache_mode,
+ &sysfs_readahead_cache_policy,
&sysfs_stop_when_cache_set_failed,
&sysfs_writeback_metadata,
&sysfs_writeback_running,
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d60268f..0b02210 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -519,15 +519,19 @@
uint64_t offset, int nr_sectors)
{
struct bcache_device *d = c->devices[inode];
- unsigned int stripe_offset, stripe, sectors_dirty;
+ unsigned int stripe_offset, sectors_dirty;
+ int stripe;
if (!d)
return;
+ stripe = offset_to_stripe(d, offset);
+ if (stripe < 0)
+ return;
+
if (UUID_FLASH_ONLY(&c->uuids[inode]))
atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
- stripe = offset_to_stripe(d, offset);
stripe_offset = offset & (d->stripe_size - 1);
while (nr_sectors) {
@@ -567,12 +571,12 @@
static void refill_full_stripes(struct cached_dev *dc)
{
struct keybuf *buf = &dc->writeback_keys;
- unsigned int start_stripe, stripe, next_stripe;
+ unsigned int start_stripe, next_stripe;
+ int stripe;
bool wrapped = false;
stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
-
- if (stripe >= dc->disk.nr_stripes)
+ if (stripe < 0)
stripe = 0;
start_stripe = stripe;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 4e4c681..c4ff760 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -33,10 +33,22 @@
return ret;
}
-static inline unsigned int offset_to_stripe(struct bcache_device *d,
+static inline int offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
do_div(offset, d->stripe_size);
+
+ /* d->nr_stripes is in range [1, INT_MAX] */
+ if (unlikely(offset >= d->nr_stripes)) {
+ pr_err("Invalid stripe %llu (>= nr_stripes %d).\n",
+ offset, d->nr_stripes);
+ return -EINVAL;
+ }
+
+ /*
+ * Here offset is definitly smaller than INT_MAX,
+ * return it as int will never overflow.
+ */
return offset;
}
@@ -44,7 +56,10 @@
uint64_t offset,
unsigned int nr_sectors)
{
- unsigned int stripe = offset_to_stripe(&dc->disk, offset);
+ int stripe = offset_to_stripe(&dc->disk, offset);
+
+ if (stripe < 0)
+ return false;
while (1) {
if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index c82578a..2ea0360 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -20,8 +20,13 @@
struct dm_bio_details {
struct gendisk *bi_disk;
u8 bi_partno;
+ int __bi_remaining;
unsigned long bi_flags;
struct bvec_iter bi_iter;
+ bio_end_io_t *bi_end_io;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+ struct bio_integrity_payload *bi_integrity;
+#endif
};
static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
@@ -30,6 +35,11 @@
bd->bi_partno = bio->bi_partno;
bd->bi_flags = bio->bi_flags;
bd->bi_iter = bio->bi_iter;
+ bd->__bi_remaining = atomic_read(&bio->__bi_remaining);
+ bd->bi_end_io = bio->bi_end_io;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+ bd->bi_integrity = bio_integrity(bio);
+#endif
}
static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
@@ -38,6 +48,11 @@
bio->bi_partno = bd->bi_partno;
bio->bi_flags = bd->bi_flags;
bio->bi_iter = bd->bi_iter;
+ atomic_set(&bio->__bi_remaining, bd->__bi_remaining);
+ bio->bi_end_io = bd->bi_end_io;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+ bio->bi_integrity = bd->bi_integrity;
+#endif
}
#endif
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 2d519c2..e8c37d9 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1438,6 +1438,10 @@
sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT;
+ if (s >= c->start)
+ s -= c->start;
+ else
+ s = 0;
if (likely(c->sectors_per_block_bits >= 0))
s >>= c->sectors_per_block_bits;
else
@@ -1446,6 +1450,12 @@
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
+struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
+{
+ return c->dm_io;
+}
+EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
+
sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
return b->block;
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 151aa95..af6d4f8 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -537,12 +537,16 @@
CACHE_MAX_CONCURRENT_LOCKS);
if (IS_ERR(cmd->bm)) {
DMERR("could not create block manager");
- return PTR_ERR(cmd->bm);
+ r = PTR_ERR(cmd->bm);
+ cmd->bm = NULL;
+ return r;
}
r = __open_or_format_metadata(cmd, may_format_device);
- if (r)
+ if (r) {
dm_block_manager_destroy(cmd->bm);
+ cmd->bm = NULL;
+ }
return r;
}
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 8346e6d..f595e98 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2867,8 +2867,8 @@
prevent_background_work(cache);
BUG_ON(atomic_read(&cache->nr_io_migrations));
- cancel_delayed_work(&cache->waker);
- flush_workqueue(cache->wq);
+ cancel_delayed_work_sync(&cache->waker);
+ drain_workqueue(cache->wq);
WARN_ON(cache->tracker.in_flight);
/*
diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index 6bc8c1d..1771245 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -67,23 +67,34 @@
* To save constantly doing look ups on disk we keep an in core copy of the
* on-disk bitmap, the region_map.
*
- * To further reduce metadata I/O overhead we use a second bitmap, the dmap
- * (dirty bitmap), which tracks the dirty words, i.e. longs, of the region_map.
+ * In order to track which regions are hydrated during a metadata transaction,
+ * we use a second set of bitmaps, the dmap (dirty bitmap), which includes two
+ * bitmaps, namely dirty_regions and dirty_words. The dirty_regions bitmap
+ * tracks the regions that got hydrated during the current metadata
+ * transaction. The dirty_words bitmap tracks the dirty words, i.e. longs, of
+ * the dirty_regions bitmap.
+ *
+ * This allows us to precisely track the regions that were hydrated during the
+ * current metadata transaction and update the metadata accordingly, when we
+ * commit the current transaction. This is important because dm-clone should
+ * only commit the metadata of regions that were properly flushed to the
+ * destination device beforehand. Otherwise, in case of a crash, we could end
+ * up with a corrupted dm-clone device.
*
* When a region finishes hydrating dm-clone calls
* dm_clone_set_region_hydrated(), or for discard requests
* dm_clone_cond_set_range(), which sets the corresponding bits in region_map
* and dmap.
*
- * During a metadata commit we scan the dmap for dirty region_map words (longs)
- * and update accordingly the on-disk metadata. Thus, we don't have to flush to
- * disk the whole region_map. We can just flush the dirty region_map words.
+ * During a metadata commit we scan dmap->dirty_words and dmap->dirty_regions
+ * and update the on-disk metadata accordingly. Thus, we don't have to flush to
+ * disk the whole region_map. We can just flush the dirty region_map bits.
*
- * We use a dirty bitmap, which is smaller than the original region_map, to
- * reduce the amount of memory accesses during a metadata commit. As dm-bitset
- * accesses the on-disk bitmap in 64-bit word granularity, there is no
- * significant benefit in tracking the dirty region_map bits with a smaller
- * granularity.
+ * We use the helper dmap->dirty_words bitmap, which is smaller than the
+ * original region_map, to reduce the amount of memory accesses during a
+ * metadata commit. Moreover, as dm-bitset also accesses the on-disk bitmap in
+ * 64-bit word granularity, the dirty_words bitmap helps us avoid useless disk
+ * accesses.
*
* We could update directly the on-disk bitmap, when dm-clone calls either
* dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), buts this
@@ -92,12 +103,13 @@
* e.g., in a hooked overwrite bio's completion routine, and further reduce the
* I/O completion latency.
*
- * We maintain two dirty bitmaps. During a metadata commit we atomically swap
- * the currently used dmap with the unused one. This allows the metadata update
- * functions to run concurrently with an ongoing commit.
+ * We maintain two dirty bitmap sets. During a metadata commit we atomically
+ * swap the currently used dmap with the unused one. This allows the metadata
+ * update functions to run concurrently with an ongoing commit.
*/
struct dirty_map {
unsigned long *dirty_words;
+ unsigned long *dirty_regions;
unsigned int changed;
};
@@ -115,6 +127,9 @@
struct dirty_map dmap[2];
struct dirty_map *current_dmap;
+ /* Protected by lock */
+ struct dirty_map *committing_dmap;
+
/*
* In core copy of the on-disk bitmap to save constantly doing look ups
* on disk.
@@ -461,34 +476,53 @@
return BITS_TO_LONGS(nr_bits) * sizeof(long);
}
+static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words,
+ unsigned long nr_regions)
+{
+ dmap->changed = 0;
+
+ dmap->dirty_words = kvzalloc(bitmap_size(nr_words), GFP_KERNEL);
+ if (!dmap->dirty_words)
+ return -ENOMEM;
+
+ dmap->dirty_regions = kvzalloc(bitmap_size(nr_regions), GFP_KERNEL);
+ if (!dmap->dirty_regions) {
+ kvfree(dmap->dirty_words);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __dirty_map_exit(struct dirty_map *dmap)
+{
+ kvfree(dmap->dirty_words);
+ kvfree(dmap->dirty_regions);
+}
+
static int dirty_map_init(struct dm_clone_metadata *cmd)
{
- cmd->dmap[0].changed = 0;
- cmd->dmap[0].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
-
- if (!cmd->dmap[0].dirty_words) {
+ if (__dirty_map_init(&cmd->dmap[0], cmd->nr_words, cmd->nr_regions)) {
DMERR("Failed to allocate dirty bitmap");
return -ENOMEM;
}
- cmd->dmap[1].changed = 0;
- cmd->dmap[1].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
-
- if (!cmd->dmap[1].dirty_words) {
+ if (__dirty_map_init(&cmd->dmap[1], cmd->nr_words, cmd->nr_regions)) {
DMERR("Failed to allocate dirty bitmap");
- kvfree(cmd->dmap[0].dirty_words);
+ __dirty_map_exit(&cmd->dmap[0]);
return -ENOMEM;
}
cmd->current_dmap = &cmd->dmap[0];
+ cmd->committing_dmap = NULL;
return 0;
}
static void dirty_map_exit(struct dm_clone_metadata *cmd)
{
- kvfree(cmd->dmap[0].dirty_words);
- kvfree(cmd->dmap[1].dirty_words);
+ __dirty_map_exit(&cmd->dmap[0]);
+ __dirty_map_exit(&cmd->dmap[1]);
}
static int __load_bitset_in_core(struct dm_clone_metadata *cmd)
@@ -622,7 +656,7 @@
return (bit >= (start + nr_regions));
}
-unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
+unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
{
return bitmap_weight(cmd->region_map, cmd->nr_regions);
}
@@ -633,21 +667,23 @@
return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
}
-static int __update_metadata_word(struct dm_clone_metadata *cmd, unsigned long word)
+static int __update_metadata_word(struct dm_clone_metadata *cmd,
+ unsigned long *dirty_regions,
+ unsigned long word)
{
int r;
unsigned long index = word * BITS_PER_LONG;
unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG);
while (index < max_index) {
- if (test_bit(index, cmd->region_map)) {
+ if (test_bit(index, dirty_regions)) {
r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root,
index, &cmd->bitset_root);
-
if (r) {
DMERR("dm_bitset_set_bit failed");
return r;
}
+ __clear_bit(index, dirty_regions);
}
index++;
}
@@ -712,7 +748,7 @@
static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
{
int r;
- unsigned long word, flags;
+ unsigned long word;
word = 0;
do {
@@ -721,7 +757,7 @@
if (word == cmd->nr_words)
break;
- r = __update_metadata_word(cmd, word);
+ r = __update_metadata_word(cmd, dmap->dirty_regions, word);
if (r)
return r;
@@ -736,23 +772,24 @@
return r;
/* Update the changed flag */
- spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ spin_lock_irq(&cmd->bitmap_lock);
dmap->changed = 0;
- spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+ spin_unlock_irq(&cmd->bitmap_lock);
return 0;
}
-int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
+int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd)
{
- int r = -EPERM;
- unsigned long flags;
+ int r = 0;
struct dirty_map *dmap, *next_dmap;
down_write(&cmd->lock);
- if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) {
+ r = -EPERM;
goto out;
+ }
/* Get current dirty bitmap */
dmap = cmd->current_dmap;
@@ -764,21 +801,43 @@
* The last commit failed, so we don't have a clean dirty-bitmap to
* use.
*/
- if (WARN_ON(next_dmap->changed)) {
+ if (WARN_ON(next_dmap->changed || cmd->committing_dmap)) {
r = -EINVAL;
goto out;
}
/* Swap dirty bitmaps */
- spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ spin_lock_irq(&cmd->bitmap_lock);
cmd->current_dmap = next_dmap;
- spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+ spin_unlock_irq(&cmd->bitmap_lock);
- /*
- * No one is accessing the old dirty bitmap anymore, so we can flush
- * it.
- */
- r = __flush_dmap(cmd, dmap);
+ /* Set old dirty bitmap as currently committing */
+ cmd->committing_dmap = dmap;
+out:
+ up_write(&cmd->lock);
+
+ return r;
+}
+
+int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
+{
+ int r = -EPERM;
+
+ down_write(&cmd->lock);
+
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+ goto out;
+
+ if (WARN_ON(!cmd->committing_dmap)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ r = __flush_dmap(cmd, cmd->committing_dmap);
+ if (!r) {
+ /* Clear committing dmap */
+ cmd->committing_dmap = NULL;
+ }
out:
up_write(&cmd->lock);
@@ -791,6 +850,12 @@
struct dirty_map *dmap;
unsigned long word, flags;
+ if (unlikely(region_nr >= cmd->nr_regions)) {
+ DMERR("Region %lu out of range (total number of regions %lu)",
+ region_nr, cmd->nr_regions);
+ return -ERANGE;
+ }
+
word = region_nr / BITS_PER_LONG;
spin_lock_irqsave(&cmd->bitmap_lock, flags);
@@ -803,6 +868,7 @@
dmap = cmd->current_dmap;
__set_bit(word, dmap->dirty_words);
+ __set_bit(region_nr, dmap->dirty_regions);
__set_bit(region_nr, cmd->region_map);
dmap->changed = 1;
@@ -817,9 +883,16 @@
{
int r = 0;
struct dirty_map *dmap;
- unsigned long word, region_nr, flags;
+ unsigned long word, region_nr;
- spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ if (unlikely(start >= cmd->nr_regions || (start + nr_regions) < start ||
+ (start + nr_regions) > cmd->nr_regions)) {
+ DMERR("Invalid region range: start %lu, nr_regions %lu (total number of regions %lu)",
+ start, nr_regions, cmd->nr_regions);
+ return -ERANGE;
+ }
+
+ spin_lock_irq(&cmd->bitmap_lock);
if (cmd->read_only) {
r = -EPERM;
@@ -831,12 +904,13 @@
if (!test_bit(region_nr, cmd->region_map)) {
word = region_nr / BITS_PER_LONG;
__set_bit(word, dmap->dirty_words);
+ __set_bit(region_nr, dmap->dirty_regions);
__set_bit(region_nr, cmd->region_map);
dmap->changed = 1;
}
}
out:
- spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+ spin_unlock_irq(&cmd->bitmap_lock);
return r;
}
@@ -903,13 +977,11 @@
void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd)
{
- unsigned long flags;
-
down_write(&cmd->lock);
- spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ spin_lock_irq(&cmd->bitmap_lock);
cmd->read_only = 1;
- spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+ spin_unlock_irq(&cmd->bitmap_lock);
if (!cmd->fail_io)
dm_bm_set_read_only(cmd->bm);
@@ -919,13 +991,11 @@
void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd)
{
- unsigned long flags;
-
down_write(&cmd->lock);
- spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ spin_lock_irq(&cmd->bitmap_lock);
cmd->read_only = 0;
- spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+ spin_unlock_irq(&cmd->bitmap_lock);
if (!cmd->fail_io)
dm_bm_set_read_write(cmd->bm);
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h
index 434bff0..d848b87 100644
--- a/drivers/md/dm-clone-metadata.h
+++ b/drivers/md/dm-clone-metadata.h
@@ -44,7 +44,9 @@
* @start: Starting region number
* @nr_regions: Number of regions in the range
*
- * This function doesn't block, so it's safe to call it from interrupt context.
+ * This function doesn't block, but since it uses spin_lock_irq()/spin_unlock_irq()
+ * it's NOT safe to call it from any context where interrupts are disabled, e.g.,
+ * from interrupt context.
*/
int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
unsigned long nr_regions);
@@ -73,7 +75,23 @@
/*
* Commit dm-clone metadata to disk.
+ *
+ * We use a two phase commit:
+ *
+ * 1. dm_clone_metadata_pre_commit(): Prepare the current transaction for
+ * committing. After this is called, all subsequent metadata updates, done
+ * through either dm_clone_set_region_hydrated() or
+ * dm_clone_cond_set_range(), will be part of the **next** transaction.
+ *
+ * 2. dm_clone_metadata_commit(): Actually commit the current transaction to
+ * disk and start a new transaction.
+ *
+ * This allows dm-clone to flush the destination device after step (1) to
+ * ensure that all freshly hydrated regions, for which we are updating the
+ * metadata, are properly written to non-volatile storage and won't be lost in
+ * case of a crash.
*/
+int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd);
int dm_clone_metadata_commit(struct dm_clone_metadata *cmd);
/*
@@ -110,6 +128,7 @@
* Switches metadata to a read only mode. Once read-only mode has been entered
* the following functions will return -EPERM:
*
+ * dm_clone_metadata_pre_commit()
* dm_clone_metadata_commit()
* dm_clone_set_region_hydrated()
* dm_clone_cond_set_range()
@@ -137,7 +156,7 @@
/*
* Returns the number of hydrated regions.
*/
-unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
+unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
/*
* Returns the first unhydrated region with region_nr >= @start
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 4ca8f19..eb7a5d3 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -86,6 +86,12 @@
struct dm_clone_metadata *cmd;
+ /*
+ * bio used to flush the destination device, before committing the
+ * metadata.
+ */
+ struct bio flush_bio;
+
/* Region hydration hash table */
struct hash_table_bucket *ht;
@@ -276,7 +282,7 @@
/* Get the address of the region in sectors */
static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
{
- return (region_nr << clone->region_shift);
+ return ((sector_t)region_nr << clone->region_shift);
}
/* Get the region number of the bio */
@@ -287,10 +293,17 @@
/* Get the region range covered by the bio */
static void bio_region_range(struct clone *clone, struct bio *bio,
- unsigned long *rs, unsigned long *re)
+ unsigned long *rs, unsigned long *nr_regions)
{
+ unsigned long end;
+
*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
- *re = bio_end_sector(bio) >> clone->region_shift;
+ end = bio_end_sector(bio) >> clone->region_shift;
+
+ if (*rs >= end)
+ *nr_regions = 0;
+ else
+ *nr_regions = end - *rs;
}
/* Check whether a bio overwrites a region */
@@ -332,8 +345,6 @@
*/
static void issue_bio(struct clone *clone, struct bio *bio)
{
- unsigned long flags;
-
if (!bio_triggers_commit(clone, bio)) {
generic_make_request(bio);
return;
@@ -352,9 +363,9 @@
* Batch together any bios that trigger commits and then issue a single
* commit for them in process_deferred_flush_bios().
*/
- spin_lock_irqsave(&clone->lock, flags);
+ spin_lock_irq(&clone->lock);
bio_list_add(&clone->deferred_flush_bios, bio);
- spin_unlock_irqrestore(&clone->lock, flags);
+ spin_unlock_irq(&clone->lock);
wake_worker(clone);
}
@@ -450,7 +461,7 @@
static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
{
- unsigned long rs, re;
+ unsigned long rs, nr_regions;
/*
* If the destination device supports discards, remap and trim the
@@ -459,9 +470,9 @@
*/
if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
remap_to_dest(clone, bio);
- bio_region_range(clone, bio, &rs, &re);
- trim_bio(bio, rs << clone->region_shift,
- (re - rs) << clone->region_shift);
+ bio_region_range(clone, bio, &rs, &nr_regions);
+ trim_bio(bio, region_to_sector(clone, rs),
+ nr_regions << clone->region_shift);
generic_make_request(bio);
} else
bio_endio(bio);
@@ -469,12 +480,21 @@
static void process_discard_bio(struct clone *clone, struct bio *bio)
{
- unsigned long rs, re, flags;
+ unsigned long rs, nr_regions;
- bio_region_range(clone, bio, &rs, &re);
- BUG_ON(re > clone->nr_regions);
+ bio_region_range(clone, bio, &rs, &nr_regions);
+ if (!nr_regions) {
+ bio_endio(bio);
+ return;
+ }
- if (unlikely(rs == re)) {
+ if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs ||
+ (rs + nr_regions) > clone->nr_regions)) {
+ DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)",
+ clone_device_name(clone), rs, nr_regions,
+ clone->nr_regions,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio_sectors(bio));
bio_endio(bio);
return;
}
@@ -483,7 +503,7 @@
* The covered regions are already hydrated so we just need to pass
* down the discard.
*/
- if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) {
+ if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) {
complete_discard_bio(clone, bio, true);
return;
}
@@ -501,9 +521,9 @@
/*
* Defer discard processing.
*/
- spin_lock_irqsave(&clone->lock, flags);
+ spin_lock_irq(&clone->lock);
bio_list_add(&clone->deferred_discard_bios, bio);
- spin_unlock_irqrestore(&clone->lock, flags);
+ spin_unlock_irq(&clone->lock);
wake_worker(clone);
}
@@ -778,11 +798,14 @@
struct dm_io_region from, to;
struct clone *clone = hd->clone;
+ if (WARN_ON(!nr_regions))
+ return;
+
region_size = clone->region_size;
region_start = hd->region_nr;
region_end = region_start + nr_regions - 1;
- total_size = (nr_regions - 1) << clone->region_shift;
+ total_size = region_to_sector(clone, nr_regions - 1);
if (region_end == clone->nr_regions - 1) {
/*
@@ -1106,10 +1129,13 @@
/*
* A non-zero return indicates read-only or fail mode.
*/
-static int commit_metadata(struct clone *clone)
+static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
{
int r = 0;
+ if (dest_dev_flushed)
+ *dest_dev_flushed = false;
+
mutex_lock(&clone->commit_lock);
if (!dm_clone_changed_this_transaction(clone->cmd))
@@ -1120,8 +1146,26 @@
goto out;
}
- r = dm_clone_metadata_commit(clone->cmd);
+ r = dm_clone_metadata_pre_commit(clone->cmd);
+ if (unlikely(r)) {
+ __metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
+ goto out;
+ }
+ bio_reset(&clone->flush_bio);
+ bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
+ clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+ r = submit_bio_wait(&clone->flush_bio);
+ if (unlikely(r)) {
+ __metadata_operation_failed(clone, "flush destination device", r);
+ goto out;
+ }
+
+ if (dest_dev_flushed)
+ *dest_dev_flushed = true;
+
+ r = dm_clone_metadata_commit(clone->cmd);
if (unlikely(r)) {
__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
goto out;
@@ -1140,13 +1184,13 @@
int r = -EPERM;
struct bio *bio;
struct blk_plug plug;
- unsigned long rs, re, flags;
+ unsigned long rs, nr_regions;
struct bio_list discards = BIO_EMPTY_LIST;
- spin_lock_irqsave(&clone->lock, flags);
+ spin_lock_irq(&clone->lock);
bio_list_merge(&discards, &clone->deferred_discard_bios);
bio_list_init(&clone->deferred_discard_bios);
- spin_unlock_irqrestore(&clone->lock, flags);
+ spin_unlock_irq(&clone->lock);
if (bio_list_empty(&discards))
return;
@@ -1156,14 +1200,13 @@
/* Update the metadata */
bio_list_for_each(bio, &discards) {
- bio_region_range(clone, bio, &rs, &re);
+ bio_region_range(clone, bio, &rs, &nr_regions);
/*
* A discard request might cover regions that have been already
* hydrated. There is no need to update the metadata for these
* regions.
*/
- r = dm_clone_cond_set_range(clone->cmd, rs, re - rs);
-
+ r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions);
if (unlikely(r))
break;
}
@@ -1176,13 +1219,12 @@
static void process_deferred_bios(struct clone *clone)
{
- unsigned long flags;
struct bio_list bios = BIO_EMPTY_LIST;
- spin_lock_irqsave(&clone->lock, flags);
+ spin_lock_irq(&clone->lock);
bio_list_merge(&bios, &clone->deferred_bios);
bio_list_init(&clone->deferred_bios);
- spin_unlock_irqrestore(&clone->lock, flags);
+ spin_unlock_irq(&clone->lock);
if (bio_list_empty(&bios))
return;
@@ -1193,7 +1235,7 @@
static void process_deferred_flush_bios(struct clone *clone)
{
struct bio *bio;
- unsigned long flags;
+ bool dest_dev_flushed;
struct bio_list bios = BIO_EMPTY_LIST;
struct bio_list bio_completions = BIO_EMPTY_LIST;
@@ -1201,19 +1243,19 @@
* If there are any deferred flush bios, we must commit the metadata
* before issuing them or signaling their completion.
*/
- spin_lock_irqsave(&clone->lock, flags);
+ spin_lock_irq(&clone->lock);
bio_list_merge(&bios, &clone->deferred_flush_bios);
bio_list_init(&clone->deferred_flush_bios);
bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
bio_list_init(&clone->deferred_flush_completions);
- spin_unlock_irqrestore(&clone->lock, flags);
+ spin_unlock_irq(&clone->lock);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
return;
- if (commit_metadata(clone)) {
+ if (commit_metadata(clone, &dest_dev_flushed)) {
bio_list_merge(&bios, &bio_completions);
while ((bio = bio_list_pop(&bios)))
@@ -1227,8 +1269,17 @@
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
- while ((bio = bio_list_pop(&bios)))
- generic_make_request(bio);
+ while ((bio = bio_list_pop(&bios))) {
+ if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
+ /* We just flushed the destination device as part of
+ * the metadata commit, so there is no reason to send
+ * another flush.
+ */
+ bio_endio(bio);
+ } else {
+ generic_make_request(bio);
+ }
+ }
}
static void do_worker(struct work_struct *work)
@@ -1400,7 +1451,7 @@
/* Commit to ensure statistics aren't out-of-date */
if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
- (void) commit_metadata(clone);
+ (void) commit_metadata(clone, NULL);
r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
@@ -1418,7 +1469,7 @@
goto error;
}
- DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
+ DMEMIT("%u %llu/%llu %llu %u/%lu %u ",
DM_CLONE_METADATA_BLOCK_SIZE,
(unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
(unsigned long long)nr_metadata_blocks,
@@ -1738,6 +1789,7 @@
static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
int r;
+ sector_t nr_regions;
struct clone *clone;
struct dm_arg_set as;
@@ -1779,7 +1831,16 @@
goto out_with_source_dev;
clone->region_shift = __ffs(clone->region_size);
- clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+ nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+
+ /* Check for overflow */
+ if (nr_regions != (unsigned long)nr_regions) {
+ ti->error = "Too many regions. Consider increasing the region size";
+ r = -EOVERFLOW;
+ goto out_with_source_dev;
+ }
+
+ clone->nr_regions = nr_regions;
r = validate_nr_regions(clone->nr_regions, &ti->error);
if (r)
@@ -1834,6 +1895,7 @@
bio_list_init(&clone->deferred_flush_completions);
clone->hydration_offset = 0;
atomic_set(&clone->hydrations_in_flight, 0);
+ bio_init(&clone->flush_bio, NULL, 0);
clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
if (!clone->wq) {
@@ -1907,6 +1969,7 @@
struct clone *clone = ti->private;
mutex_destroy(&clone->commit_lock);
+ bio_uninit(&clone->flush_bio);
for (i = 0; i < clone->nr_ctr_args; i++)
kfree(clone->ctr_args[i]);
@@ -1961,7 +2024,7 @@
wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
flush_workqueue(clone->wq);
- (void) commit_metadata(clone);
+ (void) commit_metadata(clone, NULL);
}
static void clone_resume(struct dm_target *ti)
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index c4ef1fc..3fea121 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -106,6 +106,10 @@
struct block_device *bdev;
+ int swap_bios;
+ struct semaphore swap_bios_semaphore;
+ struct mutex swap_bios_lock;
+
struct dm_stats stats;
/* for blk-mq request-based DM support */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index eb9782f..571c04e 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -331,8 +331,14 @@
static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
- unsigned bs = crypto_skcipher_blocksize(any_tfm(cc));
- int log = ilog2(bs);
+ unsigned bs;
+ int log;
+
+ if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
+ bs = crypto_aead_blocksize(any_tfm_aead(cc));
+ else
+ bs = crypto_skcipher_blocksize(any_tfm(cc));
+ log = ilog2(bs);
/* we need to calculate how far we must shift the sector count
* to get the cipher block count, we use this shift in _gen */
@@ -714,10 +720,10 @@
u8 buf[MAX_CIPHER_BLOCKSIZE] __aligned(__alignof__(__le64));
struct skcipher_request *req;
struct scatterlist src, dst;
- struct crypto_wait wait;
+ DECLARE_CRYPTO_WAIT(wait);
int err;
- req = skcipher_request_alloc(any_tfm(cc), GFP_KERNEL | GFP_NOFS);
+ req = skcipher_request_alloc(any_tfm(cc), GFP_NOIO);
if (!req)
return -ENOMEM;
@@ -2086,7 +2092,12 @@
struct crypt_config *cc = pool_data;
struct page *page;
- if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) &&
+ /*
+ * Note, percpu_counter_read_positive() may over (and under) estimate
+ * the current usage by at most (batch - 1) * num_online_cpus() pages,
+ * but avoids potential spinlock contention of an exact result.
+ */
+ if (unlikely(percpu_counter_read_positive(&cc->n_allocated_pages) >= dm_crypt_pages_per_client) &&
likely(gfp_mask & __GFP_NORETRY))
return NULL;
@@ -2731,6 +2742,7 @@
wake_up_process(cc->write_thread);
ti->num_flush_bios = 1;
+ ti->limit_swap_bios = true;
return 0;
@@ -2951,7 +2963,7 @@
limits->max_segment_size = PAGE_SIZE;
limits->logical_block_size =
- max_t(unsigned short, limits->logical_block_size, cc->sector_size);
+ max_t(unsigned, limits->logical_block_size, cc->sector_size);
limits->physical_block_size =
max_t(unsigned, limits->physical_block_size, cc->sector_size);
limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size);
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index bdb84b8..6b0b3a1 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -47,6 +47,7 @@
static void writeset_free(struct writeset *ws)
{
vfree(ws->bits);
+ ws->bits = NULL;
}
static int setup_on_disk_bitset(struct dm_disk_bitset *info,
@@ -71,8 +72,6 @@
*/
static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks)
{
- ws->md.nr_bits = nr_blocks;
- ws->md.root = INVALID_WRITESET_ROOT;
ws->bits = vzalloc(bitset_size(nr_blocks));
if (!ws->bits) {
DMERR("%s: couldn't allocate in memory bitset", __func__);
@@ -85,12 +84,14 @@
/*
* Wipes the in-core bitset, and creates a new on disk bitset.
*/
-static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws)
+static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws,
+ dm_block_t nr_blocks)
{
int r;
- memset(ws->bits, 0, bitset_size(ws->md.nr_bits));
+ memset(ws->bits, 0, bitset_size(nr_blocks));
+ ws->md.nr_bits = nr_blocks;
r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root);
if (r) {
DMERR("%s: setup_on_disk_bitset failed", __func__);
@@ -134,7 +135,7 @@
{
int r;
- if (!test_and_set_bit(block, ws->bits)) {
+ if (!test_bit(block, ws->bits)) {
r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root);
if (r) {
/* FIXME: fail mode */
@@ -388,7 +389,7 @@
static int ws_eq(void *context, const void *value1, const void *value2)
{
- return !memcmp(value1, value2, sizeof(struct writeset_metadata));
+ return !memcmp(value1, value2, sizeof(struct writeset_disk));
}
/*----------------------------------------------------------------*/
@@ -564,6 +565,15 @@
}
disk = dm_block_data(sblock);
+
+ /* Verify the data block size hasn't changed */
+ if (le32_to_cpu(disk->data_block_size) != md->block_size) {
+ DMERR("changing the data block size (from %u to %llu) is not supported",
+ le32_to_cpu(disk->data_block_size), md->block_size);
+ r = -EINVAL;
+ goto bad;
+ }
+
r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION,
disk->metadata_space_map_root,
sizeof(disk->metadata_space_map_root),
@@ -575,10 +585,10 @@
setup_infos(md);
- md->block_size = le32_to_cpu(disk->data_block_size);
md->nr_blocks = le32_to_cpu(disk->nr_blocks);
md->current_era = le32_to_cpu(disk->current_era);
+ ws_unpack(&disk->current_writeset, &md->current_writeset->md);
md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root);
md->era_array_root = le64_to_cpu(disk->era_array_root);
md->metadata_snap = le64_to_cpu(disk->metadata_snap);
@@ -746,6 +756,12 @@
ws_unpack(&disk, &d->writeset);
d->value = cpu_to_le32(key);
+ /*
+ * We initialise another bitset info to avoid any caching side effects
+ * with the previous one.
+ */
+ dm_disk_bitset_init(md->tm, &d->info);
+
d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks);
d->current_bit = 0;
d->step = metadata_digest_transcribe_writeset;
@@ -759,12 +775,6 @@
return 0;
memset(d, 0, sizeof(*d));
-
- /*
- * We initialise another bitset info to avoid any caching side
- * effects with the previous one.
- */
- dm_disk_bitset_init(md->tm, &d->info);
d->step = metadata_digest_lookup_writeset;
return 0;
@@ -802,6 +812,8 @@
static void metadata_close(struct era_metadata *md)
{
+ writeset_free(&md->writesets[0]);
+ writeset_free(&md->writesets[1]);
destroy_persistent_data_objects(md);
kfree(md);
}
@@ -839,6 +851,7 @@
r = writeset_alloc(&md->writesets[1], *new_size);
if (r) {
DMERR("%s: writeset_alloc failed for writeset 1", __func__);
+ writeset_free(&md->writesets[0]);
return r;
}
@@ -849,6 +862,8 @@
&value, &md->era_array_root);
if (r) {
DMERR("%s: dm_array_resize failed", __func__);
+ writeset_free(&md->writesets[0]);
+ writeset_free(&md->writesets[1]);
return r;
}
@@ -870,7 +885,6 @@
}
ws_pack(&md->current_writeset->md, &value);
- md->current_writeset->md.root = INVALID_WRITESET_ROOT;
keys[0] = md->current_era;
__dm_bless_for_disk(&value);
@@ -882,6 +896,7 @@
return r;
}
+ md->current_writeset->md.root = INVALID_WRITESET_ROOT;
md->archived_writesets = true;
return 0;
@@ -898,7 +913,7 @@
int r;
struct writeset *new_writeset = next_writeset(md);
- r = writeset_init(&md->bitset_info, new_writeset);
+ r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks);
if (r) {
DMERR("%s: writeset_init failed", __func__);
return r;
@@ -951,7 +966,7 @@
int r;
struct dm_block *sblock;
- if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) {
+ if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
&md->current_writeset->md.root);
if (r) {
@@ -1226,8 +1241,10 @@
int r;
struct bio_list deferred_bios, marked_bios;
struct bio *bio;
+ struct blk_plug plug;
bool commit_needed = false;
bool failed = false;
+ struct writeset *ws = era->md->current_writeset;
bio_list_init(&deferred_bios);
bio_list_init(&marked_bios);
@@ -1237,9 +1254,11 @@
bio_list_init(&era->deferred_bios);
spin_unlock(&era->deferred_lock);
+ if (bio_list_empty(&deferred_bios))
+ return;
+
while ((bio = bio_list_pop(&deferred_bios))) {
- r = writeset_test_and_set(&era->md->bitset_info,
- era->md->current_writeset,
+ r = writeset_test_and_set(&era->md->bitset_info, ws,
get_block(era, bio));
if (r < 0) {
/*
@@ -1247,7 +1266,6 @@
* FIXME: finish.
*/
failed = true;
-
} else if (r == 0)
commit_needed = true;
@@ -1263,9 +1281,19 @@
if (failed)
while ((bio = bio_list_pop(&marked_bios)))
bio_io_error(bio);
- else
- while ((bio = bio_list_pop(&marked_bios)))
+ else {
+ blk_start_plug(&plug);
+ while ((bio = bio_list_pop(&marked_bios))) {
+ /*
+ * Only update the in-core writeset if the on-disk one
+ * was updated too.
+ */
+ if (commit_needed)
+ set_bit(get_block(era, bio), ws->bits);
generic_make_request(bio);
+ }
+ blk_finish_plug(&plug);
+ }
}
static void process_rpc_calls(struct era *era)
@@ -1486,15 +1514,6 @@
}
era->md = md;
- era->nr_blocks = calc_nr_blocks(era);
-
- r = metadata_resize(era->md, &era->nr_blocks);
- if (r) {
- ti->error = "couldn't resize metadata";
- era_destroy(era);
- return -ENOMEM;
- }
-
era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
if (!era->wq) {
ti->error = "could not create workqueue for metadata object";
@@ -1571,16 +1590,24 @@
dm_block_t new_size = calc_nr_blocks(era);
if (era->nr_blocks != new_size) {
- r = in_worker1(era, metadata_resize, &new_size);
- if (r)
+ r = metadata_resize(era->md, &new_size);
+ if (r) {
+ DMERR("%s: metadata_resize failed", __func__);
return r;
+ }
+
+ r = metadata_commit(era->md);
+ if (r) {
+ DMERR("%s: metadata_commit failed", __func__);
+ return r;
+ }
era->nr_blocks = new_size;
}
start_worker(era);
- r = in_worker0(era, metadata_new_era);
+ r = in_worker0(era, metadata_era_rollover);
if (r) {
DMERR("%s: metadata_era_rollover failed", __func__);
return r;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index dab4446..9f4d657 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -6,6 +6,8 @@
* This file is released under the GPL.
*/
+#include "dm-bio-record.h"
+
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/device-mapper.h>
@@ -199,17 +201,19 @@
__u8 log2_blocks_per_bitmap_bit;
unsigned char mode;
- int suspending;
int failed;
struct crypto_shash *internal_hash;
+ struct dm_target *ti;
+
/* these variables are locked with endio_wait.lock */
struct rb_root in_progress;
struct list_head wait_list;
wait_queue_head_t endio_wait;
struct workqueue_struct *wait_wq;
+ struct workqueue_struct *offload_wq;
unsigned char commit_seq;
commit_id_t commit_ids[N_COMMIT_IDS];
@@ -250,6 +254,7 @@
bool journal_uptodate;
bool just_formatted;
bool recalculate_flag;
+ bool legacy_recalculate;
struct alg_spec internal_hash_alg;
struct alg_spec journal_crypt_alg;
@@ -290,11 +295,7 @@
struct completion *completion;
- struct gendisk *orig_bi_disk;
- u8 orig_bi_partno;
- bio_end_io_t *orig_bi_end_io;
- struct bio_integrity_payload *orig_bi_integrity;
- struct bvec_iter orig_bi_iter;
+ struct dm_bio_details bio_details;
};
struct journal_completion {
@@ -381,6 +382,14 @@
return READ_ONCE(ic->failed);
}
+static bool dm_integrity_disable_recalculate(struct dm_integrity_c *ic)
+{
+ if ((ic->internal_hash_alg.key || ic->journal_mac_alg.key) &&
+ !ic->legacy_recalculate)
+ return true;
+ return false;
+}
+
static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
unsigned j, unsigned char seq)
{
@@ -1343,12 +1352,52 @@
return 0;
}
-static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
+struct flush_request {
+ struct dm_io_request io_req;
+ struct dm_io_region io_reg;
+ struct dm_integrity_c *ic;
+ struct completion comp;
+};
+
+static void flush_notify(unsigned long error, void *fr_)
+{
+ struct flush_request *fr = fr_;
+ if (unlikely(error != 0))
+ dm_integrity_io_error(fr->ic, "flusing disk cache", -EIO);
+ complete(&fr->comp);
+}
+
+static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_data)
{
int r;
+
+ struct flush_request fr;
+
+ if (!ic->meta_dev)
+ flush_data = false;
+ if (flush_data) {
+ fr.io_req.bi_op = REQ_OP_WRITE,
+ fr.io_req.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
+ fr.io_req.mem.type = DM_IO_KMEM,
+ fr.io_req.mem.ptr.addr = NULL,
+ fr.io_req.notify.fn = flush_notify,
+ fr.io_req.notify.context = &fr;
+ fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio),
+ fr.io_reg.bdev = ic->dev->bdev,
+ fr.io_reg.sector = 0,
+ fr.io_reg.count = 0,
+ fr.ic = ic;
+ init_completion(&fr.comp);
+ r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL);
+ BUG_ON(r);
+ }
+
r = dm_bufio_write_dirty_buffers(ic->bufio);
if (unlikely(r))
dm_integrity_io_error(ic, "writing tags", r);
+
+ if (flush_data)
+ wait_for_completion(&fr.comp);
}
static void sleep_on_endio_wait(struct dm_integrity_c *ic)
@@ -1434,7 +1483,7 @@
dio->range.logical_sector += dio->range.n_sectors;
bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
INIT_WORK(&dio->work, integrity_bio_wait);
- queue_work(ic->wait_wq, &dio->work);
+ queue_work(ic->offload_wq, &dio->work);
return;
}
do_endio_flush(ic, dio);
@@ -1445,14 +1494,9 @@
{
struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
- bio->bi_iter = dio->orig_bi_iter;
- bio->bi_disk = dio->orig_bi_disk;
- bio->bi_partno = dio->orig_bi_partno;
- if (dio->orig_bi_integrity) {
- bio->bi_integrity = dio->orig_bi_integrity;
+ dm_bio_restore(&dio->bio_details, bio);
+ if (bio->bi_integrity)
bio->bi_opf |= REQ_INTEGRITY;
- }
- bio->bi_end_io = dio->orig_bi_end_io;
if (dio->completion)
complete(dio->completion);
@@ -1519,7 +1563,7 @@
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
char *checksums;
unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
- char checksums_onstack[HASH_MAX_DIGESTSIZE];
+ char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
unsigned sectors_to_process = dio->range.n_sectors;
sector_t sector = dio->range.logical_sector;
@@ -1537,7 +1581,7 @@
}
}
- __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) {
+ __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
unsigned pos;
char *mem, *checksums_ptr;
@@ -1581,7 +1625,7 @@
if (likely(checksums != checksums_onstack))
kfree(checksums);
} else {
- struct bio_integrity_payload *bip = dio->orig_bi_integrity;
+ struct bio_integrity_payload *bip = dio->bio_details.bi_integrity;
if (bip) {
struct bio_vec biv;
@@ -1748,7 +1792,7 @@
} while (++s < ic->sectors_per_block);
#ifdef INTERNAL_VERIFY
if (ic->internal_hash) {
- char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+ char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
@@ -1860,7 +1904,7 @@
if (need_sync_io && from_map) {
INIT_WORK(&dio->work, integrity_bio_wait);
- queue_work(ic->metadata_wq, &dio->work);
+ queue_work(ic->offload_wq, &dio->work);
return;
}
@@ -2000,20 +2044,13 @@
} else
dio->completion = NULL;
- dio->orig_bi_iter = bio->bi_iter;
-
- dio->orig_bi_disk = bio->bi_disk;
- dio->orig_bi_partno = bio->bi_partno;
+ dm_bio_record(&dio->bio_details, bio);
bio_set_dev(bio, ic->dev->bdev);
-
- dio->orig_bi_integrity = bio_integrity(bio);
bio->bi_integrity = NULL;
bio->bi_opf &= ~REQ_INTEGRITY;
-
- dio->orig_bi_end_io = bio->bi_end_io;
bio->bi_end_io = integrity_end_io;
-
bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
+
generic_make_request(bio);
if (need_sync_io) {
@@ -2089,7 +2126,7 @@
flushes = bio_list_get(&ic->flush_bio_list);
if (unlikely(ic->mode != 'J')) {
spin_unlock_irq(&ic->endio_wait.lock);
- dm_integrity_flush_buffers(ic);
+ dm_integrity_flush_buffers(ic, true);
goto release_flush_bios;
}
@@ -2299,7 +2336,7 @@
complete_journal_op(&comp);
wait_for_completion_io(&comp.comp);
- dm_integrity_flush_buffers(ic);
+ dm_integrity_flush_buffers(ic, true);
}
static void integrity_writer(struct work_struct *w)
@@ -2310,7 +2347,7 @@
unsigned prev_free_sectors;
/* the following test is not needed, but it tests the replay code */
- if (READ_ONCE(ic->suspending) && !ic->meta_dev)
+ if (unlikely(dm_post_suspending(ic->ti)) && !ic->meta_dev)
return;
spin_lock_irq(&ic->endio_wait.lock);
@@ -2341,7 +2378,7 @@
{
int r;
- dm_integrity_flush_buffers(ic);
+ dm_integrity_flush_buffers(ic, false);
if (dm_integrity_failed(ic))
return;
@@ -2371,12 +2408,13 @@
next_chunk:
- if (unlikely(READ_ONCE(ic->suspending)))
+ if (unlikely(dm_post_suspending(ic->ti)))
goto unlock_ret;
range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
if (unlikely(range.logical_sector >= ic->provided_data_sectors)) {
if (ic->mode == 'B') {
+ block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
DEBUG_print("queue_delayed_work: bitmap_flush_work\n");
queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
}
@@ -2454,6 +2492,17 @@
goto err;
}
+ if (ic->mode == 'B') {
+ sector_t start, end;
+ start = (range.logical_sector >>
+ (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)) <<
+ (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+ end = ((range.logical_sector + range.n_sectors) >>
+ (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)) <<
+ (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+ block_bitmap_op(ic, ic->recalc_bitmap, start, end - start, BITMAP_OP_CLEAR);
+ }
+
advance_and_next:
cond_resched();
@@ -2496,7 +2545,7 @@
dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
remove_range(ic, &dio->range);
INIT_WORK(&dio->work, integrity_bio_wait);
- queue_work(ic->wait_wq, &dio->work);
+ queue_work(ic->offload_wq, &dio->work);
} else {
block_bitmap_op(ic, ic->journal, dio->range.logical_sector,
dio->range.n_sectors, BITMAP_OP_SET);
@@ -2519,7 +2568,7 @@
remove_range(ic, &dio->range);
INIT_WORK(&dio->work, integrity_bio_wait);
- queue_work(ic->wait_wq, &dio->work);
+ queue_work(ic->offload_wq, &dio->work);
}
queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
@@ -2532,7 +2581,7 @@
unsigned long limit;
struct bio *bio;
- dm_integrity_flush_buffers(ic);
+ dm_integrity_flush_buffers(ic, false);
range.logical_sector = 0;
range.n_sectors = ic->provided_data_sectors;
@@ -2541,7 +2590,7 @@
add_new_range_and_wait(ic, &range);
spin_unlock_irq(&ic->endio_wait.lock);
- dm_integrity_flush_buffers(ic);
+ dm_integrity_flush_buffers(ic, true);
if (ic->meta_dev)
blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL);
@@ -2799,8 +2848,6 @@
del_timer_sync(&ic->autocommit_timer);
- WRITE_ONCE(ic->suspending, 1);
-
if (ic->recalc_wq)
drain_workqueue(ic->recalc_wq);
@@ -2814,11 +2861,11 @@
if (ic->meta_dev)
queue_work(ic->writer_wq, &ic->writer_work);
drain_workqueue(ic->writer_wq);
- dm_integrity_flush_buffers(ic);
+ dm_integrity_flush_buffers(ic, true);
}
if (ic->mode == 'B') {
- dm_integrity_flush_buffers(ic);
+ dm_integrity_flush_buffers(ic, true);
#if 1
/* set to 0 to test bitmap replay code */
init_journal(ic, 0, ic->journal_sections, 0);
@@ -2829,8 +2876,6 @@
#endif
}
- WRITE_ONCE(ic->suspending, 0);
-
BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
ic->journal_uptodate = true;
@@ -2883,17 +2928,24 @@
} else {
replay_journal(ic);
if (ic->mode == 'B') {
- int mode;
ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
if (unlikely(r))
dm_integrity_io_error(ic, "writing superblock", r);
- mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR;
- block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode);
- block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode);
- block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode);
+ block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
+ block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
+ block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
+ le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors) {
+ block_bitmap_op(ic, ic->journal, le64_to_cpu(ic->sb->recalc_sector),
+ ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
+ block_bitmap_op(ic, ic->recalc_bitmap, le64_to_cpu(ic->sb->recalc_sector),
+ ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
+ block_bitmap_op(ic, ic->may_write_bitmap, le64_to_cpu(ic->sb->recalc_sector),
+ ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
+ }
rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
}
@@ -2955,13 +3007,14 @@
arg_count += !!ic->internal_hash_alg.alg_string;
arg_count += !!ic->journal_crypt_alg.alg_string;
arg_count += !!ic->journal_mac_alg.alg_string;
+ arg_count += ic->legacy_recalculate;
DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
ic->tag_size, ic->mode, arg_count);
if (ic->meta_dev)
DMEMIT(" meta_device:%s", ic->meta_dev->name);
if (ic->sectors_per_block != 1)
DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
- if (ic->recalculate_flag)
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
DMEMIT(" recalculate");
DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
@@ -2974,6 +3027,8 @@
DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
}
+ if (ic->legacy_recalculate)
+ DMEMIT(" legacy_recalculate");
#define EMIT_ALG(a, n) \
do { \
@@ -3582,7 +3637,7 @@
unsigned extra_args;
struct dm_arg_set as;
static const struct dm_arg _args[] = {
- {0, 9, "Invalid number of feature args"},
+ {0, 14, "Invalid number of feature args"},
};
unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
bool should_write_sb;
@@ -3607,6 +3662,7 @@
}
ti->private = ic;
ti->per_io_data_size = sizeof(struct dm_integrity_io);
+ ic->ti = ti;
ic->in_progress = RB_ROOT;
INIT_LIST_HEAD(&ic->wait_list);
@@ -3706,6 +3762,7 @@
if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
r = -EINVAL;
ti->error = "Invalid bitmap_flush_interval argument";
+ goto bad;
}
ic->bitmap_flush_interval = msecs_to_jiffies(val);
} else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
@@ -3725,6 +3782,8 @@
goto bad;
} else if (!strcmp(opt_string, "recalculate")) {
ic->recalculate_flag = true;
+ } else if (!strcmp(opt_string, "legacy_recalculate")) {
+ ic->legacy_recalculate = true;
} else {
r = -EINVAL;
ti->error = "Invalid argument";
@@ -3818,6 +3877,14 @@
goto bad;
}
+ ic->offload_wq = alloc_workqueue("dm-integrity-offload", WQ_MEM_RECLAIM,
+ METADATA_WORKQUEUE_MAX_ACTIVE);
+ if (!ic->offload_wq) {
+ ti->error = "Cannot allocate workqueue";
+ r = -ENOMEM;
+ goto bad;
+ }
+
ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1);
if (!ic->commit_wq) {
ti->error = "Cannot allocate workqueue";
@@ -4007,6 +4074,20 @@
r = -ENOMEM;
goto bad;
}
+ } else {
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
+ ti->error = "Recalculate can only be specified with internal_hash";
+ r = -EINVAL;
+ goto bad;
+ }
+ }
+
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
+ le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors &&
+ dm_integrity_disable_recalculate(ic)) {
+ ti->error = "Recalculating with HMAC is disabled for security reasons - if you really need it, use the argument \"legacy_recalculate\"";
+ r = -EOPNOTSUPP;
+ goto bad;
}
ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev,
@@ -4122,6 +4203,8 @@
destroy_workqueue(ic->metadata_wq);
if (ic->wait_wq)
destroy_workqueue(ic->wait_wq);
+ if (ic->offload_wq)
+ destroy_workqueue(ic->offload_wq);
if (ic->commit_wq)
destroy_workqueue(ic->commit_wq);
if (ic->writer_wq)
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index ac83f50..3f15d8d 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -529,7 +529,7 @@
* Grab our output buffer.
*/
nl = orig_nl = get_result_buffer(param, param_size, &len);
- if (len < needed) {
+ if (len < needed || len < sizeof(nl->dev)) {
param->flags |= DM_BUFFER_FULL_FLAG;
goto out;
}
@@ -1600,6 +1600,7 @@
if (!argc) {
DMWARN("Empty message received.");
+ r = -EINVAL;
goto out_argv;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index dbcc1e4..54ecfea 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -576,10 +576,12 @@
/* Do we need to select a new pgpath? */
pgpath = READ_ONCE(m->current_pgpath);
- queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
- if (!pgpath || !queue_io)
+ if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
+ /* MPATHF_QUEUE_IO might have been cleared by choose_pgpath. */
+ queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
+
if ((pgpath && queue_io) ||
(!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
/* Queue for the daemon to resubmit */
@@ -599,45 +601,10 @@
return pgpath;
}
-static struct pgpath *__map_bio_fast(struct multipath *m, struct bio *bio)
-{
- struct pgpath *pgpath;
- unsigned long flags;
-
- /* Do we need to select a new pgpath? */
- /*
- * FIXME: currently only switching path if no path (due to failure, etc)
- * - which negates the point of using a path selector
- */
- pgpath = READ_ONCE(m->current_pgpath);
- if (!pgpath)
- pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
-
- if (!pgpath) {
- if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- /* Queue for the daemon to resubmit */
- spin_lock_irqsave(&m->lock, flags);
- bio_list_add(&m->queued_bios, bio);
- spin_unlock_irqrestore(&m->lock, flags);
- queue_work(kmultipathd, &m->process_queued_bios);
-
- return ERR_PTR(-EAGAIN);
- }
- return NULL;
- }
-
- return pgpath;
-}
-
static int __multipath_map_bio(struct multipath *m, struct bio *bio,
struct dm_mpath_io *mpio)
{
- struct pgpath *pgpath;
-
- if (!m->hw_handler_name)
- pgpath = __map_bio_fast(m, bio);
- else
- pgpath = __map_bio(m, bio);
+ struct pgpath *pgpath = __map_bio(m, bio);
if (IS_ERR(pgpath))
return DM_MAPIO_SUBMITTED;
@@ -1223,17 +1190,25 @@
static void flush_multipath_work(struct multipath *m)
{
if (m->hw_handler_name) {
- set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
- smp_mb__after_atomic();
+ unsigned long flags;
- if (atomic_read(&m->pg_init_in_progress))
+ if (!atomic_read(&m->pg_init_in_progress))
+ goto skip;
+
+ spin_lock_irqsave(&m->lock, flags);
+ if (atomic_read(&m->pg_init_in_progress) &&
+ !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) {
+ spin_unlock_irqrestore(&m->lock, flags);
+
flush_workqueue(kmpath_handlerd);
- multipath_wait_for_pg_init_completion(m);
+ multipath_wait_for_pg_init_completion(m);
- clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
- smp_mb__after_atomic();
+ spin_lock_irqsave(&m->lock, flags);
+ clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
+ }
+ spin_unlock_irqrestore(&m->lock, flags);
}
-
+skip:
if (m->queue_mode == DM_TYPE_BIO_BASED)
flush_work(&m->process_queued_bios);
flush_work(&m->trigger_event);
@@ -1889,7 +1864,7 @@
int r;
current_pgpath = READ_ONCE(m->current_pgpath);
- if (!current_pgpath)
+ if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
current_pgpath = choose_pgpath(m, 0);
if (current_pgpath) {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b0aa595..5e73cc6 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1892,6 +1892,14 @@
return rs->md.new_level != rs->md.level;
}
+/* True if layout is set to reshape. */
+static bool rs_is_layout_change(struct raid_set *rs, bool use_mddev)
+{
+ return (use_mddev ? rs->md.delta_disks : rs->delta_disks) ||
+ rs->md.new_layout != rs->md.layout ||
+ rs->md.new_chunk_sectors != rs->md.chunk_sectors;
+}
+
/* True if @rs is requested to reshape by ctr */
static bool rs_reshape_requested(struct raid_set *rs)
{
@@ -1904,9 +1912,7 @@
if (rs_is_raid0(rs))
return false;
- change = mddev->new_layout != mddev->layout ||
- mddev->new_chunk_sectors != mddev->chunk_sectors ||
- rs->delta_disks;
+ change = rs_is_layout_change(rs, false);
/* Historical case to support raid1 reshape without delta disks */
if (rs_is_raid1(rs)) {
@@ -2843,7 +2849,7 @@
}
/*
- *
+ * Reshape:
* - change raid layout
* - change chunk size
* - add disks
@@ -2953,6 +2959,20 @@
}
/*
+ * If the md resync thread has updated superblock with max reshape position
+ * at the end of a reshape but not (yet) reset the layout configuration
+ * changes -> reset the latter.
+ */
+static void rs_reset_inconclusive_reshape(struct raid_set *rs)
+{
+ if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) {
+ rs_set_cur(rs);
+ rs->md.delta_disks = 0;
+ rs->md.reshape_backwards = 0;
+ }
+}
+
+/*
* Enable/disable discard support on RAID set depending on
* RAID level and discard properties of underlying RAID members.
*/
@@ -3216,11 +3236,14 @@
if (r)
goto bad;
+ /* Catch any inconclusive reshape superblock content. */
+ rs_reset_inconclusive_reshape(rs);
+
/* Start raid set read-only and assumed clean to change in raid_resume() */
rs->md.ro = 1;
rs->md.in_sync = 1;
- /* Keep array frozen */
+ /* Keep array frozen until resume. */
set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
/* Has to be held on running the array */
@@ -3234,7 +3257,6 @@
}
r = md_start(&rs->md);
-
if (r) {
ti->error = "Failed to start raid array";
mddev_unlock(&rs->md);
@@ -3744,10 +3766,10 @@
blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
/*
- * RAID1 and RAID10 personalities require bio splitting,
- * RAID0/4/5/6 don't and process large discard bios properly.
+ * RAID0 and RAID10 personalities require bio splitting,
+ * RAID1/4/5/6 don't and process large discard bios properly.
*/
- if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
+ if (rs_is_raid0(rs) || rs_is_raid10(rs)) {
limits->discard_granularity = chunk_size_bytes;
limits->max_discard_sectors = rs->md.chunk_sectors;
}
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 3f8577e..6bc6192 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -70,9 +70,6 @@
void dm_stop_queue(struct request_queue *q)
{
- if (blk_mq_queue_stopped(q))
- return;
-
blk_mq_quiesce_queue(q);
}
@@ -575,6 +572,7 @@
blk_mq_free_tag_set(md->tag_set);
out_kfree_tag_set:
kfree(md->tag_set);
+ md->tag_set = NULL;
return err;
}
@@ -584,6 +582,7 @@
if (md->tag_set) {
blk_mq_free_tag_set(md->tag_set);
kfree(md->tag_set);
+ md->tag_set = NULL;
}
}
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3c50c4e..963d377 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -17,7 +17,7 @@
#include <linux/dm-bufio.h>
#define DM_MSG_PREFIX "persistent snapshot"
-#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32U /* 16KB */
#define DM_PREFETCH_CHUNKS 12
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 4fb1a40..e902aae 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -141,6 +141,11 @@
* for them to be committed.
*/
struct bio_list bios_queued_during_merge;
+
+ /*
+ * Flush data after merge.
+ */
+ struct bio flush_bio;
};
/*
@@ -849,7 +854,7 @@
static uint32_t __minimum_chunk_size(struct origin *o)
{
struct dm_snapshot *snap;
- unsigned chunk_size = 0;
+ unsigned chunk_size = rounddown_pow_of_two(UINT_MAX);
if (o)
list_for_each_entry(snap, &o->snapshots, list)
@@ -1121,6 +1126,17 @@
static void error_bios(struct bio *bio);
+static int flush_data(struct dm_snapshot *s)
+{
+ struct bio *flush_bio = &s->flush_bio;
+
+ bio_reset(flush_bio);
+ bio_set_dev(flush_bio, s->origin->bdev);
+ flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+ return submit_bio_wait(flush_bio);
+}
+
static void merge_callback(int read_err, unsigned long write_err, void *context)
{
struct dm_snapshot *s = context;
@@ -1134,6 +1150,11 @@
goto shut;
}
+ if (flush_data(s) < 0) {
+ DMERR("Flush after merge failed: shutting down merge");
+ goto shut;
+ }
+
if (s->store->type->commit_merge(s->store,
s->num_merging_chunks) < 0) {
DMERR("Write error in exception store: shutting down merge");
@@ -1318,6 +1339,7 @@
s->first_merging_chunk = 0;
s->num_merging_chunks = 0;
bio_list_init(&s->bios_queued_during_merge);
+ bio_init(&s->flush_bio, NULL, 0);
/* Allocate hash table for COW data */
if (init_hash_tables(s)) {
@@ -1386,6 +1408,7 @@
if (!s->store->chunk_size) {
ti->error = "Chunk size not set";
+ r = -EINVAL;
goto bad_read_metadata;
}
@@ -1504,6 +1527,8 @@
dm_exception_store_destroy(s->store);
+ bio_uninit(&s->flush_bio);
+
dm_put_device(ti, s->cow);
dm_put_device(ti, s->origin);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 52e0495..06b3823 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -428,14 +428,23 @@
{
int r;
dev_t dev;
+ unsigned int major, minor;
+ char dummy;
struct dm_dev_internal *dd;
struct dm_table *t = ti->table;
BUG_ON(!t);
- dev = dm_get_dev_t(path);
- if (!dev)
- return -ENODEV;
+ if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
+ /* Extract the major/minor numbers */
+ dev = MKDEV(major, minor);
+ if (MAJOR(dev) != major || MINOR(dev) != minor)
+ return -EOVERFLOW;
+ } else {
+ dev = dm_get_dev_t(path);
+ if (!dev)
+ return -ENODEV;
+ }
dd = find_device(&t->devices, dev);
if (!dd) {
@@ -879,20 +888,24 @@
EXPORT_SYMBOL_GPL(dm_table_set_type);
/* validate the dax capability of the target device span */
-int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
- int blocksize = *(int *) data;
+ int blocksize = *(int *) data, id;
+ bool rc;
- return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize,
- start, len);
+ id = dax_read_lock();
+ rc = !dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len);
+ dax_read_unlock(id);
+
+ return rc;
}
/* Check devices support synchronous DAX */
-static int device_dax_synchronous(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
- return dev->dax_dev && dax_synchronous(dev->dax_dev);
+ return !dev->dax_dev || !dax_synchronous(dev->dax_dev);
}
bool dm_table_supports_dax(struct dm_table *t,
@@ -909,7 +922,7 @@
return false;
if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, iterate_fn, blocksize))
+ ti->type->iterate_devices(ti, iterate_fn, blocksize))
return false;
}
@@ -918,21 +931,15 @@
static bool dm_table_does_not_support_partial_completion(struct dm_table *t);
-struct verify_rq_based_data {
- unsigned sq_count;
- unsigned mq_count;
-};
-
-static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
- struct request_queue *q = bdev_get_queue(dev->bdev);
- struct verify_rq_based_data *v = data;
+ struct block_device *bdev = dev->bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
- if (queue_is_mq(q))
- v->mq_count++;
- else
- v->sq_count++;
+ /* request-based cannot stack on partitions! */
+ if (bdev != bdev->bd_contains)
+ return false;
return queue_is_mq(q);
}
@@ -941,7 +948,6 @@
{
unsigned i;
unsigned bio_based = 0, request_based = 0, hybrid = 0;
- struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0};
struct dm_target *tgt;
struct list_head *devices = dm_table_get_devices(t);
enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
@@ -990,7 +996,7 @@
verify_bio_based:
/* We must use this table as bio-based */
t->type = DM_TYPE_BIO_BASED;
- if (dm_table_supports_dax(t, device_supports_dax, &page_size) ||
+ if (dm_table_supports_dax(t, device_not_dax_capable, &page_size) ||
(list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
t->type = DM_TYPE_DAX_BIO_BASED;
} else {
@@ -1045,14 +1051,10 @@
/* Non-request-stackable devices can't be used for request-based dm */
if (!tgt->type->iterate_devices ||
- !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) {
+ !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) {
DMERR("table load rejected: including non-request-stackable devices");
return -EINVAL;
}
- if (v.sq_count > 0) {
- DMERR("table load rejected: not all devices are blk-mq request-stackable");
- return -EINVAL;
- }
return 0;
}
@@ -1327,12 +1329,6 @@
void dm_table_event(struct dm_table *t)
{
- /*
- * You can no longer call dm_table_event() from interrupt
- * context, use a bottom half instead.
- */
- BUG_ON(in_interrupt());
-
mutex_lock(&_event_lock);
if (t->event_fn)
t->event_fn(t->event_context);
@@ -1380,6 +1376,46 @@
return &t->targets[(KEYS_PER_NODE * n) + k];
}
+/*
+ * type->iterate_devices() should be called when the sanity check needs to
+ * iterate and check all underlying data devices. iterate_devices() will
+ * iterate all underlying data devices until it encounters a non-zero return
+ * code, returned by whether the input iterate_devices_callout_fn, or
+ * iterate_devices() itself internally.
+ *
+ * For some target type (e.g. dm-stripe), one call of iterate_devices() may
+ * iterate multiple underlying devices internally, in which case a non-zero
+ * return code returned by iterate_devices_callout_fn will stop the iteration
+ * in advance.
+ *
+ * Cases requiring _any_ underlying device supporting some kind of attribute,
+ * should use the iteration structure like dm_table_any_dev_attr(), or call
+ * it directly. @func should handle semantics of positive examples, e.g.
+ * capable of something.
+ *
+ * Cases requiring _all_ underlying devices supporting some kind of attribute,
+ * should use the iteration structure like dm_table_supports_nowait() or
+ * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that
+ * uses an @anti_func that handle semantics of counter examples, e.g. not
+ * capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data);
+ */
+static bool dm_table_any_dev_attr(struct dm_table *t,
+ iterate_devices_callout_fn func, void *data)
+{
+ struct dm_target *ti;
+ unsigned int i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, func, data))
+ return true;
+ }
+
+ return false;
+}
+
static int count_device(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1416,13 +1452,13 @@
return true;
}
-static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
enum blk_zoned_model *zoned_model = data;
- return q && blk_queue_zoned_model(q) == *zoned_model;
+ return !q || blk_queue_zoned_model(q) != *zoned_model;
}
static bool dm_table_supports_zoned_model(struct dm_table *t,
@@ -1439,37 +1475,20 @@
return false;
if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, device_is_zoned_model, &zoned_model))
+ ti->type->iterate_devices(ti, device_not_zoned_model, &zoned_model))
return false;
}
return true;
}
-static int device_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
unsigned int *zone_sectors = data;
- return q && blk_queue_zone_sectors(q) == *zone_sectors;
-}
-
-static bool dm_table_matches_zone_sectors(struct dm_table *t,
- unsigned int zone_sectors)
-{
- struct dm_target *ti;
- unsigned i;
-
- for (i = 0; i < dm_table_get_num_targets(t); i++) {
- ti = dm_table_get_target(t, i);
-
- if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, device_matches_zone_sectors, &zone_sectors))
- return false;
- }
-
- return true;
+ return !q || blk_queue_zone_sectors(q) != *zone_sectors;
}
static int validate_hardware_zoned_model(struct dm_table *table,
@@ -1489,7 +1508,7 @@
if (!zone_sectors || !is_power_of_2(zone_sectors))
return -EINVAL;
- if (!dm_table_matches_zone_sectors(table, zone_sectors)) {
+ if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) {
DMERR("%s: zone sectors is not consistent across all devices",
dm_device_name(table->md));
return -EINVAL;
@@ -1679,29 +1698,12 @@
return false;
}
-static int dm_table_supports_dax_write_cache(struct dm_table *t)
-{
- struct dm_target *ti;
- unsigned i;
-
- for (i = 0; i < dm_table_get_num_targets(t); i++) {
- ti = dm_table_get_target(t, i);
-
- if (ti->type->iterate_devices &&
- ti->type->iterate_devices(ti,
- device_dax_write_cache_enabled, NULL))
- return true;
- }
-
- return false;
-}
-
-static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
- return q && blk_queue_nonrot(q);
+ return q && !blk_queue_nonrot(q);
}
static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
@@ -1712,35 +1714,18 @@
return q && !blk_queue_add_random(q);
}
-static bool dm_table_all_devices_attribute(struct dm_table *t,
- iterate_devices_callout_fn func)
-{
- struct dm_target *ti;
- unsigned i;
-
- for (i = 0; i < dm_table_get_num_targets(t); i++) {
- ti = dm_table_get_target(t, i);
-
- if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, func, NULL))
- return false;
- }
-
- return true;
-}
-
-static int device_no_partial_completion(struct dm_target *ti, struct dm_dev *dev,
+static int device_is_partial_completion(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
char b[BDEVNAME_SIZE];
/* For now, NVMe devices are the only devices of this class */
- return (strncmp(bdevname(dev->bdev, b), "nvme", 4) == 0);
+ return (strncmp(bdevname(dev->bdev, b), "nvme", 4) != 0);
}
static bool dm_table_does_not_support_partial_completion(struct dm_table *t)
{
- return dm_table_all_devices_attribute(t, device_no_partial_completion);
+ return !dm_table_any_dev_attr(t, device_is_partial_completion, NULL);
}
static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
@@ -1867,27 +1852,6 @@
return q && bdi_cap_stable_pages_required(q->backing_dev_info);
}
-/*
- * If any underlying device requires stable pages, a table must require
- * them as well. Only targets that support iterate_devices are considered:
- * don't want error, zero, etc to require stable pages.
- */
-static bool dm_table_requires_stable_pages(struct dm_table *t)
-{
- struct dm_target *ti;
- unsigned i;
-
- for (i = 0; i < dm_table_get_num_targets(t); i++) {
- ti = dm_table_get_target(t, i);
-
- if (ti->type->iterate_devices &&
- ti->type->iterate_devices(ti, device_requires_stable_pages, NULL))
- return true;
- }
-
- return false;
-}
-
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
@@ -1920,22 +1884,22 @@
}
blk_queue_write_cache(q, wc, fua);
- if (dm_table_supports_dax(t, device_supports_dax, &page_size)) {
+ if (dm_table_supports_dax(t, device_not_dax_capable, &page_size)) {
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
- if (dm_table_supports_dax(t, device_dax_synchronous, NULL))
+ if (dm_table_supports_dax(t, device_not_dax_synchronous_capable, NULL))
set_dax_synchronous(t->md->dax_dev);
}
else
blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
- if (dm_table_supports_dax_write_cache(t))
+ if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
dax_write_cache(t->md->dax_dev, true);
/* Ensure that all underlying devices are non-rotational. */
- if (dm_table_all_devices_attribute(t, device_is_nonrot))
- blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
- else
+ if (dm_table_any_dev_attr(t, device_is_rotational, NULL))
blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
+ else
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
if (!dm_table_supports_write_same(t))
q->limits.max_write_same_sectors = 0;
@@ -1947,8 +1911,11 @@
/*
* Some devices don't use blk_integrity but still want stable pages
* because they do their own checksumming.
+ * If any underlying device requires stable pages, a table must require
+ * them as well. Only targets that support iterate_devices are considered:
+ * don't want error, zero, etc to require stable pages.
*/
- if (dm_table_requires_stable_pages(t))
+ if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL))
q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
else
q->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
@@ -1959,7 +1926,8 @@
* Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
* have it set.
*/
- if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
+ if (blk_queue_add_random(q) &&
+ dm_table_any_dev_attr(t, device_is_not_random, NULL))
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
/*
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 4c68a7b..a5ed59e 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -189,6 +189,15 @@
sector_t data_block_size;
/*
+ * Pre-commit callback.
+ *
+ * This allows the thin provisioning target to run a callback before
+ * the metadata are committed.
+ */
+ dm_pool_pre_commit_fn pre_commit_fn;
+ void *pre_commit_context;
+
+ /*
* We reserve a section of the metadata for commit overhead.
* All reported space does *not* include this.
*/
@@ -378,16 +387,15 @@
* Variant that is used for in-core only changes or code that
* shouldn't put the pool in service on its own (e.g. commit).
*/
-static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
+static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
__acquires(pmd->root_lock)
{
down_write(&pmd->root_lock);
}
-#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
- __pmd_write_lock(pmd);
+ pmd_write_lock_in_core(pmd);
if (unlikely(!pmd->in_service))
pmd->in_service = true;
}
@@ -731,12 +739,16 @@
THIN_MAX_CONCURRENT_LOCKS);
if (IS_ERR(pmd->bm)) {
DMERR("could not create block manager");
- return PTR_ERR(pmd->bm);
+ r = PTR_ERR(pmd->bm);
+ pmd->bm = NULL;
+ return r;
}
r = __open_or_format_metadata(pmd, format_device);
- if (r)
+ if (r) {
dm_block_manager_destroy(pmd->bm);
+ pmd->bm = NULL;
+ }
return r;
}
@@ -822,10 +834,19 @@
* We need to know if the thin_disk_superblock exceeds a 512-byte sector.
*/
BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
+ BUG_ON(!rwsem_is_locked(&pmd->root_lock));
if (unlikely(!pmd->in_service))
return 0;
+ if (pmd->pre_commit_fn) {
+ r = pmd->pre_commit_fn(pmd->pre_commit_context);
+ if (r < 0) {
+ DMERR("pre-commit callback failed");
+ return r;
+ }
+ }
+
r = __write_changed_details(pmd);
if (r < 0)
return r;
@@ -892,6 +913,8 @@
pmd->in_service = false;
pmd->bdev = bdev;
pmd->data_block_size = data_block_size;
+ pmd->pre_commit_fn = NULL;
+ pmd->pre_commit_context = NULL;
r = __create_persistent_data_objects(pmd, format_device);
if (r) {
@@ -934,12 +957,14 @@
return -EBUSY;
}
- if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
+ pmd_write_lock_in_core(pmd);
+ if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
r = __commit_transaction(pmd);
if (r < 0)
DMWARN("%s: __commit_transaction() failed, error = %d",
__func__, r);
}
+ pmd_write_unlock(pmd);
if (!pmd->fail_io)
__destroy_persistent_data_objects(pmd);
@@ -1822,7 +1847,7 @@
* Care is taken to not have commit be what
* triggers putting the thin-pool in-service.
*/
- __pmd_write_lock(pmd);
+ pmd_write_lock_in_core(pmd);
if (pmd->fail_io)
goto out;
@@ -2044,6 +2069,16 @@
return r;
}
+void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
+ dm_pool_pre_commit_fn fn,
+ void *context)
+{
+ pmd_write_lock_in_core(pmd);
+ pmd->pre_commit_fn = fn;
+ pmd->pre_commit_context = context;
+ pmd_write_unlock(pmd);
+}
+
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index f6be0d7..7ef56bd 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -230,6 +230,13 @@
*/
void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
+/* Pre-commit callback */
+typedef int (*dm_pool_pre_commit_fn)(void *context);
+
+void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
+ dm_pool_pre_commit_fn fn,
+ void *context);
+
/*----------------------------------------------------------------*/
#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index fcd8877..1b2c98b 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -231,6 +231,7 @@
struct dm_target *ti; /* Only set if a pool target is bound */
struct mapped_device *pool_md;
+ struct block_device *data_dev;
struct block_device *md_dev;
struct dm_pool_metadata *pmd;
@@ -328,6 +329,7 @@
dm_block_t low_water_blocks;
struct pool_features requested_pf; /* Features requested during table load */
struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
+ struct bio flush_bio;
};
/*
@@ -2392,8 +2394,16 @@
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
- while ((bio = bio_list_pop(&bios)))
- generic_make_request(bio);
+ while ((bio = bio_list_pop(&bios))) {
+ /*
+ * The data device was flushed as part of metadata commit,
+ * so complete redundant flushes immediately.
+ */
+ if (bio->bi_opf & REQ_PREFLUSH)
+ bio_endio(bio);
+ else
+ generic_make_request(bio);
+ }
}
static void do_worker(struct work_struct *ws)
@@ -2936,6 +2946,7 @@
static struct pool *pool_create(struct mapped_device *pool_md,
struct block_device *metadata_dev,
+ struct block_device *data_dev,
unsigned long block_size,
int read_only, char **error)
{
@@ -3043,6 +3054,7 @@
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
pool->md_dev = metadata_dev;
+ pool->data_dev = data_dev;
__pool_table_insert(pool);
return pool;
@@ -3084,6 +3096,7 @@
static struct pool *__pool_find(struct mapped_device *pool_md,
struct block_device *metadata_dev,
+ struct block_device *data_dev,
unsigned long block_size, int read_only,
char **error, int *created)
{
@@ -3094,19 +3107,23 @@
*error = "metadata device already in use by a pool";
return ERR_PTR(-EBUSY);
}
+ if (pool->data_dev != data_dev) {
+ *error = "data device already in use by a pool";
+ return ERR_PTR(-EBUSY);
+ }
__pool_inc(pool);
} else {
pool = __pool_table_lookup(pool_md);
if (pool) {
- if (pool->md_dev != metadata_dev) {
+ if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
*error = "different pool cannot replace a pool";
return ERR_PTR(-EINVAL);
}
__pool_inc(pool);
} else {
- pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
+ pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
*created = 1;
}
}
@@ -3127,6 +3144,7 @@
__pool_dec(pt->pool);
dm_put_device(ti, pt->metadata_dev);
dm_put_device(ti, pt->data_dev);
+ bio_uninit(&pt->flush_bio);
kfree(pt);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3192,6 +3210,29 @@
dm_table_event(pool->ti->table);
}
+/*
+ * We need to flush the data device **before** committing the metadata.
+ *
+ * This ensures that the data blocks of any newly inserted mappings are
+ * properly written to non-volatile storage and won't be lost in case of a
+ * crash.
+ *
+ * Failure to do so can result in data corruption in the case of internal or
+ * external snapshots and in the case of newly provisioned blocks, when block
+ * zeroing is enabled.
+ */
+static int metadata_pre_commit_callback(void *context)
+{
+ struct pool_c *pt = context;
+ struct bio *flush_bio = &pt->flush_bio;
+
+ bio_reset(flush_bio);
+ bio_set_dev(flush_bio, pt->data_dev->bdev);
+ flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+ return submit_bio_wait(flush_bio);
+}
+
static sector_t get_dev_size(struct block_device *bdev)
{
return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
@@ -3335,7 +3376,7 @@
goto out;
}
- pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
+ pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
if (IS_ERR(pool)) {
r = PTR_ERR(pool);
@@ -3360,6 +3401,7 @@
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
pt->adjusted_pf = pt->requested_pf = pf;
+ bio_init(&pt->flush_bio, NULL, 0);
ti->num_flush_bios = 1;
/*
@@ -3549,6 +3591,9 @@
if (r)
return r;
+ dm_pool_register_pre_commit_callback(pool->pmd,
+ metadata_pre_commit_callback, pt);
+
r = maybe_resize_data_dev(ti, &need_commit1);
if (r)
return r;
@@ -4077,7 +4122,7 @@
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 21, 0},
+ .version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4456,7 +4501,7 @@
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 21, 0},
+ .version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 3ceeb6b..cea2b37 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -61,19 +61,18 @@
static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index,
unsigned *offset, struct dm_buffer **buf)
{
- u64 position, block;
+ u64 position, block, rem;
u8 *res;
position = (index + rsb) * v->fec->roots;
- block = position >> v->data_dev_block_bits;
- *offset = (unsigned)(position - (block << v->data_dev_block_bits));
+ block = div64_u64_rem(position, v->fec->io_size, &rem);
+ *offset = (unsigned)rem;
- res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf);
+ res = dm_bufio_read(v->fec->bufio, block, buf);
if (IS_ERR(res)) {
DMERR("%s: FEC %llu: parity read failed (block %llu): %ld",
v->data_dev->name, (unsigned long long)rsb,
- (unsigned long long)(v->fec->start + block),
- PTR_ERR(res));
+ (unsigned long long)block, PTR_ERR(res));
*buf = NULL;
}
@@ -155,7 +154,7 @@
/* read the next block when we run out of parity bytes */
offset += v->fec->roots;
- if (offset >= 1 << v->data_dev_block_bits) {
+ if (offset >= v->fec->io_size) {
dm_bufio_release(buf);
par = fec_read_parity(v, rsb, block_offset, &offset, &buf);
@@ -435,7 +434,7 @@
fio->level++;
if (type == DM_VERITY_BLOCK_TYPE_METADATA)
- block += v->data_blocks;
+ block = block - v->hash_start + v->data_blocks;
/*
* For RS(M, N), the continuous FEC data is divided into blocks of N
@@ -551,6 +550,7 @@
mempool_exit(&f->rs_pool);
mempool_exit(&f->prealloc_pool);
mempool_exit(&f->extra_pool);
+ mempool_exit(&f->output_pool);
kmem_cache_destroy(f->cache);
if (f->data_bufio)
@@ -673,7 +673,7 @@
{
struct dm_verity_fec *f = v->fec;
struct dm_target *ti = v->ti;
- u64 hash_blocks;
+ u64 hash_blocks, fec_blocks;
int ret;
if (!verity_fec_is_enabled(v)) {
@@ -742,16 +742,23 @@
return -E2BIG;
}
+ if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1))
+ f->io_size = 1 << v->data_dev_block_bits;
+ else
+ f->io_size = v->fec->roots << SECTOR_SHIFT;
+
f->bufio = dm_bufio_client_create(f->dev->bdev,
- 1 << v->data_dev_block_bits,
+ f->io_size,
1, 0, NULL, NULL);
if (IS_ERR(f->bufio)) {
ti->error = "Cannot initialize FEC bufio client";
return PTR_ERR(f->bufio);
}
- if (dm_bufio_get_device_size(f->bufio) <
- ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) {
+ dm_bufio_set_sector_offset(f->bufio, f->start << (v->data_dev_block_bits - SECTOR_SHIFT));
+
+ fec_blocks = div64_u64(f->rounds * f->roots, v->fec->roots << SECTOR_SHIFT);
+ if (dm_bufio_get_device_size(f->bufio) < fec_blocks) {
ti->error = "FEC device is too small";
return -E2BIG;
}
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index 42fbd3a..3c46c8d 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -36,6 +36,7 @@
struct dm_dev *dev; /* parity data device */
struct dm_bufio_client *data_bufio; /* for data dev access */
struct dm_bufio_client *bufio; /* for parity data access */
+ size_t io_size; /* IO size for roots */
sector_t start; /* parity data start in blocks */
sector_t blocks; /* number of blocks covered */
sector_t rounds; /* number of interleaving rounds */
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 4fb33e7..711f101 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -33,7 +33,7 @@
#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
-#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC + \
+#define DM_VERITY_OPTS_MAX (3 + DM_VERITY_OPTS_FEC + \
DM_VERITY_ROOT_HASH_VERIFICATION_OPTS)
static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
@@ -534,6 +534,15 @@
}
/*
+ * Skip verity work in response to I/O error when system is shutting down.
+ */
+static inline bool verity_is_system_shutting_down(void)
+{
+ return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
+ || system_state == SYSTEM_RESTART;
+}
+
+/*
* End one "io" structure with a given error.
*/
static void verity_finish_io(struct dm_verity_io *io, blk_status_t status)
@@ -560,7 +569,8 @@
{
struct dm_verity_io *io = bio->bi_private;
- if (bio->bi_status && !verity_fec_is_enabled(io->v)) {
+ if (bio->bi_status &&
+ (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) {
verity_finish_io(io, bio->bi_status);
return;
}
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c
index 614e43d..919154a 100644
--- a/drivers/md/dm-verity-verify-sig.c
+++ b/drivers/md/dm-verity-verify-sig.c
@@ -15,7 +15,7 @@
#define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s
static bool require_signatures;
-module_param(require_signatures, bool, false);
+module_param(require_signatures, bool, 0444);
MODULE_PARM_DESC(require_signatures,
"Verify the roothash of dm-verity hash tree");
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index d06b8aa..ec10fda 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -142,6 +142,7 @@
size_t metadata_sectors;
size_t n_blocks;
uint64_t seq_count;
+ sector_t data_device_sectors;
void *block_start;
struct wc_entry *entries;
unsigned block_size;
@@ -153,6 +154,7 @@
bool overwrote_committed:1;
bool memory_vmapped:1;
+ bool start_sector_set:1;
bool high_wm_percent_set:1;
bool low_wm_percent_set:1;
bool max_writeback_jobs_set:1;
@@ -161,6 +163,10 @@
bool writeback_fua_set:1;
bool flush_on_suspend:1;
+ unsigned high_wm_percent_value;
+ unsigned low_wm_percent_value;
+ unsigned autocommit_time_value;
+
unsigned writeback_all;
struct workqueue_struct *writeback_wq;
struct work_struct writeback_work;
@@ -224,6 +230,7 @@
pfn_t pfn;
int id;
struct page **pages;
+ sector_t offset;
wc->memory_vmapped = false;
@@ -242,9 +249,16 @@
goto err1;
}
+ offset = get_start_sect(wc->ssd_dev->bdev);
+ if (offset & (PAGE_SIZE / 512 - 1)) {
+ r = -EINVAL;
+ goto err1;
+ }
+ offset >>= PAGE_SHIFT - 9;
+
id = dax_read_lock();
- da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
+ da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn);
if (da < 0) {
wc->memory_map = NULL;
r = da;
@@ -266,7 +280,7 @@
i = 0;
do {
long daa;
- daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
+ daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i,
NULL, &pfn);
if (daa <= 0) {
r = daa ? daa : -EINVAL;
@@ -279,6 +293,8 @@
while (daa-- && i < p) {
pages[i++] = pfn_t_to_page(pfn);
pfn.val++;
+ if (!(i & 15))
+ cond_resched();
}
} while (i < p);
wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
@@ -306,7 +322,7 @@
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
- BUG();
+ return -EOPNOTSUPP;
}
#endif
@@ -442,7 +458,13 @@
complete(&endio->c);
}
-static void ssd_commit_flushed(struct dm_writecache *wc)
+static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
+{
+ wait_event(wc->bio_in_progress_wait[direction],
+ !atomic_read(&wc->bio_in_progress[direction]));
+}
+
+static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
struct dm_io_region region;
struct dm_io_request req;
@@ -488,17 +510,20 @@
writecache_notify_io(0, &endio);
wait_for_completion_io(&endio.c);
+ if (wait_for_ios)
+ writecache_wait_for_ios(wc, WRITE);
+
writecache_disk_flush(wc, wc->ssd_dev);
memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
-static void writecache_commit_flushed(struct dm_writecache *wc)
+static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
if (WC_MODE_PMEM(wc))
wmb();
else
- ssd_commit_flushed(wc);
+ ssd_commit_flushed(wc, wait_for_ios);
}
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
@@ -522,12 +547,6 @@
writecache_error(wc, r, "error flushing metadata: %d", r);
}
-static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
-{
- wait_event(wc->bio_in_progress_wait[direction],
- !atomic_read(&wc->bio_in_progress[direction]));
-}
-
#define WFE_RETURN_FOLLOWING 1
#define WFE_LOWEST_SEQ 2
@@ -622,6 +641,12 @@
wc->freelist_size++;
}
+static inline void writecache_verify_watermark(struct dm_writecache *wc)
+{
+ if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
+ queue_work(wc->writeback_wq, &wc->writeback_work);
+}
+
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
{
struct wc_entry *e;
@@ -643,8 +668,8 @@
list_del(&e->lru);
}
wc->freelist_size--;
- if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
- queue_work(wc->writeback_wq, &wc->writeback_work);
+
+ writecache_verify_watermark(wc);
return e;
}
@@ -724,15 +749,12 @@
e = e2;
cond_resched();
}
- writecache_commit_flushed(wc);
-
- if (!WC_MODE_PMEM(wc))
- writecache_wait_for_ios(wc, WRITE);
+ writecache_commit_flushed(wc, true);
wc->seq_count++;
pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
wc->overwrote_committed = false;
@@ -756,7 +778,7 @@
}
if (need_flush_after_free)
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
}
static void writecache_flush_work(struct work_struct *work)
@@ -799,6 +821,8 @@
writecache_wait_for_ios(wc, WRITE);
discarded_something = true;
}
+ if (!writecache_entry_is_committed(wc, e))
+ wc->uncommitted_blocks--;
writecache_free_entry(wc, e);
}
@@ -809,7 +833,7 @@
}
if (discarded_something)
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
}
static bool writecache_wait_for_writeback(struct dm_writecache *wc)
@@ -838,7 +862,7 @@
}
wc_unlock(wc);
- flush_workqueue(wc->writeback_wq);
+ drain_workqueue(wc->writeback_wq);
wc_lock(wc);
if (flush_on_suspend)
@@ -866,11 +890,30 @@
struct wc_entry *e = &wc->entries[b];
e->index = b;
e->write_in_progress = false;
+ cond_resched();
}
return 0;
}
+static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
+{
+ struct dm_io_region region;
+ struct dm_io_request req;
+
+ region.bdev = wc->ssd_dev->bdev;
+ region.sector = wc->start_sector;
+ region.count = n_sectors;
+ req.bi_op = REQ_OP_READ;
+ req.bi_op_flags = REQ_SYNC;
+ req.mem.type = DM_IO_VMA;
+ req.mem.ptr.vma = (char *)wc->memory_map;
+ req.client = wc->dm_io;
+ req.notify.fn = NULL;
+
+ return dm_io(&req, 1, ®ion, NULL);
+}
+
static void writecache_resume(struct dm_target *ti)
{
struct dm_writecache *wc = ti->private;
@@ -881,8 +924,20 @@
wc_lock(wc);
- if (WC_MODE_PMEM(wc))
+ wc->data_device_sectors = i_size_read(wc->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+
+ if (WC_MODE_PMEM(wc)) {
persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
+ } else {
+ r = writecache_read_metadata(wc, wc->metadata_sectors);
+ if (r) {
+ size_t sb_entries_offset;
+ writecache_error(wc, r, "unable to read metadata: %d", r);
+ sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
+ memset((char *)wc->memory_map + sb_entries_offset, -1,
+ (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
+ }
+ }
wc->tree = RB_ROOT;
INIT_LIST_HEAD(&wc->lru);
@@ -920,6 +975,7 @@
e->original_sector = le64_to_cpu(wme.original_sector);
e->seq_count = le64_to_cpu(wme.seq_count);
}
+ cond_resched();
}
#endif
for (b = 0; b < wc->n_blocks; b++) {
@@ -958,9 +1014,11 @@
if (need_flush) {
writecache_flush_all_metadata(wc);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
}
+ writecache_verify_watermark(wc);
+
wc_unlock(wc);
}
@@ -1218,7 +1276,8 @@
}
} while (bio->bi_iter.bi_size);
- if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
+ if (unlikely(bio->bi_opf & REQ_FUA ||
+ wc->uncommitted_blocks >= wc->autocommit_blocks))
writecache_flush(wc);
else
writecache_schedule_autocommit(wc);
@@ -1341,7 +1400,7 @@
wc->writeback_size--;
n_walked++;
if (unlikely(n_walked >= ENDIO_LATENCY)) {
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
wc_unlock(wc);
wc_lock(wc);
n_walked = 0;
@@ -1422,7 +1481,7 @@
writecache_wait_for_ios(wc, READ);
}
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
wc_unlock(wc);
}
@@ -1437,6 +1496,10 @@
void *address = memory_data(wc, e);
persistent_memory_flush_cache(address, block_size);
+
+ if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
+ return true;
+
return bio_add_page(&wb->bio, persistent_memory_page(address),
block_size, persistent_memory_page_offset(address)) != 0;
}
@@ -1508,6 +1571,9 @@
if (writecache_has_error(wc)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
+ } else if (unlikely(!bio_sectors(bio))) {
+ bio->bi_status = BLK_STS_OK;
+ bio_endio(bio);
} else {
submit_bio(bio);
}
@@ -1551,6 +1617,14 @@
e = f;
}
+ if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
+ if (to.sector >= wc->data_device_sectors) {
+ writecache_copy_endio(0, 0, c);
+ continue;
+ }
+ from.count = to.count = wc->data_device_sectors - to.sector;
+ }
+
dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
__writeback_throttle(wc, wbl);
@@ -1761,14 +1835,16 @@
pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
- for (b = 0; b < wc->n_blocks; b++)
+ for (b = 0; b < wc->n_blocks; b++) {
write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
+ cond_resched();
+ }
writecache_flush_all_metadata(wc);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
return 0;
}
@@ -1836,7 +1912,7 @@
struct wc_memory_superblock s;
static struct dm_arg _args[] = {
- {0, 10, "Invalid number of feature args"},
+ {0, 16, "Invalid number of feature args"},
};
as.argc = argc;
@@ -1971,6 +2047,12 @@
ti->error = "Invalid block size";
goto bad;
}
+ if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
+ wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
+ r = -EINVAL;
+ ti->error = "Block size is smaller than device logical block size";
+ goto bad;
+ }
wc->block_size_bits = __ffs(wc->block_size);
wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
@@ -1992,6 +2074,7 @@
if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
goto invalid_optional;
wc->start_sector = start_sector;
+ wc->start_sector_set = true;
if (wc->start_sector != start_sector ||
wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
goto invalid_optional;
@@ -2001,6 +2084,7 @@
goto invalid_optional;
if (high_wm_percent < 0 || high_wm_percent > 100)
goto invalid_optional;
+ wc->high_wm_percent_value = high_wm_percent;
wc->high_wm_percent_set = true;
} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
string = dm_shift_arg(&as), opt_params--;
@@ -2008,6 +2092,7 @@
goto invalid_optional;
if (low_wm_percent < 0 || low_wm_percent > 100)
goto invalid_optional;
+ wc->low_wm_percent_value = low_wm_percent;
wc->low_wm_percent_set = true;
} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
string = dm_shift_arg(&as), opt_params--;
@@ -2027,6 +2112,7 @@
if (autocommit_msecs > 3600000)
goto invalid_optional;
wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
+ wc->autocommit_time_value = autocommit_msecs;
wc->autocommit_time_set = true;
} else if (!strcasecmp(string, "fua")) {
if (WC_MODE_PMEM(wc)) {
@@ -2053,14 +2139,18 @@
}
if (WC_MODE_PMEM(wc)) {
+ if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
+ r = -EOPNOTSUPP;
+ ti->error = "Asynchronous persistent memory not supported as pmem cache";
+ goto bad;
+ }
+
r = persistent_memory_claim(wc);
if (r) {
ti->error = "Unable to map persistent memory for cache";
goto bad;
}
} else {
- struct dm_io_region region;
- struct dm_io_request req;
size_t n_blocks, n_metadata_blocks;
uint64_t n_bitmap_bits;
@@ -2117,19 +2207,9 @@
goto bad;
}
- region.bdev = wc->ssd_dev->bdev;
- region.sector = wc->start_sector;
- region.count = wc->metadata_sectors;
- req.bi_op = REQ_OP_READ;
- req.bi_op_flags = REQ_SYNC;
- req.mem.type = DM_IO_VMA;
- req.mem.ptr.vma = (char *)wc->memory_map;
- req.client = wc->dm_io;
- req.notify.fn = NULL;
-
- r = dm_io(&req, 1, ®ion, NULL);
+ r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
if (r) {
- ti->error = "Unable to read metadata";
+ ti->error = "Unable to read first block of metadata";
goto bad;
}
}
@@ -2234,7 +2314,6 @@
struct dm_writecache *wc = ti->private;
unsigned extra_args;
unsigned sz = 0;
- uint64_t x;
switch (type) {
case STATUSTYPE_INFO:
@@ -2246,7 +2325,7 @@
DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
wc->dev->name, wc->ssd_dev->name, wc->block_size);
extra_args = 0;
- if (wc->start_sector)
+ if (wc->start_sector_set)
extra_args += 2;
if (wc->high_wm_percent_set)
extra_args += 2;
@@ -2262,26 +2341,18 @@
extra_args++;
DMEMIT("%u", extra_args);
- if (wc->start_sector)
+ if (wc->start_sector_set)
DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
- if (wc->high_wm_percent_set) {
- x = (uint64_t)wc->freelist_high_watermark * 100;
- x += wc->n_blocks / 2;
- do_div(x, (size_t)wc->n_blocks);
- DMEMIT(" high_watermark %u", 100 - (unsigned)x);
- }
- if (wc->low_wm_percent_set) {
- x = (uint64_t)wc->freelist_low_watermark * 100;
- x += wc->n_blocks / 2;
- do_div(x, (size_t)wc->n_blocks);
- DMEMIT(" low_watermark %u", 100 - (unsigned)x);
- }
+ if (wc->high_wm_percent_set)
+ DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
+ if (wc->low_wm_percent_set)
+ DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
if (wc->max_writeback_jobs_set)
DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
if (wc->autocommit_blocks_set)
DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
if (wc->autocommit_time_set)
- DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
+ DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
if (wc->writeback_fua_set)
DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
break;
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 595a731..e6b0039 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -134,6 +134,7 @@
sector_t zone_bitmap_size;
unsigned int zone_nr_bitmap_blocks;
+ unsigned int zone_bits_per_mblk;
unsigned int nr_bitmap_blocks;
unsigned int nr_map_blocks;
@@ -554,6 +555,7 @@
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
dmz_release_mblock(zmd, mblk);
+ dmz_check_bdev(zmd->dev);
return ERR_PTR(-EIO);
}
@@ -625,6 +627,8 @@
ret = submit_bio_wait(bio);
bio_put(bio);
+ if (ret)
+ dmz_check_bdev(zmd->dev);
return ret;
}
@@ -691,6 +695,7 @@
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
clear_bit(DMZ_META_ERROR, &mblk->state);
+ dmz_check_bdev(zmd->dev);
ret = -EIO;
}
nr_mblks_submitted--;
@@ -768,7 +773,7 @@
/* If there are no dirty metadata blocks, just flush the device cache */
if (list_empty(&write_list)) {
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
- goto out;
+ goto err;
}
/*
@@ -778,7 +783,7 @@
*/
ret = dmz_log_dirty_mblocks(zmd, &write_list);
if (ret)
- goto out;
+ goto err;
/*
* The log is on disk. It is now safe to update in place
@@ -786,11 +791,11 @@
*/
ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
if (ret)
- goto out;
+ goto err;
ret = dmz_write_sb(zmd, zmd->mblk_primary);
if (ret)
- goto out;
+ goto err;
while (!list_empty(&write_list)) {
mblk = list_first_entry(&write_list, struct dmz_mblock, link);
@@ -805,16 +810,20 @@
zmd->sb_gen++;
out:
- if (ret && !list_empty(&write_list)) {
- spin_lock(&zmd->mblk_lock);
- list_splice(&write_list, &zmd->mblk_dirty_list);
- spin_unlock(&zmd->mblk_lock);
- }
-
dmz_unlock_flush(zmd);
up_write(&zmd->mblk_sem);
return ret;
+
+err:
+ if (!list_empty(&write_list)) {
+ spin_lock(&zmd->mblk_lock);
+ list_splice(&write_list, &zmd->mblk_dirty_list);
+ spin_unlock(&zmd->mblk_lock);
+ }
+ if (!dmz_check_bdev(zmd->dev))
+ ret = -EIO;
+ goto out;
}
/*
@@ -1098,7 +1107,6 @@
if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
set_bit(DMZ_RND, &zone->flags);
- zmd->nr_rnd_zones++;
} else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ ||
blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
set_bit(DMZ_SEQ, &zone->flags);
@@ -1159,7 +1167,10 @@
/* Init */
zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
- zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
+ zmd->zone_nr_bitmap_blocks =
+ max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
+ zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks,
+ DMZ_BLOCK_SIZE_BITS);
/* Allocate zone array */
zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
@@ -1244,6 +1255,7 @@
if (ret) {
dmz_dev_err(zmd->dev, "Get zone %u report failed",
dmz_id(zmd, zone));
+ dmz_check_bdev(zmd->dev);
return ret;
}
@@ -1577,7 +1589,7 @@
return dzone;
}
- return ERR_PTR(-EBUSY);
+ return NULL;
}
/*
@@ -1597,7 +1609,7 @@
return zone;
}
- return ERR_PTR(-EBUSY);
+ return NULL;
}
/*
@@ -1982,7 +1994,7 @@
dmz_release_mblock(zmd, to_mblk);
dmz_release_mblock(zmd, from_mblk);
- chunk_block += DMZ_BLOCK_SIZE_BITS;
+ chunk_block += zmd->zone_bits_per_mblk;
}
to_zone->weight = from_zone->weight;
@@ -2043,7 +2055,7 @@
/* Set bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
if (count) {
@@ -2122,7 +2134,7 @@
/* Clear bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_clear_bits((unsigned long *)mblk->data,
bit, nr_bits);
@@ -2182,6 +2194,7 @@
{
struct dmz_mblock *mblk;
unsigned int bit, set_bit, nr_bits;
+ unsigned int zone_bits = zmd->zone_bits_per_mblk;
unsigned long *bitmap;
int n = 0;
@@ -2196,15 +2209,15 @@
/* Get offset */
bitmap = (unsigned long *) mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zone_bits - bit);
if (set)
- set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+ set_bit = find_next_bit(bitmap, zone_bits, bit);
else
- set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+ set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
dmz_release_mblock(zmd, mblk);
n += set_bit - bit;
- if (set_bit < DMZ_BLOCK_SIZE_BITS)
+ if (set_bit < zone_bits)
break;
nr_blocks -= nr_bits;
@@ -2307,7 +2320,7 @@
/* Count bits in this block */
bitmap = mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
n += dmz_count_bits(bitmap, bit, nr_bits);
dmz_release_mblock(zmd, mblk);
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index d240d7c..d508173 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -82,6 +82,7 @@
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
dmz_id(zmd, zone), (unsigned long long)wp_block,
(unsigned long long)block, nr_blocks, ret);
+ dmz_check_bdev(zrc->dev);
return ret;
}
@@ -348,8 +349,8 @@
/* Get a data zone */
dzone = dmz_get_zone_for_reclaim(zmd);
- if (IS_ERR(dzone))
- return PTR_ERR(dzone);
+ if (!dzone)
+ return -EBUSY;
start = jiffies;
@@ -489,12 +490,7 @@
ret = dmz_do_reclaim(zrc);
if (ret) {
dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
- if (ret == -EIO)
- /*
- * LLD might be performing some error handling sequence
- * at the underlying device. To not interfere, do not
- * attempt to schedule the next reclaim run immediately.
- */
+ if (!dmz_check_bdev(zrc->dev))
return;
}
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index d3bcc41..6e4f3ef 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -80,6 +80,8 @@
if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
bio->bi_status = status;
+ if (bio->bi_status != BLK_STS_OK)
+ bioctx->target->dev->flags |= DMZ_CHECK_BDEV;
if (refcount_dec_and_test(&bioctx->ref)) {
struct dm_zone *zone = bioctx->zone;
@@ -531,8 +533,9 @@
/* Get the BIO chunk work. If one is not active yet, create one */
cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
- if (!cw) {
-
+ if (cw) {
+ dmz_get_chunk_work(cw);
+ } else {
/* Create a new chunk work */
cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
if (unlikely(!cw)) {
@@ -541,7 +544,7 @@
}
INIT_WORK(&cw->work, dmz_chunk_work);
- refcount_set(&cw->refcount, 0);
+ refcount_set(&cw->refcount, 1);
cw->target = dmz;
cw->chunk = chunk;
bio_list_init(&cw->bio_list);
@@ -554,7 +557,6 @@
}
bio_list_add(&cw->bio_list, bio);
- dmz_get_chunk_work(cw);
dmz_reclaim_bio_acc(dmz->reclaim);
if (queue_work(dmz->chunk_wq, &cw->work))
@@ -565,32 +567,52 @@
}
/*
- * Check the backing device availability. If it's on the way out,
+ * Check if the backing device is being removed. If it's on the way out,
* start failing I/O. Reclaim and metadata components also call this
* function to cleanly abort operation in the event of such failure.
*/
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
- struct gendisk *disk;
+ if (dmz_dev->flags & DMZ_BDEV_DYING)
+ return true;
- if (!(dmz_dev->flags & DMZ_BDEV_DYING)) {
- disk = dmz_dev->bdev->bd_disk;
- if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
- dmz_dev_warn(dmz_dev, "Backing device queue dying");
- dmz_dev->flags |= DMZ_BDEV_DYING;
- } else if (disk->fops->check_events) {
- if (disk->fops->check_events(disk, 0) &
- DISK_EVENT_MEDIA_CHANGE) {
- dmz_dev_warn(dmz_dev, "Backing device offline");
- dmz_dev->flags |= DMZ_BDEV_DYING;
- }
- }
+ if (dmz_dev->flags & DMZ_CHECK_BDEV)
+ return !dmz_check_bdev(dmz_dev);
+
+ if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
+ dmz_dev_warn(dmz_dev, "Backing device queue dying");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
}
return dmz_dev->flags & DMZ_BDEV_DYING;
}
/*
+ * Check the backing device availability. This detects such events as
+ * backing device going offline due to errors, media removals, etc.
+ * This check is less efficient than dmz_bdev_is_dying() and should
+ * only be performed as a part of error handling.
+ */
+bool dmz_check_bdev(struct dmz_dev *dmz_dev)
+{
+ struct gendisk *disk;
+
+ dmz_dev->flags &= ~DMZ_CHECK_BDEV;
+
+ if (dmz_bdev_is_dying(dmz_dev))
+ return false;
+
+ disk = dmz_dev->bdev->bd_disk;
+ if (disk->fops->check_events &&
+ disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
+ dmz_dev_warn(dmz_dev, "Backing device offline");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ }
+
+ return !(dmz_dev->flags & DMZ_BDEV_DYING);
+}
+
+/*
* Process a new BIO.
*/
static int dmz_map(struct dm_target *ti, struct bio *bio)
@@ -768,7 +790,7 @@
}
/* Set target (no write same support) */
- ti->max_io_len = dev->zone_nr_sectors << 9;
+ ti->max_io_len = dev->zone_nr_sectors;
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_zeroes_bios = 1;
@@ -902,8 +924,8 @@
{
struct dmz_target *dmz = ti->private;
- if (dmz_bdev_is_dying(dmz->dev))
- return -ENODEV;
+ if (!dmz_check_bdev(dmz->dev))
+ return -EIO;
*bdev = dmz->dev->bdev;
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index d8e70b0..5b5e493 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -72,6 +72,7 @@
/* Device flags. */
#define DMZ_BDEV_DYING (1 << 0)
+#define DMZ_CHECK_BDEV (2 << 0)
/*
* Zone descriptor.
@@ -255,5 +256,6 @@
* Functions defined in dm-zoned-target.c
*/
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
+bool dmz_check_bdev(struct dmz_dev *dmz_dev);
#endif /* DM_ZONED_H */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1a5e328..530c0fe 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
@@ -140,10 +141,21 @@
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7
+#define DMF_POST_SUSPENDING 8
#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;
+#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE)
+static int swap_bios = DEFAULT_SWAP_BIOS;
+static int get_swap_bios(void)
+{
+ int latch = READ_ONCE(swap_bios);
+ if (unlikely(latch <= 0))
+ latch = DEFAULT_SWAP_BIOS;
+ return latch;
+}
+
/*
* For mempools pre-allocation at the table loading time.
*/
@@ -453,8 +465,10 @@
return -EAGAIN;
map = dm_get_live_table(md, &srcu_idx);
- if (!map)
- return -EIO;
+ if (!map) {
+ ret = -EIO;
+ goto out;
+ }
tgt = dm_table_find_target(map, sector);
if (!tgt) {
@@ -491,7 +505,6 @@
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
struct block_device **bdev)
- __acquires(md->io_barrier)
{
struct dm_target *tgt;
struct dm_table *map;
@@ -525,7 +538,6 @@
}
static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
- __releases(md->io_barrier)
{
dm_put_live_table(md, srcu_idx);
}
@@ -546,7 +558,7 @@
* subset of the parent bdev; require extra privileges.
*/
if (!capable(CAP_SYS_RAWIO)) {
- DMWARN_LIMIT(
+ DMDEBUG_LIMIT(
"%s: sending ioctl %x to DM device without required privilege.",
current->comm, cmd);
r = -ENOIOCTLCMD;
@@ -970,6 +982,11 @@
limits->max_write_zeroes_sectors = 0;
}
+static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
+{
+ return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
+}
+
static void clone_endio(struct bio *bio)
{
blk_status_t error = bio->bi_status;
@@ -1007,6 +1024,11 @@
}
}
+ if (unlikely(swap_bios_limit(tio->ti, bio))) {
+ struct mapped_device *md = io->md;
+ up(&md->swap_bios_semaphore);
+ }
+
free_tio(tio);
dec_pending(io, error);
}
@@ -1110,15 +1132,16 @@
{
struct mapped_device *md = dax_get_private(dax_dev);
struct dm_table *map;
+ bool ret = false;
int srcu_idx;
- bool ret;
map = dm_get_live_table(md, &srcu_idx);
if (!map)
- return false;
+ goto out;
- ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
+ ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);
+out:
dm_put_live_table(md, srcu_idx);
return ret;
@@ -1260,6 +1283,22 @@
}
EXPORT_SYMBOL_GPL(dm_remap_zone_report);
+static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
+{
+ mutex_lock(&md->swap_bios_lock);
+ while (latch < md->swap_bios) {
+ cond_resched();
+ down(&md->swap_bios_semaphore);
+ md->swap_bios--;
+ }
+ while (latch > md->swap_bios) {
+ cond_resched();
+ up(&md->swap_bios_semaphore);
+ md->swap_bios++;
+ }
+ mutex_unlock(&md->swap_bios_lock);
+}
+
static blk_qc_t __map_bio(struct dm_target_io *tio)
{
int r;
@@ -1280,6 +1319,14 @@
atomic_inc(&io->io_count);
sector = clone->bi_iter.bi_sector;
+ if (unlikely(swap_bios_limit(ti, clone))) {
+ struct mapped_device *md = io->md;
+ int latch = get_swap_bios();
+ if (unlikely(latch != md->swap_bios))
+ __set_swap_bios_limit(md, latch);
+ down(&md->swap_bios_semaphore);
+ }
+
r = ti->type->map(ti, clone);
switch (r) {
case DM_MAPIO_SUBMITTED:
@@ -1294,10 +1341,18 @@
ret = generic_make_request(clone);
break;
case DM_MAPIO_KILL:
+ if (unlikely(swap_bios_limit(ti, clone))) {
+ struct mapped_device *md = io->md;
+ up(&md->swap_bios_semaphore);
+ }
free_tio(tio);
dec_pending(io, BLK_STS_IOERR);
break;
case DM_MAPIO_REQUEUE:
+ if (unlikely(swap_bios_limit(ti, clone))) {
+ struct mapped_device *md = io->md;
+ up(&md->swap_bios_semaphore);
+ }
free_tio(tio);
dec_pending(io, BLK_STS_DM_REQUEUE);
break;
@@ -1436,9 +1491,6 @@
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
-
- bio_disassociate_blkg(ci->bio);
-
return 0;
}
@@ -1626,6 +1678,7 @@
ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
+ bio_uninit(ci.bio);
/* dec_pending submits any data associated with flush */
} else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
ci.bio = bio;
@@ -1700,6 +1753,7 @@
ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
+ bio_uninit(ci.bio);
/* dec_pending submits any data associated with flush */
} else {
struct dm_target_io *tio;
@@ -1718,23 +1772,6 @@
return ret;
}
-static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struct bio **bio)
-{
- unsigned len, sector_count;
-
- sector_count = bio_sectors(*bio);
- len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti), sector_count);
-
- if (sector_count > len) {
- struct bio *split = bio_split(*bio, len, GFP_NOIO, &md->queue->bio_split);
-
- bio_chain(split, *bio);
- trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector);
- generic_make_request(*bio);
- *bio = split;
- }
-}
-
static blk_qc_t dm_process_bio(struct mapped_device *md,
struct dm_table *map, struct bio *bio)
{
@@ -1760,15 +1797,14 @@
* won't be imposed.
*/
if (current->bio_list) {
- blk_queue_split(md->queue, &bio);
- if (!is_abnormal_io(bio))
- dm_queue_split(md, ti, &bio);
+ if (is_abnormal_io(bio))
+ blk_queue_split(md->queue, &bio);
+ /* regular IO is split by __split_and_process_bio */
}
if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
return __process_bio(md, map, bio, ti);
- else
- return __split_and_process_bio(md, map, bio);
+ return __split_and_process_bio(md, map, bio);
}
static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
@@ -1809,7 +1845,8 @@
* With request-based DM we only need to check the
* top-level queue for congestion.
*/
- r = md->queue->backing_dev_info->wb.state & bdi_bits;
+ struct backing_dev_info *bdi = md->queue->backing_dev_info;
+ r = bdi->wb.congested->state & bdi_bits;
} else {
map = dm_get_live_table_fast(md);
if (map)
@@ -1875,14 +1912,6 @@
static void dm_wq_work(struct work_struct *work);
-static void dm_init_normal_md_queue(struct mapped_device *md)
-{
- /*
- * Initialize aspects of queue that aren't relevant for blk-mq
- */
- md->queue->backing_dev_info->congested_fn = dm_any_congested;
-}
-
static void cleanup_mapped_device(struct mapped_device *md)
{
if (md->wq)
@@ -1917,6 +1946,7 @@
mutex_destroy(&md->suspend_lock);
mutex_destroy(&md->type_lock);
mutex_destroy(&md->table_devices_lock);
+ mutex_destroy(&md->swap_bios_lock);
dm_mq_cleanup_mapped_device(md);
}
@@ -1970,7 +2000,12 @@
if (!md->queue)
goto bad;
md->queue->queuedata = md;
- md->queue->backing_dev_info->congested_data = md;
+ /*
+ * default to bio-based required ->make_request_fn until DM
+ * table is loaded and md->type established. If request-based
+ * table is loaded: blk-mq will override accordingly.
+ */
+ blk_queue_make_request(md->queue, dm_make_request);
md->disk = alloc_disk_node(1, md->numa_node_id);
if (!md->disk)
@@ -1981,6 +2016,10 @@
init_waitqueue_head(&md->eventq);
init_completion(&md->kobj_holder.completion);
+ md->swap_bios = get_swap_bios();
+ sema_init(&md->swap_bios_semaphore, md->swap_bios);
+ mutex_init(&md->swap_bios_lock);
+
md->disk->major = _major;
md->disk->first_minor = minor;
md->disk->fops = &dm_blk_dops;
@@ -2264,6 +2303,12 @@
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);
+static void dm_init_congested_fn(struct mapped_device *md)
+{
+ md->queue->backing_dev_info->congested_data = md;
+ md->queue->backing_dev_info->congested_fn = dm_any_congested;
+}
+
/*
* Setup the DM device's queue based on md's type
*/
@@ -2280,12 +2325,12 @@
DMERR("Cannot initialize queue for request-based dm-mq mapped device");
return r;
}
+ dm_init_congested_fn(md);
break;
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
case DM_TYPE_NVME_BIO_BASED:
- dm_init_normal_md_queue(md);
- blk_queue_make_request(md->queue, dm_make_request);
+ dm_init_congested_fn(md);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
@@ -2384,6 +2429,8 @@
map = dm_get_live_table(md, &srcu_idx);
if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
+ set_bit(DMF_SUSPENDED, &md->flags);
+ set_bit(DMF_POST_SUSPENDING, &md->flags);
dm_table_postsuspend_targets(map);
}
/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
@@ -2706,7 +2753,9 @@
if (r)
goto out_unlock;
+ set_bit(DMF_POST_SUSPENDING, &md->flags);
dm_table_postsuspend_targets(map);
+ clear_bit(DMF_POST_SUSPENDING, &md->flags);
out_unlock:
mutex_unlock(&md->suspend_lock);
@@ -2803,7 +2852,9 @@
(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
DMF_SUSPENDED_INTERNALLY);
+ set_bit(DMF_POST_SUSPENDING, &md->flags);
dm_table_postsuspend_targets(map);
+ clear_bit(DMF_POST_SUSPENDING, &md->flags);
}
static void __dm_internal_resume(struct mapped_device *md)
@@ -2880,17 +2931,25 @@
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
unsigned cookie)
{
+ int r;
+ unsigned noio_flag;
char udev_cookie[DM_COOKIE_LENGTH];
char *envp[] = { udev_cookie, NULL };
+ noio_flag = memalloc_noio_save();
+
if (!cookie)
- return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
+ r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
else {
snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
DM_COOKIE_ENV_VAR_NAME, cookie);
- return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
- action, envp);
+ r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
+ action, envp);
}
+
+ memalloc_noio_restore(noio_flag);
+
+ return r;
}
uint32_t dm_next_uevent_seq(struct mapped_device *md)
@@ -2956,6 +3015,11 @@
return test_bit(DMF_SUSPENDED, &md->flags);
}
+static int dm_post_suspending_md(struct mapped_device *md)
+{
+ return test_bit(DMF_POST_SUSPENDING, &md->flags);
+}
+
int dm_suspended_internally_md(struct mapped_device *md)
{
return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
@@ -2972,6 +3036,12 @@
}
EXPORT_SYMBOL_GPL(dm_suspended);
+int dm_post_suspending(struct dm_target *ti)
+{
+ return dm_post_suspending_md(dm_table_get_md(ti->table));
+}
+EXPORT_SYMBOL_GPL(dm_post_suspending);
+
int dm_noflush_suspending(struct dm_target *ti)
{
return __noflush_suspending(dm_table_get_md(ti->table));
@@ -3232,6 +3302,9 @@
module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
+module_param(swap_bios, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
+
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index d7c4f66..9fbf87e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -74,7 +74,7 @@
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn,
int *blocksize);
-int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data);
void dm_lock_md_type(struct mapped_device *md);
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index b092c7b..d7eef52 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1372,7 +1372,7 @@
if (bitmap->bp[page].hijacked ||
bitmap->bp[page].map == NULL)
csize = ((sector_t)1) << (bitmap->chunkshift +
- PAGE_COUNTER_SHIFT - 1);
+ PAGE_COUNTER_SHIFT);
else
csize = ((sector_t)1) << bitmap->chunkshift;
*blocks = csize - (offset & (csize - 1));
@@ -1726,6 +1726,8 @@
md_bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
md_bitmap_daemon_work(mddev);
+ if (mddev->bitmap_info.external)
+ md_super_wait(mddev);
md_bitmap_update_sb(bitmap);
}
@@ -1954,6 +1956,7 @@
}
EXPORT_SYMBOL_GPL(md_bitmap_load);
+/* caller need to free returned bitmap with md_bitmap_free() */
struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
{
int rv = 0;
@@ -2017,6 +2020,7 @@
md_bitmap_unplug(mddev->bitmap);
*low = lo;
*high = hi;
+ md_bitmap_free(bitmap);
return rv;
}
@@ -2139,6 +2143,7 @@
memcpy(page_address(store.sb_page),
page_address(bitmap->storage.sb_page),
sizeof(bitmap_super_t));
+ spin_lock_irq(&bitmap->counts.lock);
md_bitmap_file_unmap(&bitmap->storage);
bitmap->storage = store;
@@ -2154,7 +2159,6 @@
blocks = min(old_counts.chunks << old_counts.chunkshift,
chunks << chunkshift);
- spin_lock_irq(&bitmap->counts.lock);
/* For cluster raid, need to pre-allocate bitmap */
if (mddev_is_clustered(bitmap->mddev)) {
unsigned long page;
@@ -2620,4 +2624,3 @@
.name = "bitmap",
.attrs = md_bitmap_attrs,
};
-
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 813a99f..794e1d5 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -664,9 +664,27 @@
* Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway.
*/
-static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
+static int lock_token(struct md_cluster_info *cinfo)
{
- int error, set_bit = 0;
+ int error;
+
+ error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+ if (error) {
+ pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
+ __func__, __LINE__, error);
+ } else {
+ /* Lock the receive sequence */
+ mutex_lock(&cinfo->recv_mutex);
+ }
+ return error;
+}
+
+/* lock_comm()
+ * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
+ */
+static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
+{
+ int rv, set_bit = 0;
struct mddev *mddev = cinfo->mddev;
/*
@@ -677,34 +695,19 @@
*/
if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
&cinfo->state)) {
- error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
&cinfo->state);
- WARN_ON_ONCE(error);
+ WARN_ON_ONCE(rv);
md_wakeup_thread(mddev->thread);
set_bit = 1;
}
- error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
- if (set_bit)
- clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
- if (error)
- pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
- __func__, __LINE__, error);
-
- /* Lock the receive sequence */
- mutex_lock(&cinfo->recv_mutex);
- return error;
-}
-
-/* lock_comm()
- * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
- */
-static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
-{
wait_event(cinfo->wait,
!test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
-
- return lock_token(cinfo, mddev_locked);
+ rv = lock_token(cinfo);
+ if (set_bit)
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+ return rv;
}
static void unlock_comm(struct md_cluster_info *cinfo)
@@ -784,9 +787,11 @@
{
int ret;
- lock_comm(cinfo, mddev_locked);
- ret = __sendmsg(cinfo, cmsg);
- unlock_comm(cinfo);
+ ret = lock_comm(cinfo, mddev_locked);
+ if (!ret) {
+ ret = __sendmsg(cinfo, cmsg);
+ unlock_comm(cinfo);
+ }
return ret;
}
@@ -1061,7 +1066,7 @@
return 0;
}
- ret = lock_token(cinfo, 1);
+ ret = lock_token(cinfo);
clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
return ret;
}
@@ -1139,6 +1144,7 @@
bitmap = get_bitmap_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
+ bitmap = NULL;
goto out;
}
counts = &bitmap->counts;
@@ -1165,6 +1171,7 @@
* can't resize bitmap
*/
goto out;
+ md_bitmap_free(bitmap);
}
return 0;
@@ -1253,7 +1260,10 @@
int raid_slot = -1;
md_update_sb(mddev, 1);
- lock_comm(cinfo, 1);
+ if (lock_comm(cinfo, 1)) {
+ pr_err("%s: lock_comm failed\n", __func__);
+ return;
+ }
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(METADATA_UPDATED);
@@ -1405,7 +1415,8 @@
cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- lock_comm(cinfo, 1);
+ if (lock_comm(cinfo, 1))
+ return -EAGAIN;
ret = __sendmsg(cinfo, &cmsg);
if (ret) {
unlock_comm(cinfo);
@@ -1518,6 +1529,7 @@
}
}
kfree(cinfo->other_bitmap_lockres);
+ cinfo->other_bitmap_lockres = NULL;
}
}
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index c766c55..26c75c0 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -244,10 +244,9 @@
sector_t start_sector, end_sector, data_offset;
sector_t bio_sector = bio->bi_iter.bi_sector;
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
tmp_dev = which_dev(mddev, bio_sector);
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 6780938..152f9e6 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -104,10 +104,9 @@
struct multipath_bh * mp_bh;
struct multipath_info *multipath;
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1be7abe..761d438 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -376,6 +376,11 @@
struct mddev *mddev = q->queuedata;
unsigned int sectors;
+ if (mddev == NULL || mddev->pers == NULL) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
+
if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
bio_io_error(bio);
return BLK_QC_T_NONE;
@@ -383,10 +388,6 @@
blk_queue_split(q, &bio);
- if (mddev == NULL || mddev->pers == NULL) {
- bio_io_error(bio);
- return BLK_QC_T_NONE;
- }
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
bio->bi_status = BLK_STS_IOERR;
@@ -537,8 +538,10 @@
* could wait for this and below md_handle_request could wait for those
* bios because of suspend check
*/
+ spin_lock_irq(&mddev->lock);
mddev->last_flush = mddev->start_flush;
mddev->flush_bio = NULL;
+ spin_unlock_irq(&mddev->lock);
wake_up(&mddev->sb_wait);
if (bio->bi_iter.bi_size == 0) {
@@ -550,7 +553,13 @@
}
}
-void md_flush_request(struct mddev *mddev, struct bio *bio)
+/*
+ * Manages consolidation of flushes and submitting any flushes needed for
+ * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
+ * being finished in another context. Returns false if the flushing is
+ * complete but still needs the I/O portion of the bio to be processed.
+ */
+bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
ktime_t start = ktime_get_boottime();
spin_lock_irq(&mddev->lock);
@@ -575,9 +584,10 @@
bio_endio(bio);
else {
bio->bi_opf &= ~REQ_PREFLUSH;
- mddev->pers->make_request(mddev, bio);
+ return false;
}
}
+ return true;
}
EXPORT_SYMBOL(md_flush_request);
@@ -637,8 +647,35 @@
}
EXPORT_SYMBOL_GPL(mddev_init);
+static struct mddev *mddev_find_locked(dev_t unit)
+{
+ struct mddev *mddev;
+
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs)
+ if (mddev->unit == unit)
+ return mddev;
+
+ return NULL;
+}
+
static struct mddev *mddev_find(dev_t unit)
{
+ struct mddev *mddev;
+
+ if (MAJOR(unit) != MD_MAJOR)
+ unit &= ~((1 << MdpMinorShift) - 1);
+
+ spin_lock(&all_mddevs_lock);
+ mddev = mddev_find_locked(unit);
+ if (mddev)
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+
+ return mddev;
+}
+
+static struct mddev *mddev_find_or_alloc(dev_t unit)
+{
struct mddev *mddev, *new = NULL;
if (unit && MAJOR(unit) != MD_MAJOR)
@@ -648,13 +685,13 @@
spin_lock(&all_mddevs_lock);
if (unit) {
- list_for_each_entry(mddev, &all_mddevs, all_mddevs)
- if (mddev->unit == unit) {
- mddev_get(mddev);
- spin_unlock(&all_mddevs_lock);
- kfree(new);
- return mddev;
- }
+ mddev = mddev_find_locked(unit);
+ if (mddev) {
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ kfree(new);
+ return mddev;
+ }
if (new) {
list_add(&new->all_mddevs, &all_mddevs);
@@ -680,12 +717,7 @@
return NULL;
}
- is_free = 1;
- list_for_each_entry(mddev, &all_mddevs, all_mddevs)
- if (mddev->unit == dev) {
- is_free = 0;
- break;
- }
+ is_free = !mddev_find_locked(dev);
}
new->unit = dev;
new->md_minor = MINOR(dev);
@@ -1098,6 +1130,7 @@
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
mdp_super_t *sb;
int ret;
+ bool spare_disk = true;
/*
* Calculate the position of the superblock (512byte sectors),
@@ -1148,8 +1181,19 @@
else
rdev->desc_nr = sb->this_disk.number;
+ /* not spare disk, or LEVEL_MULTIPATH */
+ if (sb->level == LEVEL_MULTIPATH ||
+ (rdev->desc_nr >= 0 &&
+ rdev->desc_nr < MD_SB_DISKS &&
+ sb->disks[rdev->desc_nr].state &
+ ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
+ spare_disk = false;
+
if (!refdev) {
- ret = 1;
+ if (!spare_disk)
+ ret = 1;
+ else
+ ret = 0;
} else {
__u64 ev1, ev2;
mdp_super_t *refsb = page_address(refdev->sb_page);
@@ -1165,7 +1209,8 @@
}
ev1 = md_event(sb);
ev2 = md_event(refsb);
- if (ev1 > ev2)
+
+ if (!spare_disk && ev1 > ev2)
ret = 1;
else
ret = 0;
@@ -1525,6 +1570,7 @@
sector_t sectors;
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
int bmask;
+ bool spare_disk = true;
/*
* Calculate the position of the superblock in 512byte sectors.
@@ -1658,8 +1704,19 @@
sb->level != 0)
return -EINVAL;
+ /* not spare disk, or LEVEL_MULTIPATH */
+ if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
+ (rdev->desc_nr >= 0 &&
+ rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
+ (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
+ le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
+ spare_disk = false;
+
if (!refdev) {
- ret = 1;
+ if (!spare_disk)
+ ret = 1;
+ else
+ ret = 0;
} else {
__u64 ev1, ev2;
struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
@@ -1676,7 +1733,7 @@
ev1 = le64_to_cpu(sb->events);
ev2 = le64_to_cpu(refsb->events);
- if (ev1 > ev2)
+ if (!spare_disk && ev1 > ev2)
ret = 1;
else
ret = 0;
@@ -3597,7 +3654,7 @@
* Check a full RAID array for plausibility
*/
-static void analyze_sbs(struct mddev *mddev)
+static int analyze_sbs(struct mddev *mddev)
{
int i;
struct md_rdev *rdev, *freshest, *tmp;
@@ -3618,6 +3675,12 @@
md_kick_rdev_from_array(rdev);
}
+ /* Cannot find a valid fresh disk */
+ if (!freshest) {
+ pr_warn("md: cannot find a valid disk\n");
+ return -EINVAL;
+ }
+
super_types[mddev->major_version].
validate_super(mddev, freshest);
@@ -3652,6 +3715,8 @@
clear_bit(In_sync, &rdev->flags);
}
}
+
+ return 0;
}
/* Read a fixed-point number.
@@ -5393,7 +5458,7 @@
* writing to /sys/module/md_mod/parameters/new_array.
*/
static DEFINE_MUTEX(disks_mutex);
- struct mddev *mddev = mddev_find(dev);
+ struct mddev *mddev = mddev_find_or_alloc(dev);
struct gendisk *disk;
int partitioned;
int shift;
@@ -5570,7 +5635,9 @@
if (!mddev->raid_disks) {
if (!mddev->persistent)
return -EINVAL;
- analyze_sbs(mddev);
+ err = analyze_sbs(mddev);
+ if (err)
+ return -EINVAL;
}
if (mddev->level != LEVEL_NONE)
@@ -5998,7 +6065,7 @@
static void mddev_detach(struct mddev *mddev)
{
md_bitmap_wait_behind_writes(mddev);
- if (mddev->pers && mddev->pers->quiesce) {
+ if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
@@ -6267,11 +6334,9 @@
md_probe(dev, NULL, NULL);
mddev = mddev_find(dev);
- if (!mddev || !mddev->gendisk) {
- if (mddev)
- mddev_put(mddev);
+ if (!mddev)
break;
- }
+
if (mddev_lock(mddev))
pr_warn("md: %s locked, cannot run\n", mdname(mddev));
else if (mddev->raid_disks || mddev->major_version
@@ -6678,8 +6743,10 @@
goto busy;
kick_rdev:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->remove_disk(mddev, rdev);
+ if (mddev_is_clustered(mddev)) {
+ if (md_cluster_ops->remove_disk(mddev, rdev))
+ goto busy;
+ }
md_kick_rdev_from_array(rdev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
@@ -7009,6 +7076,7 @@
return -EINVAL;
if (mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
mddev->reshape_position != MaxSector)
return -EBUSY;
@@ -7328,8 +7396,11 @@
err = -EBUSY;
goto out;
}
- WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
- set_bit(MD_CLOSING, &mddev->flags);
+ if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
+ mutex_unlock(&mddev->open_mutex);
+ err = -EBUSY;
+ goto out;
+ }
did_set_md_closing = true;
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
@@ -7565,9 +7636,9 @@
*/
mddev_put(mddev);
/* Wait until bdev->bd_disk is definitely gone */
- flush_workqueue(md_misc_wq);
- /* Then retry the open from the top */
- return -ERESTARTSYS;
+ if (work_pending(&mddev->del_work))
+ flush_workqueue(md_misc_wq);
+ return -EBUSY;
}
BUG_ON(mddev != bdev->bd_disk->private_data);
@@ -7900,7 +7971,11 @@
loff_t l = *pos;
struct mddev *mddev;
- if (l >= 0x10000)
+ if (l == 0x10000) {
+ ++*pos;
+ return (void *)2;
+ }
+ if (l > 0x10000)
return NULL;
if (!l--)
/* header */
@@ -8997,11 +9072,11 @@
}
if (mddev_is_clustered(mddev)) {
- struct md_rdev *rdev;
+ struct md_rdev *rdev, *tmp;
/* kick the device if another node issued a
* remove disk.
*/
- rdev_for_each(rdev, mddev) {
+ rdev_for_each_safe(rdev, tmp, mddev) {
if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
rdev->raid_disk < 0)
md_kick_rdev_from_array(rdev);
@@ -9314,7 +9389,7 @@
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
- struct md_rdev *rdev2;
+ struct md_rdev *rdev2, *tmp;
int role, ret;
char b[BDEVNAME_SIZE];
@@ -9331,7 +9406,7 @@
}
/* Check for change of roles in the active devices */
- rdev_for_each(rdev2, mddev) {
+ rdev_for_each_safe(rdev2, tmp, mddev) {
if (test_bit(Faulty, &rdev2->flags))
continue;
@@ -9376,8 +9451,11 @@
}
}
- if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
- update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+ if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
+ ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+ if (ret)
+ pr_warn("md: updating array disks failed. %d\n", ret);
+ }
/*
* Since mddev->delta_disks has already updated in update_raid_disks,
diff --git a/drivers/md/md.h b/drivers/md/md.h
index c5e3ff3..5f86f8a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -550,7 +550,7 @@
int level;
struct list_head list;
struct module *owner;
- bool (*make_request)(struct mddev *mddev, struct bio *bio);
+ bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
/*
* start up works that do NOT require md_thread. tasks that
* requires md_thread should go into start()
@@ -703,7 +703,7 @@
extern void md_finish_reshape(struct mddev *mddev);
extern int mddev_congested(struct mddev *mddev, int bits);
-extern void md_flush_request(struct mddev *mddev, struct bio *bio);
+extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
sector_t sector, int size, struct page *page);
extern int md_super_wait(struct mddev *mddev);
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 749ec26..54c089a 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -493,7 +493,7 @@
void *p;
int r;
- if (bm->read_only)
+ if (dm_bm_is_read_only(bm))
return -EPERM;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
@@ -562,7 +562,7 @@
struct buffer_aux *aux;
void *p;
- if (bm->read_only)
+ if (dm_bm_is_read_only(bm))
return -EPERM;
p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
@@ -602,7 +602,7 @@
int dm_bm_flush(struct dm_block_manager *bm)
{
- if (bm->read_only)
+ if (dm_bm_is_read_only(bm))
return -EPERM;
return dm_bufio_write_dirty_buffers(bm->bufio);
@@ -616,19 +616,21 @@
bool dm_bm_is_read_only(struct dm_block_manager *bm)
{
- return bm->read_only;
+ return (bm ? bm->read_only : true);
}
EXPORT_SYMBOL_GPL(dm_bm_is_read_only);
void dm_bm_set_read_only(struct dm_block_manager *bm)
{
- bm->read_only = true;
+ if (bm)
+ bm->read_only = true;
}
EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
void dm_bm_set_read_write(struct dm_block_manager *bm)
{
- bm->read_only = false;
+ if (bm)
+ bm->read_only = false;
}
EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index a240990..5673f8e 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -34,12 +34,12 @@
__le32 max_entries;
__le32 value_size;
__le32 padding;
-} __packed;
+} __attribute__((packed, aligned(8)));
struct btree_node {
struct node_header header;
__le64 keys[0];
-} __packed;
+} __attribute__((packed, aligned(8)));
/*
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 21ea537..9e4d121 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -203,7 +203,13 @@
struct btree_node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
- unsigned threshold = 2 * merge_threshold(left) + 1;
+ /*
+ * Ensure the number of entries in each child will be greater
+ * than or equal to (max_entries / 3 + 1), so no matter which
+ * child is used for removal, the number will still be not
+ * less than (max_entries / 3).
+ */
+ unsigned int threshold = 2 * (merge_threshold(left) + 1);
if (nr_left + nr_right < threshold) {
/*
@@ -543,7 +549,8 @@
delete_at(n, index);
}
- *new_root = shadow_root(&spine);
+ if (!r)
+ *new_root = shadow_root(&spine);
exit_shadow_spine(&spine);
return r;
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index bd68f6f..a213bf1 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -339,6 +339,8 @@
*/
begin = do_div(index_begin, ll->entries_per_block);
end = do_div(end, ll->entries_per_block);
+ if (end == 0)
+ end = ll->entries_per_block;
for (i = index_begin; i < index_end; i++, begin = 0) {
struct dm_block *blk;
@@ -380,6 +382,33 @@
return -ENOSPC;
}
+int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
+ dm_block_t begin, dm_block_t end, dm_block_t *b)
+{
+ int r;
+ uint32_t count;
+
+ do {
+ r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b);
+ if (r)
+ break;
+
+ /* double check this block wasn't used in the old transaction */
+ if (*b >= old_ll->nr_blocks)
+ count = 0;
+ else {
+ r = sm_ll_lookup(old_ll, *b, &count);
+ if (r)
+ break;
+
+ if (count)
+ begin = *b + 1;
+ }
+ } while (count);
+
+ return r;
+}
+
static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
int (*mutator)(void *context, uint32_t old, uint32_t *new),
void *context, enum allocation_event *ev)
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
index b3078d5..87e1790 100644
--- a/drivers/md/persistent-data/dm-space-map-common.h
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -33,7 +33,7 @@
__le64 blocknr;
__le32 nr_free;
__le32 none_free_before;
-} __packed;
+} __attribute__ ((packed, aligned(8)));
#define MAX_METADATA_BITMAPS 255
@@ -43,7 +43,7 @@
__le64 blocknr;
struct disk_index_entry index[MAX_METADATA_BITMAPS];
-} __packed;
+} __attribute__ ((packed, aligned(8)));
struct ll_disk;
@@ -86,7 +86,7 @@
__le64 nr_allocated;
__le64 bitmap_root;
__le64 ref_count_root;
-} __packed;
+} __attribute__ ((packed, aligned(8)));
#define ENTRIES_PER_BYTE 4
@@ -94,7 +94,7 @@
__le32 csum;
__le32 not_used;
__le64 blocknr;
-} __packed;
+} __attribute__ ((packed, aligned(8)));
enum allocation_event {
SM_NONE,
@@ -109,6 +109,8 @@
int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
dm_block_t end, dm_block_t *result);
+int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
+ dm_block_t begin, dm_block_t end, dm_block_t *result);
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index 32adf6b..e0acae7 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -167,8 +167,18 @@
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- /* FIXME: we should loop round a couple of times */
- r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
+ /*
+ * Any block we allocate has to be free in both the old and current ll.
+ */
+ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
+ if (r == -ENOSPC) {
+ /*
+ * There's no free block between smd->begin and the end of the metadata device.
+ * We search before smd->begin in case something has been freed.
+ */
+ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b);
+ }
+
if (r)
return r;
@@ -197,7 +207,6 @@
return r;
memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
- smd->begin = 0;
smd->nr_allocated_this_transaction = 0;
r = sm_disk_get_nr_free(sm, &nr_free);
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 2532858..da439ac 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -448,7 +448,18 @@
enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
+ /*
+ * Any block we allocate has to be free in both the old and current ll.
+ */
+ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
+ if (r == -ENOSPC) {
+ /*
+ * There's no free block between smm->begin and the end of the metadata device.
+ * We search before smm->begin in case something has been freed.
+ */
+ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b);
+ }
+
if (r)
return r;
@@ -500,7 +511,6 @@
return r;
memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
- smm->begin = 0;
smm->allocated_this_transaction = 0;
return 0;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 1e77228..322386f 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -87,7 +87,7 @@
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- unsigned short blksize = 512;
+ unsigned blksize = 512;
*private_conf = ERR_PTR(-ENOMEM);
if (!conf)
@@ -575,10 +575,9 @@
unsigned chunk_sects;
unsigned sectors;
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
raid0_handle_discard(mddev, bio);
@@ -615,7 +614,7 @@
tmp_dev = map_sector(mddev, zone, sector, §or);
break;
default:
- WARN("md/raid0:%s: Invalid layout\n", mdname(mddev));
+ WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
bio_io_error(bio);
return true;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0466ee2..e871846 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -452,12 +452,12 @@
/*
* When the device is faulty, it is not necessary to
* handle write error.
- * For failfast, this is the only remaining device,
- * We need to retry the write without FailFast.
*/
if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
else {
+ /* Fail the request */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
/* Finished with this branch */
r1_bio->bios[mirror] = NULL;
to_put = bio;
@@ -1567,10 +1567,9 @@
{
sector_t sectors;
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
/*
* There is a limit to the maximum size, but
@@ -2782,7 +2781,7 @@
write_targets++;
}
}
- if (bio->bi_end_io) {
+ if (rdev && bio->bi_end_io) {
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
bio_set_dev(bio, rdev->bdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8a62c92..deddabf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -470,12 +470,12 @@
/*
* When the device is faulty, it is not necessary to
* handle write error.
- * For failfast, this is the only remaining device,
- * We need to retry the write without FailFast.
*/
if (!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_WriteError, &r10_bio->state);
else {
+ /* Fail the request */
+ set_bit(R10BIO_Degraded, &r10_bio->state);
r10_bio->devs[slot].bio = NULL;
to_put = bio;
dec_rdev = 1;
@@ -1145,7 +1145,7 @@
struct md_rdev *err_rdev = NULL;
gfp_t gfp = GFP_NOIO;
- if (r10_bio->devs[slot].rdev) {
+ if (slot >= 0 && r10_bio->devs[slot].rdev) {
/*
* This is an error retry, but we cannot
* safely dereference the rdev in the r10_bio,
@@ -1510,6 +1510,7 @@
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
+ r10_bio->read_slot = -1;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
if (bio_data_dir(bio) == READ)
@@ -1525,10 +1526,9 @@
int chunk_sects = chunk_mask + 1;
int sectors = bio_sectors(bio);
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
if (!md_write_start(mddev, bio))
return false;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 223e97a..08a7f97 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2228,14 +2228,19 @@
* of the P and Q blocks.
*/
static int scribble_alloc(struct raid5_percpu *percpu,
- int num, int cnt, gfp_t flags)
+ int num, int cnt)
{
size_t obj_size =
sizeof(struct page *) * (num+2) +
sizeof(addr_conv_t) * (num+2);
void *scribble;
- scribble = kvmalloc_array(cnt, obj_size, flags);
+ /*
+ * If here is in raid array suspend context, it is in memalloc noio
+ * context as well, there is no potential recursive memory reclaim
+ * I/Os with the GFP_KERNEL flag.
+ */
+ scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
if (!scribble)
return -ENOMEM;
@@ -2267,8 +2272,7 @@
percpu = per_cpu_ptr(conf->percpu, cpu);
err = scribble_alloc(percpu, new_disks,
- new_sectors / STRIPE_SECTORS,
- GFP_NOIO);
+ new_sectors / STRIPE_SECTORS);
if (err)
break;
}
@@ -2403,8 +2407,6 @@
} else
err = -ENOMEM;
- mutex_unlock(&conf->cache_size_mutex);
-
conf->slab_cache = sc;
conf->active_name = 1-conf->active_name;
@@ -2427,6 +2429,8 @@
if (!err)
conf->pool_size = newsize;
+ mutex_unlock(&conf->cache_size_mutex);
+
return err;
}
@@ -3600,6 +3604,7 @@
* is missing/faulty, then we need to read everything we can.
*/
if (sh->raid_conf->level != 6 &&
+ sh->raid_conf->rmw_level != PARITY_DISABLE_RMW &&
sh->sector < sh->raid_conf->mddev->recovery_cp)
/* reconstruct-write isn't being forced */
return 0;
@@ -4835,7 +4840,7 @@
* or to load a block that is being partially written.
*/
if (s.to_read || s.non_overwrite
- || (conf->level == 6 && s.to_write && s.failed)
+ || (s.to_write && s.failed)
|| (s.syncing && (s.uptodate + s.compute < disks))
|| s.replacing
|| s.expanding)
@@ -5592,8 +5597,8 @@
if (ret == 0)
return true;
if (ret == -ENODEV) {
- md_flush_request(mddev, bi);
- return true;
+ if (md_flush_request(mddev, bi))
+ return true;
}
/* ret == -EAGAIN, fallback */
/*
@@ -5726,7 +5731,7 @@
do_flush = false;
}
- if (!sh->batch_head)
+ if (!sh->batch_head || sh == sh->batch_head)
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
@@ -6765,8 +6770,7 @@
conf->previous_raid_disks),
max(conf->chunk_sectors,
conf->prev_chunk_sectors)
- / STRIPE_SECTORS,
- GFP_KERNEL)) {
+ / STRIPE_SECTORS)) {
free_scratch_buffer(conf, percpu);
return -ENOMEM;
}