Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8b8c123..aa98953 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Block device driver configuration
#
@@ -215,17 +216,6 @@
If unsure, say N.
-config DM_MQ_DEFAULT
- bool "request-based DM: use blk-mq I/O path by default"
- depends on BLK_DEV_DM
- ---help---
- This option enables the blk-mq based I/O path for request-based
- DM devices by default. With the option the dm_mod.use_blk_mq
- module/boot option defaults to Y, without it to N, but it can
- still be overriden either way.
-
- If unsure say N.
-
config DM_DEBUG
bool "Device mapper debugging support"
depends on BLK_DEV_DM
@@ -281,6 +271,7 @@
depends on BLK_DEV_DM
select CRYPTO
select CRYPTO_CBC
+ select CRYPTO_ESSIV
---help---
This device-mapper target allows you to create a device that
transparently encrypts the data on it. You'll need to activate
@@ -356,6 +347,20 @@
over time. Useful for maintaining cache coherency when using
vendor snapshots.
+config DM_CLONE
+ tristate "Clone target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ default n
+ select DM_PERSISTENT_DATA
+ ---help---
+ dm-clone produces a one-to-one copy of an existing, read-only source
+ device into a writable destination device. The cloned device is
+ visible/mountable immediately and the copy of the source device to the
+ destination device happens in the background, in parallel with user
+ I/O.
+
+ If unsure, say N.
+
config DM_MIRROR
tristate "Mirror target"
depends on BLK_DEV_DM
@@ -447,6 +452,27 @@
If unsure, say N.
+config DM_DUST
+ tristate "Bad sector simulation target"
+ depends on BLK_DEV_DM
+ ---help---
+ A target that simulates bad sector behavior.
+ Useful for testing.
+
+ If unsure, say N.
+
+config DM_INIT
+ bool "DM \"dm-mod.create=\" parameter support"
+ depends on BLK_DEV_DM=y
+ ---help---
+ Enable "dm-mod.create=" parameter to create mapped devices at init time.
+ This option is useful to allow mounting rootfs without requiring an
+ initramfs.
+ See Documentation/admin-guide/device-mapper/dm-init.rst for dm-mod.create="..."
+ format.
+
+ If unsure, say N.
+
config DM_UEVENT
bool "DM uevents"
depends on BLK_DEV_DM
@@ -479,6 +505,18 @@
If unsure, say N.
+config DM_VERITY_VERIFY_ROOTHASH_SIG
+ def_bool n
+ bool "Verity data device root hash signature verification support"
+ depends on DM_VERITY
+ select SYSTEM_DATA_VERIFICATION
+ help
+ Add ability for dm-verity device to be validated if the
+ pre-generated tree of cryptographic checksums passed has a pkcs#7
+ signature file that can validate the roothash of the tree.
+
+ If unsure, say N.
+
config DM_VERITY_FEC
bool "Verity forward error correction support"
depends on DM_VERITY
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 822f4e8..d91a7ed 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,6 +18,7 @@
dm-cache-background-tracker.o
dm-cache-smq-y += dm-cache-policy-smq.o
dm-era-y += dm-era-target.o
+dm-clone-y += dm-clone-target.o dm-clone-metadata.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o md-bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
@@ -48,6 +49,7 @@
obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
+obj-$(CONFIG_DM_DUST) += dm-dust.o
obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
@@ -64,11 +66,16 @@
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_ERA) += dm-era.o
+obj-$(CONFIG_DM_CLONE) += dm-clone.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
obj-$(CONFIG_DM_ZONED) += dm-zoned.o
obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o
+ifeq ($(CONFIG_DM_INIT),y)
+dm-mod-objs += dm-init.o
+endif
+
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
endif
@@ -76,3 +83,7 @@
ifeq ($(CONFIG_DM_VERITY_FEC),y)
dm-verity-objs += dm-verity-fec.o
endif
+
+ifeq ($(CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG),y)
+dm-verity-objs += dm-verity-verify-sig.o
+endif
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index f6e0a8b..6dfa653 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BCACHE
tristate "Block device as cache"
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 7a28232..6f77682 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -327,10 +327,11 @@
* possibly issue discards to them, then we add the bucket to
* the free list:
*/
- while (!fifo_empty(&ca->free_inc)) {
+ while (1) {
long bucket;
- fifo_pop(&ca->free_inc, bucket);
+ if (!fifo_pop(&ca->free_inc, bucket))
+ break;
if (ca->discard) {
mutex_unlock(&ca->set->bucket_lock);
@@ -392,6 +393,11 @@
struct bucket *b;
long r;
+
+ /* No allocation if CACHE_SET_IO_DISABLE bit is set */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)))
+ return -1;
+
/* fastpath */
if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
fifo_pop(&ca->free[reserve], r))
@@ -483,8 +489,12 @@
{
int i;
+ /* No allocation if CACHE_SET_IO_DISABLE bit is set */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+ return -1;
+
lockdep_assert_held(&c->bucket_lock);
- BUG_ON(!n || n > c->caches_loaded || n > 8);
+ BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET);
bkey_init(k);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 954dad2..013e35a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -627,6 +627,20 @@
struct bkey gc_done;
/*
+ * For automatic garbage collection after writeback completes, this
+ * variable is used as bit fields,
+ * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
+ * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback
+ * This is an optimization for write requests that follow a finished
+ * writeback, at the cost of a lower read hit rate because clean data
+ * on the cache is discarded. Unless the user explicitly sets it via
+ * sysfs, it won't be enabled.
+ */
+#define BCH_ENABLE_AUTO_GC 1
+#define BCH_DO_AUTO_GC 2
+ uint8_t gc_after_writeback;
+
+ /*
* The allocation code needs gc_mark in struct bucket to be correct, but
* it's not while a gc is in progress. Protected by bucket_lock.
*/
@@ -658,7 +672,11 @@
/*
* A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - have to dynamically allocate them
+ * on the stack - have to dynamically allocate them.
+ * bch_cache_set_alloc() will make sure the pool can allocate iterators
+ * equipped with enough room that can host
+ * (sb.bucket_size / sb.block_size)
+ * btree_iter_sets, which is more than static MAX_BSETS.
*/
mempool_t fill_iter;
@@ -687,8 +705,8 @@
atomic_long_t writeback_keys_failed;
atomic_long_t reclaim;
+ atomic_long_t reclaimed_journal_buckets;
atomic_long_t flush_write;
- atomic_long_t retry_flush_write;
enum {
ON_ERROR_UNREGISTER,
@@ -708,8 +726,6 @@
#define BUCKET_HASH_BITS 12
struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
-
- DECLARE_HEAP(struct btree *, flush_btree);
};
struct bbio {
@@ -988,7 +1004,7 @@
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
uint8_t *set_uuid);
void bch_cached_dev_detach(struct cached_dev *dc);
-void bch_cached_dev_run(struct cached_dev *dc);
+int bch_cached_dev_run(struct cached_dev *dc);
void bcache_device_stop(struct bcache_device *d);
void bch_cache_set_unregister(struct cache_set *c);
@@ -1004,7 +1020,7 @@
int bch_cache_allocator_start(struct cache *ca);
void bch_debug_exit(void);
-void bch_debug_init(struct kobject *kobj);
+void bch_debug_init(void);
void bch_request_exit(void);
int bch_request_init(void);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 8f07fa6..0876879 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -347,22 +347,19 @@
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
bool *expensive_debug_checks)
{
- unsigned int i;
-
b->ops = ops;
b->expensive_debug_checks = expensive_debug_checks;
b->nsets = 0;
b->last_set_unwritten = 0;
- /* XXX: shouldn't be needed */
- for (i = 0; i < MAX_BSETS; i++)
- b->set[i].size = 0;
/*
- * Second loop starts at 1 because b->keys[0]->data is the memory we
- * allocated
+ * struct btree_keys is embedded in struct btree, and struct
+ * bset_tree is embedded into struct btree_keys. They are all
+ * initialized as 0 by kzalloc() in mca_bucket_alloc(), and
+ * b->set[0].data is allocated in bch_btree_keys_alloc(), so we
+ * don't have to initialize b->set[].size and b->set[].data here
+ * any more.
*/
- for (i = 1; i < MAX_BSETS; i++)
- b->set[i].data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_init);
@@ -887,12 +884,22 @@
struct bset *i = bset_tree_last(b)->data;
struct bkey *m, *prev = NULL;
struct btree_iter iter;
+ struct bkey preceding_key_on_stack = ZERO_KEY;
+ struct bkey *preceding_key_p = &preceding_key_on_stack;
BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
- m = bch_btree_iter_init(b, &iter, b->ops->is_extents
- ? PRECEDING_KEY(&START_KEY(k))
- : PRECEDING_KEY(k));
+ /*
+ * If k has preceding key, preceding_key_p will be set to address
+ * of k's preceding key; otherwise preceding_key_p will be set
+ * to NULL inside preceding_key().
+ */
+ if (b->ops->is_extents)
+ preceding_key(&START_KEY(k), &preceding_key_p);
+ else
+ preceding_key(k, &preceding_key_p);
+
+ m = bch_btree_iter_init(b, &iter, preceding_key_p);
if (b->ops->insert_fixup(b, k, &iter, replace_key))
return status;
@@ -960,45 +967,25 @@
unsigned int inorder, j, n = 1;
do {
- /*
- * A bit trick here.
- * If p < t->size, (int)(p - t->size) is a minus value and
- * the most significant bit is set, right shifting 31 bits
- * gets 1. If p >= t->size, the most significant bit is
- * not set, right shifting 31 bits gets 0.
- * So the following 2 lines equals to
- * if (p >= t->size)
- * p = 0;
- * but a branch instruction is avoided.
- */
unsigned int p = n << 4;
- p &= ((int) (p - t->size)) >> 31;
-
- prefetch(&t->tree[p]);
+ if (p < t->size)
+ prefetch(&t->tree[p]);
j = n;
f = &t->tree[j];
- /*
- * Similar bit trick, use subtract operation to avoid a branch
- * instruction.
- *
- * n = (f->mantissa > bfloat_mantissa())
- * ? j * 2
- * : j * 2 + 1;
- *
- * We need to subtract 1 from f->mantissa for the sign bit trick
- * to work - that's done in make_bfloat()
- */
- if (likely(f->exponent != 127))
- n = j * 2 + (((unsigned int)
- (f->mantissa -
- bfloat_mantissa(search, f))) >> 31);
- else
- n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
- ? j * 2
- : j * 2 + 1;
+ if (likely(f->exponent != 127)) {
+ if (f->mantissa >= bfloat_mantissa(search, f))
+ n = j * 2;
+ else
+ n = j * 2 + 1;
+ } else {
+ if (bkey_cmp(tree_to_bkey(t, j), search) > 0)
+ n = j * 2;
+ else
+ n = j * 2 + 1;
+ }
} while (n < t->size);
inorder = to_inorder(j, t);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index bac76aa..c71365e 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -434,20 +434,26 @@
return __bch_cut_back(where, k);
}
-#define PRECEDING_KEY(_k) \
-({ \
- struct bkey *_ret = NULL; \
- \
- if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
- _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
- \
- if (!_ret->low) \
- _ret->high--; \
- _ret->low--; \
- } \
- \
- _ret; \
-})
+/*
+ * Pointer '*preceding_key_p' points to a memory object to store preceding
+ * key of k. If the preceding key does not exist, set '*preceding_key_p' to
+ * NULL. So the caller of preceding_key() needs to take care of memory
+ * which '*preceding_key_p' pointed to before calling preceding_key().
+ * Currently the only caller of preceding_key() is bch_btree_insert_key(),
+ * and it points to an on-stack variable, so the memory release is handled
+ * by stackframe itself.
+ */
+static inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p)
+{
+ if (KEY_INODE(k) || KEY_OFFSET(k)) {
+ (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0);
+ if (!(*preceding_key_p)->low)
+ (*preceding_key_p)->high--;
+ (*preceding_key_p)->low--;
+ } else {
+ (*preceding_key_p) = NULL;
+ }
+}
static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
{
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3f4211b..ba434d9 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -35,7 +35,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
#include <linux/rculist.h>
-
+#include <linux/delay.h>
#include <trace/events/bcache.h>
/*
@@ -207,6 +207,11 @@
struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
+ /*
+ * c->fill_iter can allocate an iterator with more memory space
+ * than static MAX_BSETS.
+ * See the comment around cache_set->fill_iter.
+ */
iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
@@ -424,13 +429,14 @@
bset_sector_offset(&b->keys, i));
if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
- int j;
struct bio_vec *bv;
- void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+ void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, b->bio, j)
- memcpy(page_address(bv->bv_page),
- base + j * PAGE_SIZE, PAGE_SIZE);
+ bio_for_each_segment_all(bv, b->bio, iter_all) {
+ memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
+ addr += PAGE_SIZE;
+ }
bch_submit_bbio(b->bio, b->c, &k.key, 0);
@@ -607,6 +613,10 @@
static struct btree *mca_bucket_alloc(struct cache_set *c,
struct bkey *k, gfp_t gfp)
{
+ /*
+ * kzalloc() is necessary here for initialization,
+ * see code comments in bch_btree_keys_init().
+ */
struct btree *b = kzalloc(sizeof(struct btree), gfp);
if (!b)
@@ -649,7 +659,25 @@
up(&b->io_mutex);
}
+retry:
+ /*
+ * BTREE_NODE_dirty might be cleared in btree_flush_btree() by
+ * __bch_btree_node_write(). To avoid an extra flush, acquire
+ * b->write_lock before checking BTREE_NODE_dirty bit.
+ */
mutex_lock(&b->write_lock);
+ /*
+ * If this btree node is selected in btree_flush_write() by journal
+ * code, delay and retry until the node is flushed by journal code
+ * and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
+ */
+ if (btree_node_journal_flush(b)) {
+ pr_debug("bnode %p is flushing by journal, retry", b);
+ mutex_unlock(&b->write_lock);
+ udelay(1);
+ goto retry;
+ }
+
if (btree_node_dirty(b))
__bch_btree_node_write(b, &cl);
mutex_unlock(&b->write_lock);
@@ -772,10 +800,15 @@
while (!list_empty(&c->btree_cache)) {
b = list_first_entry(&c->btree_cache, struct btree, list);
- if (btree_node_dirty(b))
+ /*
+ * This function is called by cache_set_free(), no I/O
+ * request on cache now, it is unnecessary to acquire
+ * b->write_lock before clearing BTREE_NODE_dirty anymore.
+ */
+ if (btree_node_dirty(b)) {
btree_complete_write(b, btree_current_write(b));
- clear_bit(BTREE_NODE_dirty, &b->flags);
-
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+ }
mca_data_free(b);
}
@@ -1061,11 +1094,25 @@
BUG_ON(b == b->c->root);
+retry:
mutex_lock(&b->write_lock);
+ /*
+ * If the btree node is selected and flushing in btree_flush_write(),
+ * delay and retry until the BTREE_NODE_journal_flush bit cleared,
+ * then it is safe to free the btree node here. Otherwise this btree
+ * node will be in race condition.
+ */
+ if (btree_node_journal_flush(b)) {
+ mutex_unlock(&b->write_lock);
+ pr_debug("bnode %p journal_flush set, retry", b);
+ udelay(1);
+ goto retry;
+ }
- if (btree_node_dirty(b))
+ if (btree_node_dirty(b)) {
btree_complete_write(b, btree_current_write(b));
- clear_bit(BTREE_NODE_dirty, &b->flags);
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+ }
mutex_unlock(&b->write_lock);
@@ -1470,11 +1517,11 @@
out_nocoalesce:
closure_sync(&cl);
- bch_keylist_free(&keylist);
while ((k = bch_keylist_pop(&keylist)))
if (!bkey_cmp(k, &ZERO_KEY))
atomic_dec(&b->c->prio_blocked);
+ bch_keylist_free(&keylist);
for (i = 0; i < nodes; i++)
if (!IS_ERR_OR_NULL(new_nodes[i])) {
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index a68d6c5..76cfd12 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -158,11 +158,13 @@
BTREE_NODE_io_error,
BTREE_NODE_dirty,
BTREE_NODE_write_idx,
+ BTREE_NODE_journal_flush,
};
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
+BTREE_FLAG(journal_flush);
static inline struct btree_write *btree_current_write(struct btree *b)
{
@@ -266,6 +268,24 @@
wake_up(&c->gc_wait);
}
+static inline void force_wake_up_gc(struct cache_set *c)
+{
+ /*
+ * Garbage collection thread only works when sectors_to_gc < 0,
+ * calling wake_up_gc() won't start gc thread if sectors_to_gc is
+ * not a negative value.
+ * Therefore sectors_to_gc is set to -1 here, before waking up
+ * gc thread by calling wake_up_gc(). Then gc_should_run() will
+ * give a chance to permit gc thread to run. "Give a chance" means
+ * before going into gc_should_run(), there is still a possibility
+ * of c->sectors_to_gc being set to another positive value. So
+ * this routine can't 100% guarantee the gc thread will be woken
+ * up to run.
+ */
+ atomic_set(&c->sectors_to_gc, -1);
+ wake_up_gc(c);
+}
+
#define MAP_DONE 0
#define MAP_CONTINUE 1
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 73f5319..c12cd80 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -105,8 +105,14 @@
static void closure_sync_fn(struct closure *cl)
{
- cl->s->done = 1;
- wake_up_process(cl->s->task);
+ struct closure_syncer *s = cl->s;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = READ_ONCE(s->task);
+ s->done = 1;
+ wake_up_process(p);
+ rcu_read_unlock();
}
void __sched __closure_sync(struct closure *cl)
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index eca0d49..c88cdc4 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -345,7 +345,8 @@
} while (0)
/**
- * closure_return - finish execution of a closure, with destructor
+ * closure_return_with_destructor - finish execution of a closure,
+ * with destructor
*
* Works like closure_return(), except @destructor will be called when all
* outstanding refs on @cl have been dropped; @destructor may be used to safely
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 06da66b..336f439 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -178,10 +178,9 @@
while (size) {
struct keybuf_key *w;
unsigned int bytes = min(i->bytes, size);
- int err = copy_to_user(buf, i->buf, bytes);
- if (err)
- return err;
+ if (copy_to_user(buf, i->buf, bytes))
+ return -EFAULT;
ret += bytes;
buf += bytes;
@@ -249,11 +248,10 @@
void bch_debug_exit(void)
{
- if (!IS_ERR_OR_NULL(bcache_debug))
- debugfs_remove_recursive(bcache_debug);
+ debugfs_remove_recursive(bcache_debug);
}
-void __init bch_debug_init(struct kobject *kobj)
+void __init bch_debug_init(void)
{
/*
* it is unnecessary to check return value of
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index c809724..8867100 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -538,6 +538,7 @@
{
struct btree *b = container_of(bk, struct btree, keys);
unsigned int i, stale;
+ char buf[80];
if (!KEY_PTRS(k) ||
bch_extent_invalid(bk, k))
@@ -547,19 +548,19 @@
if (!ptr_available(b->c, k, i))
return true;
- if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
- return false;
-
for (i = 0; i < KEY_PTRS(k); i++) {
stale = ptr_stale(b->c, k, i);
- btree_bug_on(stale > 96, b,
+ if (stale && KEY_DIRTY(k)) {
+ bch_extent_to_text(buf, sizeof(buf), k);
+ pr_info("stale dirty pointer, stale %u, key: %s",
+ stale, buf);
+ }
+
+ btree_bug_on(stale > BUCKET_GC_GEN_MAX, b,
"key too stale: %i, need_gc %u",
stale, b->c->need_gc);
- btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
- b, "stale dirty pointer");
-
if (stale)
return true;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index c250979..4d93f07 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -58,6 +58,18 @@
WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
+ /*
+ * Read-ahead requests on a degrading and recovering md raid
+ * (e.g. raid6) device might be failed immediately by md
+ * raid code, which is not a real hardware media failure. So
+ * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors.
+ */
+ if (bio->bi_opf & REQ_RAHEAD) {
+ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore",
+ dc->backing_dev_name);
+ return;
+ }
+
errors = atomic_add_return(1, &dc->io_errors);
if (errors < dc->error_limit)
pr_err("%s: IO error on backing device, unrecoverable",
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 522c742..be2a2a2 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -100,6 +100,20 @@
blocks = set_blocks(j, block_bytes(ca->set));
+ /*
+ * Nodes in 'list' are in linear increasing order of
+ * i->j.seq, the node on head has the smallest (oldest)
+ * journal seq, the node on tail has the biggest
+ * (latest) journal seq.
+ */
+
+ /*
+ * Check from the oldest jset for last_seq. If
+ * i->j.seq < j->last_seq, it means the oldest jset
+ * in list is expired and useless, remove it from
+ * this list. Otherwise, j is a candidate jset for
+ * further following checks.
+ */
while (!list_empty(list)) {
i = list_first_entry(list,
struct journal_replay, list);
@@ -109,13 +123,22 @@
kfree(i);
}
+ /* iterate list in reverse order (from latest jset) */
list_for_each_entry_reverse(i, list, list) {
if (j->seq == i->j.seq)
goto next_set;
+ /*
+ * if j->seq is less than any i->j.last_seq
+ * in list, j is an expired and useless jset.
+ */
if (j->seq < i->j.last_seq)
goto next_set;
+ /*
+ * 'where' points to first jset in list which
+ * is older than j.
+ */
if (j->seq > i->j.seq) {
where = &i->list;
goto add;
@@ -129,10 +152,12 @@
if (!i)
return -ENOMEM;
memcpy(&i->j, j, bytes);
+ /* Add to the location after 'where' points to */
list_add(&i->list, where);
ret = 1;
- ja->seq[bucket_index] = j->seq;
+ if (j->seq > ja->seq[bucket_index])
+ ja->seq[bucket_index] = j->seq;
next_set:
offset += blocks * ca->sb.block_size;
len -= blocks * ca->sb.block_size;
@@ -147,7 +172,7 @@
{
#define read_bucket(b) \
({ \
- int ret = journal_read_bucket(ca, list, b); \
+ ret = journal_read_bucket(ca, list, b); \
__set_bit(b, bitmap); \
if (ret < 0) \
return ret; \
@@ -156,6 +181,7 @@
struct cache *ca;
unsigned int iter;
+ int ret = 0;
for_each_cache(ca, c, iter) {
struct journal_device *ja = &ca->journal;
@@ -317,6 +343,18 @@
}
}
+static bool is_discard_enabled(struct cache_set *s)
+{
+ struct cache *ca;
+ unsigned int i;
+
+ for_each_cache(ca, s, i)
+ if (ca->discard)
+ return true;
+
+ return false;
+}
+
int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
int ret = 0, keys = 0, entries = 0;
@@ -330,9 +368,17 @@
list_for_each_entry(i, list, list) {
BUG_ON(i->pin && atomic_read(i->pin) != 1);
- cache_set_err_on(n != i->j.seq, s,
-"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
- n, i->j.seq - 1, start, end);
+ if (n != i->j.seq) {
+ if (n == start && is_discard_enabled(s))
+ pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)",
+ n, i->j.seq - 1, start, end);
+ else {
+ pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ n, i->j.seq - 1, start, end);
+ ret = -EIO;
+ goto err;
+ }
+ }
for (k = i->j.start;
k < bset_bkey_last(&i->j);
@@ -370,60 +416,90 @@
}
/* Journalling */
-#define journal_max_cmp(l, r) \
- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
-#define journal_min_cmp(l, r) \
- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
static void btree_flush_write(struct cache_set *c)
{
- /*
- * Try to find the btree node with that references the oldest journal
- * entry, best is our current candidate and is locked if non NULL:
- */
- struct btree *b;
- int i;
+ struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
+ unsigned int i, n;
+
+ if (c->journal.btree_flushing)
+ return;
+
+ spin_lock(&c->journal.flush_write_lock);
+ if (c->journal.btree_flushing) {
+ spin_unlock(&c->journal.flush_write_lock);
+ return;
+ }
+ c->journal.btree_flushing = true;
+ spin_unlock(&c->journal.flush_write_lock);
atomic_long_inc(&c->flush_write);
+ memset(btree_nodes, 0, sizeof(btree_nodes));
+ n = 0;
-retry:
- spin_lock(&c->journal.lock);
- if (heap_empty(&c->flush_btree)) {
- for_each_cached_btree(b, c, i)
- if (btree_current_write(b)->journal) {
- if (!heap_full(&c->flush_btree))
- heap_add(&c->flush_btree, b,
- journal_max_cmp);
- else if (journal_max_cmp(b,
- heap_peek(&c->flush_btree))) {
- c->flush_btree.data[0] = b;
- heap_sift(&c->flush_btree, 0,
- journal_max_cmp);
- }
- }
+ mutex_lock(&c->bucket_lock);
+ list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+ if (btree_node_journal_flush(b))
+ pr_err("BUG: flush_write bit should not be set here!");
- for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
- heap_sift(&c->flush_btree, i, journal_min_cmp);
- }
-
- b = NULL;
- heap_pop(&c->flush_btree, b, journal_min_cmp);
- spin_unlock(&c->journal.lock);
-
- if (b) {
mutex_lock(&b->write_lock);
+
+ if (!btree_node_dirty(b)) {
+ mutex_unlock(&b->write_lock);
+ continue;
+ }
+
if (!btree_current_write(b)->journal) {
mutex_unlock(&b->write_lock);
- /* We raced */
- atomic_long_inc(&c->retry_flush_write);
- goto retry;
+ continue;
+ }
+
+ set_btree_node_journal_flush(b);
+
+ mutex_unlock(&b->write_lock);
+
+ btree_nodes[n++] = b;
+ if (n == BTREE_FLUSH_NR)
+ break;
+ }
+ mutex_unlock(&c->bucket_lock);
+
+ for (i = 0; i < n; i++) {
+ b = btree_nodes[i];
+ if (!b) {
+ pr_err("BUG: btree_nodes[%d] is NULL", i);
+ continue;
+ }
+
+ /* safe to check without holding b->write_lock */
+ if (!btree_node_journal_flush(b)) {
+ pr_err("BUG: bnode %p: journal_flush bit cleaned", b);
+ continue;
+ }
+
+ mutex_lock(&b->write_lock);
+ if (!btree_current_write(b)->journal) {
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
+ mutex_unlock(&b->write_lock);
+ pr_debug("bnode %p: written by others", b);
+ continue;
+ }
+
+ if (!btree_node_dirty(b)) {
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
+ mutex_unlock(&b->write_lock);
+ pr_debug("bnode %p: dirty bit cleaned by others", b);
+ continue;
}
__bch_btree_node_write(b, NULL);
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
}
+
+ spin_lock(&c->journal.flush_write_lock);
+ c->journal.btree_flushing = false;
+ spin_unlock(&c->journal.flush_write_lock);
}
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
@@ -538,13 +614,14 @@
k->ptr[n++] = MAKE_PTR(0,
bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
ca->sb.nr_this_dev);
+ atomic_long_inc(&c->reclaimed_journal_buckets);
}
- bkey_init(k);
- SET_KEY_PTRS(k, n);
-
- if (n)
+ if (n) {
+ bkey_init(k);
+ SET_KEY_PTRS(k, n);
c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+ }
out:
if (!journal_full(&c->journal))
__closure_wake_up(&c->journal.wait);
@@ -663,7 +740,7 @@
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
bch_bio_map(bio, w->data);
- trace_bcache_journal_write(bio);
+ trace_bcache_journal_write(bio, w->data->keys);
bio_list_add(&list, bio);
SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
@@ -671,6 +748,9 @@
ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
}
+ /* If KEY_PTRS(k) == 0, this jset gets lost in air */
+ BUG_ON(i == 0);
+
atomic_dec_bug(&fifo_back(&c->journal.pin));
bch_journal_next(&c->journal);
journal_reclaim(c);
@@ -787,6 +867,10 @@
struct journal_write *w;
atomic_t *ret;
+ /* No journaling if CACHE_SET_IO_DISABLE set already */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+ return NULL;
+
if (!CACHE_SYNC(&c->sb))
return NULL;
@@ -831,7 +915,6 @@
free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
free_fifo(&c->journal.pin);
- free_heap(&c->flush_btree);
}
int bch_journal_alloc(struct cache_set *c)
@@ -839,6 +922,7 @@
struct journal *j = &c->journal;
spin_lock_init(&j->lock);
+ spin_lock_init(&j->flush_write_lock);
INIT_DELAYED_WORK(&j->work, journal_write_work);
c->journal_delay_ms = 100;
@@ -846,8 +930,7 @@
j->w[0].c = c;
j->w[1].c = c;
- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
- !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
return -ENOMEM;
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 66f0fac..f2ea34d 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -103,6 +103,8 @@
/* Embedded in struct cache_set */
struct journal {
spinlock_t lock;
+ spinlock_t flush_write_lock;
+ bool btree_flushing;
/* used when waiting because the journal was full */
struct closure_waitlist wait;
struct closure io;
@@ -154,6 +156,8 @@
struct bio_vec bv[8];
};
+#define BTREE_FLUSH_NR 8
+
#define journal_pin_cmp(c, l, r) \
(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 22944aa..41adcd1 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,11 +311,11 @@
* data is written it calls bch_journal, and after the keys have been added to
* the next journal write they're inserted into the btree.
*
- * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * It inserts the data in op->bio; bi_sector is used for the key offset,
* and op->inode is used for the key inode.
*
- * If s->bypass is true, instead of inserting the data it invalidates the
- * region of the cache represented by s->cache_bio and op->inode.
+ * If op->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
*/
void bch_data_insert(struct closure *cl)
{
@@ -329,12 +329,13 @@
bch_data_insert_start(cl);
}
-/* Congested? */
-
-unsigned int bch_get_congested(struct cache_set *c)
+/*
+ * Congested? Return 0 (not congested) or the limit (in sectors)
+ * beyond which we should bypass the cache due to congestion.
+ */
+unsigned int bch_get_congested(const struct cache_set *c)
{
int i;
- long rand;
if (!c->congested_read_threshold_us &&
!c->congested_write_threshold_us)
@@ -353,8 +354,7 @@
if (i > 0)
i = fract_exp_two(i, 6);
- rand = get_random_int();
- i -= bitmap_weight(&rand, BITS_PER_LONG);
+ i -= hweight32(get_random_u32());
return i > 0 ? i : 1;
}
@@ -376,7 +376,7 @@
{
struct cache_set *c = dc->disk.c;
unsigned int mode = cache_mode(dc);
- unsigned int sectors, congested = bch_get_congested(c);
+ unsigned int sectors, congested;
struct task_struct *task = current;
struct io *i;
@@ -392,10 +392,11 @@
/*
* Flag for bypass if the IO is for read-ahead or background,
- * unless the read-ahead request is for metadata (eg, for gfs2).
+ * unless the read-ahead request is for metadata
+ * (eg, for gfs2 or xfs).
*/
if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
- !(bio->bi_opf & REQ_META))
+ !(bio->bi_opf & (REQ_META|REQ_PRIO)))
goto skip;
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
@@ -411,6 +412,7 @@
goto rescale;
}
+ congested = bch_get_congested(c);
if (!congested && !dc->sequential_cutoff)
goto rescale;
@@ -705,14 +707,14 @@
{
struct search *s = container_of(cl, struct search, cl);
- atomic_dec(&s->d->c->search_inflight);
+ atomic_dec(&s->iop.c->search_inflight);
if (s->iop.bio)
bio_put(s->iop.bio);
bio_complete(s);
closure_debug_destroy(cl);
- mempool_free(s, &s->d->c->search);
+ mempool_free(s, &s->iop.c->search);
}
static inline struct search *search_alloc(struct bio *bio,
@@ -755,13 +757,13 @@
struct search *s = container_of(cl, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
- search_free(cl);
cached_dev_put(dc);
+ search_free(cl);
}
/* Process reads */
-static void cached_dev_cache_miss_done(struct closure *cl)
+static void cached_dev_read_error_done(struct closure *cl)
{
struct search *s = container_of(cl, struct search, cl);
@@ -799,7 +801,22 @@
closure_bio_submit(s->iop.c, bio, cl);
}
- continue_at(cl, cached_dev_cache_miss_done, NULL);
+ continue_at(cl, cached_dev_read_error_done, NULL);
+}
+
+static void cached_dev_cache_miss_done(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct bcache_device *d = s->d;
+
+ if (s->iop.replace_collision)
+ bch_mark_cache_miss_collision(s->iop.c, s->d);
+
+ if (s->iop.bio)
+ bio_free_pages(s->iop.bio);
+
+ cached_dev_bio_complete(cl);
+ closure_put(&d->cl);
}
static void cached_dev_read_done(struct closure *cl)
@@ -832,6 +849,7 @@
if (verify(dc) && s->recoverable && !s->read_dirty_data)
bch_data_verify(dc, s->orig_bio);
+ closure_get(&dc->disk.cl);
bio_complete(s);
if (s->iop.bio &&
@@ -877,7 +895,7 @@
}
if (!(bio->bi_opf & REQ_RAHEAD) &&
- !(bio->bi_opf & REQ_META) &&
+ !(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
reada = min_t(sector_t, dc->readahead >> 9,
get_capacity(bio->bi_disk) - bio_end_sector(bio));
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index aa055cf..c64dbd7 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -33,12 +33,12 @@
BKEY_PADDED(replace_key);
};
-unsigned int bch_get_congested(struct cache_set *c);
+unsigned int bch_get_congested(const struct cache_set *c);
void bch_data_insert(struct closure *cl);
void bch_cached_dev_request_init(struct cached_dev *dc);
void bch_flash_dev_request_init(struct bcache_device *d);
-extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
+extern struct kmem_cache *bch_search_cache;
#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 894410f..ba1c937 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -111,7 +111,7 @@
{
memset(&acc->total.cache_hits,
0,
- sizeof(unsigned long) * 7);
+ sizeof(struct cache_stats));
}
void bch_cache_accounting_destroy(struct cache_accounting *acc)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 03bb5ce..20ed838 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -25,8 +25,8 @@
#include <linux/reboot.h>
#include <linux/sysfs.h>
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+unsigned int bch_cutoff_writeback;
+unsigned int bch_cutoff_writeback_sync;
static const char bcache_magic[] = {
0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
@@ -40,6 +40,7 @@
static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
+bool bcache_is_reboot;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);
@@ -49,6 +50,7 @@
struct workqueue_struct *bcache_wq;
struct workqueue_struct *bch_journal_wq;
+
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
/* limitation of partitions number on single bcache device */
#define BCACHE_MINORS 128
@@ -197,7 +199,9 @@
static void write_bdev_super_endio(struct bio *bio)
{
struct cached_dev *dc = bio->bi_private;
- /* XXX: error checking */
+
+ if (bio->bi_status)
+ bch_count_backing_io_errors(dc, bio);
closure_put(&dc->sb_write);
}
@@ -418,6 +422,7 @@
{
BKEY_PADDED(key) k;
struct closure cl;
+ struct cache *ca;
closure_init_stack(&cl);
lockdep_assert_held(&bch_register_lock);
@@ -429,6 +434,10 @@
uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
closure_sync(&cl);
+ /* Only one bucket used for uuid write */
+ ca = PTR_CACHE(c, &k.key, 0);
+ atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
+
bkey_copy(&c->uuid_bucket, &k.key);
bkey_put(c, &k.key);
return 0;
@@ -657,6 +666,11 @@
void bcache_device_stop(struct bcache_device *d)
{
if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
+ /*
+ * closure_fn set to
+ * - cached device: cached_dev_flush()
+ * - flash dev: flash_dev_flush()
+ */
closure_queue(&d->cl);
}
@@ -681,6 +695,7 @@
{
unsigned int i;
struct cache *ca;
+ int ret;
for_each_cache(ca, d->c, i)
bd_link_disk_holder(ca->bdev, d->disk);
@@ -688,9 +703,13 @@
snprintf(d->name, BCACHEDEVNAME_SIZE,
"%s%u", name, d->id);
- WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
- sysfs_create_link(&c->kobj, &d->kobj, d->name),
- "Couldn't create device <-> cache set symlinks");
+ ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
+ if (ret < 0)
+ pr_err("Couldn't create device -> cache set symlink");
+
+ ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
+ if (ret < 0)
+ pr_err("Couldn't create cache set -> device symlink");
clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}
@@ -898,25 +917,33 @@
}
-void bch_cached_dev_run(struct cached_dev *dc)
+int bch_cached_dev_run(struct cached_dev *dc)
{
struct bcache_device *d = &dc->disk;
- char buf[SB_LABEL_SIZE + 1];
+ char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
char *env[] = {
"DRIVER=bcache",
kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
- NULL,
+ kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
NULL,
};
- memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
- buf[SB_LABEL_SIZE] = '\0';
- env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
+ if (dc->io_disable) {
+ pr_err("I/O disabled on cached dev %s",
+ dc->backing_dev_name);
+ kfree(env[1]);
+ kfree(env[2]);
+ kfree(buf);
+ return -EIO;
+ }
if (atomic_xchg(&dc->running, 1)) {
kfree(env[1]);
kfree(env[2]);
- return;
+ kfree(buf);
+ pr_info("cached dev %s is running already",
+ dc->backing_dev_name);
+ return -EBUSY;
}
if (!d->c &&
@@ -939,10 +966,14 @@
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
kfree(env[1]);
kfree(env[2]);
+ kfree(buf);
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
- sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
- pr_debug("error creating sysfs link");
+ sysfs_create_link(&disk_to_dev(d->disk)->kobj,
+ &d->kobj, "bcache")) {
+ pr_err("Couldn't create bcache dev <-> disk sysfs symlinks");
+ return -ENOMEM;
+ }
dc->status_update_thread = kthread_run(cached_dev_status_update,
dc, "bcache_status_update");
@@ -951,6 +982,8 @@
"continue to run without monitoring backing "
"device status");
}
+
+ return 0;
}
/*
@@ -988,7 +1021,6 @@
BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
BUG_ON(refcount_read(&dc->count));
- mutex_lock(&bch_register_lock);
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
@@ -1004,6 +1036,9 @@
bch_write_bdev_super(dc, &cl);
closure_sync(&cl);
+ mutex_lock(&bch_register_lock);
+
+ calc_cached_dev_sectors(dc->disk.c);
bcache_device_detach(&dc->disk);
list_move(&dc->list, &uncached_devices);
@@ -1045,6 +1080,7 @@
uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
struct uuid_entry *u;
struct cached_dev *exist_dc, *t;
+ int ret = 0;
if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
(!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
@@ -1144,6 +1180,8 @@
down_write(&dc->writeback_lock);
if (bch_cached_dev_writeback_start(dc)) {
up_write(&dc->writeback_lock);
+ pr_err("Couldn't start writeback facilities for %s",
+ dc->disk.disk->disk_name);
return -ENOMEM;
}
@@ -1154,7 +1192,22 @@
bch_sectors_dirty_init(&dc->disk);
- bch_cached_dev_run(dc);
+ ret = bch_cached_dev_run(dc);
+ if (ret && (ret != -EBUSY)) {
+ up_write(&dc->writeback_lock);
+ /*
+ * bch_register_lock is held, so bcache_device_stop() cannot
+ * be called directly. The kthread and kworker
+ * created previously in bch_cached_dev_writeback_start()
+ * have to be stopped manually here.
+ */
+ kthread_stop(dc->writeback_thread);
+ cancel_writeback_rate_update_dwork(dc);
+ pr_err("Couldn't run cached device %s",
+ dc->backing_dev_name);
+ return ret;
+ }
+
bcache_device_link(&dc->disk, c, "bdev");
atomic_inc(&c->attached_dev_nr);
@@ -1168,6 +1221,7 @@
return 0;
}
+/* when dc->disk.kobj released */
void bch_cached_dev_release(struct kobject *kobj)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -1180,18 +1234,16 @@
{
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
- mutex_lock(&bch_register_lock);
-
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
if (!IS_ERR_OR_NULL(dc->writeback_thread))
kthread_stop(dc->writeback_thread);
- if (dc->writeback_write_wq)
- destroy_workqueue(dc->writeback_write_wq);
if (!IS_ERR_OR_NULL(dc->status_update_thread))
kthread_stop(dc->status_update_thread);
+ mutex_lock(&bch_register_lock);
+
if (atomic_read(&dc->running))
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
bcache_device_free(&dc->disk);
@@ -1274,12 +1326,13 @@
/* Cached device - bcache superblock */
-static void register_bdev(struct cache_sb *sb, struct page *sb_page,
+static int register_bdev(struct cache_sb *sb, struct page *sb_page,
struct block_device *bdev,
struct cached_dev *dc)
{
const char *err = "cannot allocate memory";
struct cache_set *c;
+ int ret = -ENOMEM;
bdevname(bdev, dc->backing_dev_name);
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
@@ -1309,17 +1362,23 @@
bch_cached_dev_attach(dc, c, NULL);
if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
- BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
- bch_cached_dev_run(dc);
+ BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
+ err = "failed to run cached device";
+ ret = bch_cached_dev_run(dc);
+ if (ret)
+ goto err;
+ }
- return;
+ return 0;
err:
pr_notice("error %s: %s", dc->backing_dev_name, err);
bcache_device_stop(&dc->disk);
+ return ret;
}
/* Flash only volumes */
+/* When d->kobj released */
void bch_flash_dev_release(struct kobject *kobj)
{
struct bcache_device *d = container_of(kobj, struct bcache_device,
@@ -1425,8 +1484,6 @@
bool bch_cached_dev_error(struct cached_dev *dc)
{
- struct cache_set *c;
-
if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
return false;
@@ -1437,21 +1494,6 @@
pr_err("stop %s: too many IO errors on backing device %s\n",
dc->disk.disk->disk_name, dc->backing_dev_name);
- /*
- * If the cached device is still attached to a cache set,
- * even dc->io_disable is true and no more I/O requests
- * accepted, cache device internal I/O (writeback scan or
- * garbage collection) may still prevent bcache device from
- * being stopped. So here CACHE_SET_IO_DISABLE should be
- * set to c->flags too, to make the internal I/O to cache
- * device rejected and stopped immediately.
- * If c is NULL, that means the bcache device is not attached
- * to any cache set, then no CACHE_SET_IO_DISABLE bit to set.
- */
- c = dc->disk.c;
- if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
- pr_info("CACHE_SET_IO_DISABLE already set");
-
bcache_device_stop(&dc->disk);
return true;
}
@@ -1490,6 +1532,7 @@
return true;
}
+/* When c->kobj released */
void bch_cache_set_release(struct kobject *kobj)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
@@ -1504,13 +1547,13 @@
struct cache *ca;
unsigned int i;
- if (!IS_ERR_OR_NULL(c->debug))
- debugfs_remove(c->debug);
+ debugfs_remove(c->debug);
bch_open_buckets_free(c);
bch_btree_cache_free(c);
bch_journal_free(c);
+ mutex_lock(&bch_register_lock);
for_each_cache(ca, c, i)
if (ca) {
ca->set = NULL;
@@ -1529,7 +1572,6 @@
mempool_exit(&c->search);
kfree(c->devices);
- mutex_lock(&bch_register_lock);
list_del(&c->list);
mutex_unlock(&bch_register_lock);
@@ -1552,19 +1594,23 @@
kobject_put(&c->internal);
kobject_del(&c->kobj);
- if (c->gc_thread)
+ if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
if (!IS_ERR_OR_NULL(c->root))
list_add(&c->root->list, &c->btree_cache);
- /* Should skip this if we're unregistering because of an error */
- list_for_each_entry(b, &c->btree_cache, list) {
- mutex_lock(&b->write_lock);
- if (btree_node_dirty(b))
- __bch_btree_node_write(b, NULL);
- mutex_unlock(&b->write_lock);
- }
+ /*
+ * Avoid flushing cached nodes if cache set is retiring
+ * due to too many I/O errors detected.
+ */
+ if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+ list_for_each_entry(b, &c->btree_cache, list) {
+ mutex_lock(&b->write_lock);
+ if (btree_node_dirty(b))
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
+ }
for_each_cache(ca, c, i)
if (ca->alloc_thread)
@@ -1610,21 +1656,21 @@
*/
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
d->disk->disk_name);
- /*
- * There might be a small time gap that cache set is
- * released but bcache device is not. Inside this time
- * gap, regular I/O requests will directly go into
- * backing device as no cache set attached to. This
- * behavior may also introduce potential inconsistence
- * data in writeback mode while cache is dirty.
- * Therefore before calling bcache_device_stop() due
- * to a broken cache device, dc->io_disable should be
- * explicitly set to true.
- */
- dc->io_disable = true;
- /* make others know io_disable is true earlier */
- smp_mb();
- bcache_device_stop(d);
+ /*
+ * There might be a small time gap that cache set is
+ * released but bcache device is not. Inside this time
+ * gap, regular I/O requests will directly go into
+ * backing device as no cache set attached to. This
+ * behavior may also introduce potentially inconsistent
+ * data in writeback mode while cache is dirty.
+ * Therefore before calling bcache_device_stop() due
+ * to a broken cache device, dc->io_disable should be
+ * explicitly set to true.
+ */
+ dc->io_disable = true;
+ /* make others know io_disable is true earlier */
+ smp_mb();
+ bcache_device_stop(d);
} else {
/*
* dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
@@ -1668,6 +1714,7 @@
void bch_cache_set_stop(struct cache_set *c)
{
if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
+ /* closure_fn set to __cache_set_unregister() */
closure_queue(&c->caching);
}
@@ -1770,13 +1817,15 @@
return NULL;
}
-static void run_cache_set(struct cache_set *c)
+static int run_cache_set(struct cache_set *c)
{
const char *err = "cannot allocate memory";
struct cached_dev *dc, *t;
struct cache *ca;
struct closure cl;
unsigned int i;
+ LIST_HEAD(journal);
+ struct journal_replay *l;
closure_init_stack(&cl);
@@ -1785,7 +1834,6 @@
set_gc_sectors(c);
if (CACHE_SYNC(&c->sb)) {
- LIST_HEAD(journal);
struct bkey *k;
struct jset *j;
@@ -1835,6 +1883,23 @@
if (bch_btree_check(c))
goto err;
+ /*
+ * bch_btree_check() may occupy too much system memory, which
+ * has negative effects on user space application (e.g.
+ * database) performance. Shrink the mca cache memory proactively
+ * here to avoid competing for memory with user space workloads.
+ */
+ if (!c->shrinker_disabled) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = c->btree_cache_used * c->btree_pages;
+ /* first run to clear b->accessed tag */
+ c->shrink.scan_objects(&c->shrink, &sc);
+ /* second run to reap non-accessed nodes */
+ c->shrink.scan_objects(&c->shrink, &sc);
+ }
+
bch_journal_mark(c, &journal);
bch_initial_gc_finish(c);
pr_debug("btree_check() done");
@@ -1864,7 +1929,9 @@
if (j->version < BCACHE_JSET_VERSION_UUID)
__uuid_write(c);
- bch_journal_replay(c, &journal);
+ err = "bcache: replay journal failed";
+ if (bch_journal_replay(c, &journal))
+ goto err;
} else {
pr_notice("invalidating existing data");
@@ -1932,11 +1999,19 @@
flash_devs_run(c);
set_bit(CACHE_SET_RUNNING, &c->flags);
- return;
+ return 0;
err:
+ while (!list_empty(&journal)) {
+ l = list_first_entry(&journal, struct journal_replay, list);
+ list_del(&l->list);
+ kfree(l);
+ }
+
closure_sync(&cl);
- /* XXX: test this, it's broken */
+
bch_cache_set_error(c, "%s", err);
+
+ return -EIO;
}
static bool can_attach_cache(struct cache *ca, struct cache_set *c)
@@ -2000,8 +2075,11 @@
ca->set->cache[ca->sb.nr_this_dev] = ca;
c->cache_by_alloc[c->caches_loaded++] = ca;
- if (c->caches_loaded == c->sb.nr_in_set)
- run_cache_set(c);
+ if (c->caches_loaded == c->sb.nr_in_set) {
+ err = "failed to run cache set";
+ if (run_cache_set(c) < 0)
+ goto err;
+ }
return NULL;
err:
@@ -2011,6 +2089,7 @@
/* Cache device */
+/* When ca->kobj released */
void bch_cache_release(struct kobject *kobj)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
@@ -2046,6 +2125,8 @@
size_t free;
size_t btree_buckets;
struct bucket *b;
+ int ret = -ENOMEM;
+ const char *err = NULL;
__module_get(THIS_MODULE);
kobject_init(&ca->kobj, &bch_cache_ktype);
@@ -2063,27 +2144,93 @@
*/
btree_buckets = ca->sb.njournal_buckets ?: 8;
free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
+ if (!free) {
+ ret = -EPERM;
+ err = "ca->sb.nbuckets is too small";
+ goto err_free;
+ }
- if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
- !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
- !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
- !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
- !(ca->buckets = vzalloc(array_size(sizeof(struct bucket),
- ca->sb.nbuckets))) ||
- !(ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
- prio_buckets(ca), 2),
- GFP_KERNEL)) ||
- !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
- return -ENOMEM;
+ if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
+ GFP_KERNEL)) {
+ err = "ca->free[RESERVE_BTREE] alloc failed";
+ goto err_btree_alloc;
+ }
+
+ if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
+ GFP_KERNEL)) {
+ err = "ca->free[RESERVE_PRIO] alloc failed";
+ goto err_prio_alloc;
+ }
+
+ if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
+ err = "ca->free[RESERVE_MOVINGGC] alloc failed";
+ goto err_movinggc_alloc;
+ }
+
+ if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
+ err = "ca->free[RESERVE_NONE] alloc failed";
+ goto err_none_alloc;
+ }
+
+ if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
+ err = "ca->free_inc alloc failed";
+ goto err_free_inc_alloc;
+ }
+
+ if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
+ err = "ca->heap alloc failed";
+ goto err_heap_alloc;
+ }
+
+ ca->buckets = vzalloc(array_size(sizeof(struct bucket),
+ ca->sb.nbuckets));
+ if (!ca->buckets) {
+ err = "ca->buckets alloc failed";
+ goto err_buckets_alloc;
+ }
+
+ ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
+ prio_buckets(ca), 2),
+ GFP_KERNEL);
+ if (!ca->prio_buckets) {
+ err = "ca->prio_buckets alloc failed";
+ goto err_prio_buckets_alloc;
+ }
+
+ ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
+ if (!ca->disk_buckets) {
+ err = "ca->disk_buckets alloc failed";
+ goto err_disk_buckets_alloc;
+ }
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
for_each_bucket(b, ca)
atomic_set(&b->pin, 0);
-
return 0;
+
+err_disk_buckets_alloc:
+ kfree(ca->prio_buckets);
+err_prio_buckets_alloc:
+ vfree(ca->buckets);
+err_buckets_alloc:
+ free_heap(&ca->heap);
+err_heap_alloc:
+ free_fifo(&ca->free_inc);
+err_free_inc_alloc:
+ free_fifo(&ca->free[RESERVE_NONE]);
+err_none_alloc:
+ free_fifo(&ca->free[RESERVE_MOVINGGC]);
+err_movinggc_alloc:
+ free_fifo(&ca->free[RESERVE_PRIO]);
+err_prio_alloc:
+ free_fifo(&ca->free[RESERVE_BTREE]);
+err_btree_alloc:
+err_free:
+ module_put(THIS_MODULE);
+ if (err)
+ pr_notice("error %s: %s", ca->cache_dev_name, err);
+ return ret;
}
static int register_cache(struct cache_sb *sb, struct page *sb_page,
@@ -2106,9 +2253,17 @@
ret = cache_alloc(ca);
if (ret != 0) {
+ /*
+ * If we failed here, it means ca->kobj is not initialized yet,
+ * kobject_put() won't be called and there is no chance to
+ * call blkdev_put() to bdev in bch_cache_release(). So we
+ * explicitly call blkdev_put() here.
+ */
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM";
+ else if (ret == -EPERM)
+ err = "cache_alloc(): cache device is too small";
else
err = "cache_alloc(): unknown error";
goto err;
@@ -2147,9 +2302,13 @@
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size);
+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
+ struct kobj_attribute *attr,
+ const char *buffer, size_t size);
kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);
+kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
static bool bch_is_open_backing(struct block_device *bdev)
{
@@ -2187,7 +2346,7 @@
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
- ssize_t ret = size;
+ ssize_t ret = -EINVAL;
const char *err = "cannot allocate memory";
char *path = NULL;
struct cache_sb *sb = NULL;
@@ -2197,6 +2356,11 @@
if (!try_module_get(THIS_MODULE))
return -EBUSY;
+ /* For latest state of bcache_is_reboot */
+ smp_mb();
+ if (bcache_is_reboot)
+ return -EBUSY;
+
path = kstrndup(buffer, size, GFP_KERNEL);
if (!path)
goto err;
@@ -2221,7 +2385,7 @@
if (!IS_ERR(bdev))
bdput(bdev);
if (attr == &ksysfs_register_quiet)
- goto out;
+ goto quiet_out;
}
goto err;
}
@@ -2242,17 +2406,23 @@
goto err_close;
mutex_lock(&bch_register_lock);
- register_bdev(sb, sb_page, bdev, dc);
+ ret = register_bdev(sb, sb_page, bdev, dc);
mutex_unlock(&bch_register_lock);
+ /* blkdev_put() will be called in cached_dev_free() */
+ if (ret < 0)
+ goto err;
} else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto err_close;
+ /* blkdev_put() will be called in bch_cache_release() */
if (register_cache(sb, sb_page, bdev, ca) != 0)
goto err;
}
+quiet_out:
+ ret = size;
out:
if (sb_page)
put_page(sb_page);
@@ -2265,12 +2435,64 @@
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
pr_info("error %s: %s", path, err);
- ret = -EINVAL;
goto out;
}
+
+struct pdev {
+ struct list_head list;
+ struct cached_dev *dc;
+};
+/* Stop uncached (pending) devices whose cache set is not registered. */
+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
+					 struct kobj_attribute *attr,
+					 const char *buffer,
+					 size_t size)
+{
+	LIST_HEAD(pending_devs);
+	ssize_t ret = size;
+	struct cached_dev *dc, *tdc;
+	struct pdev *pdev, *tpdev;
+	struct cache_set *c, *tc;
+
+	mutex_lock(&bch_register_lock);
+	list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
+		pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
+		if (!pdev)
+			break;
+		pdev->dc = dc;
+		list_add(&pdev->list, &pending_devs);
+	}
+
+	list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
+		list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
+			char *pdev_set_uuid = pdev->dc->sb.set_uuid;
+			char *set_uuid = c->sb.set_uuid; /* set UUID, as in attach */
+
+			if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
+				list_del(&pdev->list);
+				kfree(pdev);
+				break;
+			}
+		}
+	}
+	mutex_unlock(&bch_register_lock);
+
+	list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
+		pr_info("delete pdev %p", pdev);
+		list_del(&pdev->list);
+		bcache_device_stop(&pdev->dc->disk);
+		kfree(pdev);
+	}
+
+	return ret;
+}
+
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
+ if (bcache_is_reboot)
+ return NOTIFY_DONE;
+
if (code == SYS_DOWN ||
code == SYS_HALT ||
code == SYS_POWER_OFF) {
@@ -2283,22 +2505,57 @@
mutex_lock(&bch_register_lock);
+ if (bcache_is_reboot)
+ goto out;
+
+ /* New registrations are rejected from now on */
+ bcache_is_reboot = true;
+ /*
+ * Make sure a registering caller (if any) on another CPU
+ * core sees bcache_is_reboot set to true as early as possible
+ */
+ smp_mb();
+
if (list_empty(&bch_cache_sets) &&
list_empty(&uncached_devices))
goto out;
+ mutex_unlock(&bch_register_lock);
+
pr_info("Stopping all devices:");
+ /*
+ * The reason bch_register_lock is not held while calling
+ * bch_cache_set_stop() and bcache_device_stop() is to
+ * avoid a potential deadlock during reboot, because the cache
+ * set or bcache device stopping process will acquire
+ * bch_register_lock too.
+ *
+ * We are safe here because bcache_is_reboot is already set
+ * to true, so register_bcache() will reject new
+ * registrations now. bcache_is_reboot also makes sure
+ * bcache_reboot() won't be re-entered by another thread,
+ * so there is no race in the following list iteration by
+ * list_for_each_entry_safe().
+ */
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
+
+ /*
+ * Give an early chance for other kthreads and
+ * kworkers to stop themselves
+ */
+ schedule();
+
/* What's a condition variable? */
while (1) {
- long timeout = start + 2 * HZ - jiffies;
+ long timeout = start + 10 * HZ - jiffies;
+ mutex_lock(&bch_register_lock);
stopped = list_empty(&bch_cache_sets) &&
list_empty(&uncached_devices);
@@ -2310,7 +2567,6 @@
mutex_unlock(&bch_register_lock);
schedule_timeout(timeout);
- mutex_lock(&bch_register_lock);
}
finish_wait(&unregister_wait, &wait);
@@ -2348,14 +2604,43 @@
mutex_destroy(&bch_register_lock);
}
+/* Check and fixup module parameters */
+static void check_module_parameters(void)
+{
+ if (bch_cutoff_writeback_sync == 0)
+ bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
+ else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
+ pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
+ bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
+ bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
+ }
+
+ if (bch_cutoff_writeback == 0)
+ bch_cutoff_writeback = CUTOFF_WRITEBACK;
+ else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
+ pr_warn("set bch_cutoff_writeback (%u) to max value %u",
+ bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
+ bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
+ }
+
+ if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
+ pr_warn("set bch_cutoff_writeback (%u) to %u",
+ bch_cutoff_writeback, bch_cutoff_writeback_sync);
+ bch_cutoff_writeback = bch_cutoff_writeback_sync;
+ }
+}
+
static int __init bcache_init(void)
{
static const struct attribute *files[] = {
&ksysfs_register.attr,
&ksysfs_register_quiet.attr,
+ &ksysfs_pendings_cleanup.attr,
NULL
};
+ check_module_parameters();
+
mutex_init(&bch_register_lock);
init_waitqueue_head(&unregister_wait);
register_reboot_notifier(&reboot);
@@ -2383,14 +2668,29 @@
sysfs_create_files(bcache_kobj, files))
goto err;
- bch_debug_init(bcache_kobj);
+ bch_debug_init();
closure_debug_init();
+ bcache_is_reboot = false;
+
return 0;
err:
bcache_exit();
return -ENOMEM;
}
+/*
+ * Module hooks
+ */
module_exit(bcache_exit);
module_init(bcache_init);
+
+module_param(bch_cutoff_writeback, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
+
+module_param(bch_cutoff_writeback_sync, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
+
+MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 26f035a..627dcea 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,7 +16,9 @@
#include <linux/sort.h>
#include <linux/sched/clock.h>
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
+extern bool bcache_is_reboot;
+
+/* Default is 0 ("writethrough") */
static const char * const bch_cache_modes[] = {
"writethrough",
"writeback",
@@ -25,7 +27,7 @@
NULL
};
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
+/* Default is 0 ("auto") */
static const char * const bch_stop_on_failure_modes[] = {
"auto",
"always",
@@ -67,6 +69,8 @@
read_attribute(btree_written);
read_attribute(metadata_written);
read_attribute(active_journal_entries);
+read_attribute(backing_dev_name);
+read_attribute(backing_dev_uuid);
sysfs_time_stats_attribute(btree_gc, sec, ms);
sysfs_time_stats_attribute(btree_split, sec, us);
@@ -82,12 +86,14 @@
read_attribute(state);
read_attribute(cache_read_races);
read_attribute(reclaim);
+read_attribute(reclaimed_journal_buckets);
read_attribute(flush_write);
-read_attribute(retry_flush_write);
read_attribute(writeback_keys_done);
read_attribute(writeback_keys_failed);
read_attribute(io_errors);
read_attribute(congested);
+read_attribute(cutoff_writeback);
+read_attribute(cutoff_writeback_sync);
rw_attribute(congested_read_threshold_us);
rw_attribute(congested_write_threshold_us);
@@ -128,6 +134,7 @@
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
+rw_attribute(gc_after_writeback);
rw_attribute(size);
static ssize_t bch_snprint_string_list(char *buf,
@@ -175,7 +182,7 @@
var_print(writeback_percent);
sysfs_hprint(writeback_rate,
wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
- sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
+ sysfs_printf(io_errors, "%i", atomic_read(&dc->io_errors));
sysfs_printf(io_error_limit, "%i", dc->error_limit);
sysfs_printf(io_disable, "%i", dc->io_disable);
var_print(writeback_rate_update_seconds);
@@ -240,6 +247,19 @@
return strlen(buf);
}
+ if (attr == &sysfs_backing_dev_name) {
+ snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name);
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+ if (attr == &sysfs_backing_dev_uuid) {
+ /* convert binary uuid into 36-byte string plus '\0' */
+ snprintf(buf, 36+1, "%pU", dc->sb.uuid);
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
#undef var
return 0;
}
@@ -253,18 +273,23 @@
struct cache_set *c;
struct kobj_uevent_env *env;
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
#define d_strtoul(var) sysfs_strtoul(var, dc->var)
#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
sysfs_strtoul(data_csum, dc->disk.data_csum);
d_strtoul(verify);
- d_strtoul(bypass_torture_test);
- d_strtoul(writeback_metadata);
- d_strtoul(writeback_running);
- d_strtoul(writeback_delay);
+ sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
+ sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
+ sysfs_strtoul_bool(writeback_running, dc->writeback_running);
+ sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX);
- sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+ sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
+ 0, bch_cutoff_writeback);
if (attr == &sysfs_writeback_rate) {
ssize_t ret;
@@ -283,9 +308,15 @@
sysfs_strtoul_clamp(writeback_rate_update_seconds,
dc->writeback_rate_update_seconds,
1, WRITEBACK_RATE_UPDATE_SECS_MAX);
- d_strtoul(writeback_rate_i_term_inverse);
- d_strtoul_nonzero(writeback_rate_p_term_inverse);
- d_strtoul_nonzero(writeback_rate_minimum);
+ sysfs_strtoul_clamp(writeback_rate_i_term_inverse,
+ dc->writeback_rate_i_term_inverse,
+ 1, UINT_MAX);
+ sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
+ dc->writeback_rate_p_term_inverse,
+ 1, UINT_MAX);
+ sysfs_strtoul_clamp(writeback_rate_minimum,
+ dc->writeback_rate_minimum,
+ 1, UINT_MAX);
sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX);
@@ -295,15 +326,20 @@
dc->io_disable = v ? 1 : 0;
}
- d_strtoi_h(sequential_cutoff);
+ sysfs_strtoul_clamp(sequential_cutoff,
+ dc->sequential_cutoff,
+ 0, UINT_MAX);
d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats)
bch_cache_accounting_clear(&dc->accounting);
if (attr == &sysfs_running &&
- strtoul_or_return(buf))
- bch_cached_dev_run(dc);
+ strtoul_or_return(buf)) {
+ v = bch_cached_dev_run(dc);
+ if (v)
+ return v;
+ }
if (attr == &sysfs_cache_mode) {
v = __sysfs_match_string(bch_cache_modes, -1, buf);
@@ -381,14 +417,40 @@
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
mutex_lock(&bch_register_lock);
size = __cached_dev_store(kobj, attr, buf, size);
- if (attr == &sysfs_writeback_running)
- bch_writeback_queue(dc);
+ if (attr == &sysfs_writeback_running) {
+ /* dc->writeback_running changed in __cached_dev_store() */
+ if (IS_ERR_OR_NULL(dc->writeback_thread)) {
+ /*
+ * reject setting it to 1 via sysfs if writeback
+ * kthread is not created yet.
+ */
+ if (dc->writeback_running) {
+ dc->writeback_running = false;
+ pr_err("%s: failed to run non-existent writeback thread",
+ dc->disk.disk->disk_name);
+ }
+ } else
+ /*
+ * writeback kthread will check if dc->writeback_running
+ * is true or false.
+ */
+ bch_writeback_queue(dc);
+ }
+ /*
+ * Only set BCACHE_DEV_WB_RUNNING when the cached device is attached
+ * to a cache set; otherwise it doesn't make sense.
+ */
if (attr == &sysfs_writeback_percent)
- if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+ if ((dc->disk.c != NULL) &&
+ (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)))
schedule_delayed_work(&dc->writeback_rate_update,
dc->writeback_rate_update_seconds * HZ);
@@ -415,7 +477,7 @@
&sysfs_writeback_rate_p_term_inverse,
&sysfs_writeback_rate_minimum,
&sysfs_writeback_rate_debug,
- &sysfs_errors,
+ &sysfs_io_errors,
&sysfs_io_error_limit,
&sysfs_io_disable,
&sysfs_dirty_data,
@@ -431,6 +493,8 @@
&sysfs_verify,
&sysfs_bypass_torture_test,
#endif
+ &sysfs_backing_dev_name,
+ &sysfs_backing_dev_uuid,
NULL
};
KTYPE(bch_cached_dev);
@@ -460,6 +524,10 @@
kobj);
struct uuid_entry *u = &d->c->uuids[d->id];
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
sysfs_strtoul(data_csum, d->data_csum);
if (attr == &sysfs_size) {
@@ -642,12 +710,12 @@
sysfs_print(reclaim,
atomic_long_read(&c->reclaim));
+ sysfs_print(reclaimed_journal_buckets,
+ atomic_long_read(&c->reclaimed_journal_buckets));
+
sysfs_print(flush_write,
atomic_long_read(&c->flush_write));
- sysfs_print(retry_flush_write,
- atomic_long_read(&c->retry_flush_write));
-
sysfs_print(writeback_keys_done,
atomic_long_read(&c->writeback_keys_done));
sysfs_print(writeback_keys_failed,
@@ -668,6 +736,9 @@
sysfs_print(congested_write_threshold_us,
c->congested_write_threshold_us);
+ sysfs_print(cutoff_writeback, bch_cutoff_writeback);
+ sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
+
sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
sysfs_printf(verify, "%i", c->verify);
sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
@@ -676,6 +747,7 @@
sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
sysfs_printf(io_disable, "%i",
test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -691,6 +763,10 @@
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
ssize_t v;
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
if (attr == &sysfs_unregister)
bch_cache_set_unregister(c);
@@ -725,21 +801,8 @@
bch_cache_accounting_clear(&c->accounting);
}
- if (attr == &sysfs_trigger_gc) {
- /*
- * Garbage collection thread only works when sectors_to_gc < 0,
- * when users write to sysfs entry trigger_gc, most of time
- * they want to forcibly triger gargage collection. Here -1 is
- * set to c->sectors_to_gc, to make gc_should_run() give a
- * chance to permit gc thread to run. "give a chance" means
- * before going into gc_should_run(), there is still chance
- * that c->sectors_to_gc being set to other positive value. So
- * writing sysfs entry trigger_gc won't always make sure gc
- * thread takes effect.
- */
- atomic_set(&c->sectors_to_gc, -1);
- wake_up_gc(c);
- }
+ if (attr == &sysfs_trigger_gc)
+ force_wake_up_gc(c);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
@@ -749,10 +812,12 @@
c->shrink.scan_objects(&c->shrink, &sc);
}
- sysfs_strtoul(congested_read_threshold_us,
- c->congested_read_threshold_us);
- sysfs_strtoul(congested_write_threshold_us,
- c->congested_write_threshold_us);
+ sysfs_strtoul_clamp(congested_read_threshold_us,
+ c->congested_read_threshold_us,
+ 0, UINT_MAX);
+ sysfs_strtoul_clamp(congested_write_threshold_us,
+ c->congested_write_threshold_us,
+ 0, UINT_MAX);
if (attr == &sysfs_errors) {
v = __sysfs_match_string(error_actions, -1, buf);
@@ -762,12 +827,20 @@
c->on_error = v;
}
- if (attr == &sysfs_io_error_limit)
- c->error_limit = strtoul_or_return(buf);
+ sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX);
/* See count_io_errors() for why 88 */
- if (attr == &sysfs_io_error_halflife)
- c->error_decay = strtoul_or_return(buf) / 88;
+ if (attr == &sysfs_io_error_halflife) {
+ unsigned long v = 0;
+ ssize_t ret;
+
+ ret = strtoul_safe_clamp(buf, v, 0, UINT_MAX);
+ if (!ret) {
+ c->error_decay = v / 88;
+ return size;
+ }
+ return ret;
+ }
if (attr == &sysfs_io_disable) {
v = strtoul_or_return(buf);
@@ -782,13 +855,21 @@
}
}
- sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
- sysfs_strtoul(verify, c->verify);
- sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
+ sysfs_strtoul_clamp(journal_delay_ms,
+ c->journal_delay_ms,
+ 0, USHRT_MAX);
+ sysfs_strtoul_bool(verify, c->verify);
+ sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled);
sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks);
- sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
- sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
- sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
+ sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite);
+ sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
+ sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled);
+ /*
+ * Writing gc_after_writeback here may overwrite an already-set
+ * BCH_DO_AUTO_GC; that is harmless because the flag will be
+ * set again at the next opportunity.
+ */
+ sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
return size;
}
@@ -805,6 +886,10 @@
{
struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
return bch_cache_set_store(&c->kobj, attr, buf, size);
}
@@ -854,8 +939,8 @@
&sysfs_bset_tree_stats,
&sysfs_cache_read_races,
&sysfs_reclaim,
+ &sysfs_reclaimed_journal_buckets,
&sysfs_flush_write,
- &sysfs_retry_flush_write,
&sysfs_writeback_keys_done,
&sysfs_writeback_keys_failed,
@@ -869,13 +954,17 @@
&sysfs_gc_always_rewrite,
&sysfs_btree_shrinker_disabled,
&sysfs_copy_gc_enabled,
+ &sysfs_gc_after_writeback,
&sysfs_io_disable,
+ &sysfs_cutoff_writeback,
+ &sysfs_cutoff_writeback_sync,
NULL
};
KTYPE(bch_cache_set_internal);
static int __bch_cache_cmp(const void *l, const void *r)
{
+ cond_resched();
return *((uint16_t *)r) - *((uint16_t *)l);
}
@@ -938,8 +1027,6 @@
!cached[n - 1])
--n;
- unused = ca->sb.nbuckets - n;
-
while (cached < p + n &&
*cached == BTREE_PRIO)
cached++, n--;
@@ -989,6 +1076,10 @@
struct cache *ca = container_of(kobj, struct cache, kobj);
ssize_t v;
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
index 3fe8242..215df32 100644
--- a/drivers/md/bcache/sysfs.h
+++ b/drivers/md/bcache/sysfs.h
@@ -79,11 +79,28 @@
return strtoul_safe(buf, var) ?: (ssize_t) size; \
} while (0)
+#define sysfs_strtoul_bool(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) { \
+ unsigned long v = strtoul_or_return(buf); \
+ \
+ var = v ? 1 : 0; \
+ return size; \
+ } \
+} while (0)
+
#define sysfs_strtoul_clamp(file, var, min, max) \
do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe_clamp(buf, var, min, max) \
- ?: (ssize_t) size; \
+ if (attr == &sysfs_ ## file) { \
+ unsigned long v = 0; \
+ ssize_t ret; \
+ ret = strtoul_safe_clamp(buf, v, min, max); \
+ if (!ret) { \
+ var = v; \
+ return size; \
+ } \
+ return ret; \
+ } \
} while (0)
#define strtoul_or_return(cp) \
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 20eddea..62fb917 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -270,7 +270,11 @@
int i;
struct bio_vec *bv;
- bio_for_each_segment_all(bv, bio, i) {
+ /*
+ * This is called on a brand-new bio, so it is safe to access the
+ * bvec table directly.
+ */
+ for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 00aab6a..c029f74 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -113,8 +113,6 @@
#define heap_full(h) ((h)->used == (h)->size)
-#define heap_empty(h) ((h)->used == 0)
-
#define DECLARE_FIFO(type, name) \
struct { \
size_t front, back, size, mask; \
@@ -560,17 +558,29 @@
return crc;
}
-/* Does linear interpolation between powers of two */
+/*
+ * A stepwise-linear pseudo-exponential. This returns 1 << (x >>
+ * fract_bits), with the less-significant bits filled in by linear
+ * interpolation.
+ *
+ * This can also be interpreted as a floating-point number format,
+ * where the low fract_bits are the mantissa (with implicit leading
+ * 1 bit), and the more significant bits are the exponent.
+ * The return value is 1.mantissa * 2^exponent.
+ *
+ * The way this is used, fract_bits is 6 and the largest possible
+ * input is CONGESTED_MAX-1 = 1023 (exponent 15, mantissa 0x1.fc),
+ * so the maximum output is 0xfe00.
+ */
static inline unsigned int fract_exp_two(unsigned int x,
unsigned int fract_bits)
{
- unsigned int fract = x & ~(~0 << fract_bits);
+ unsigned int mantissa = 1 << fract_bits; /* Implicit bit */
- x >>= fract_bits;
- x = 1 << x;
- x += (x * fract) >> fract_bits;
-
- return x;
+ mantissa += x & (mantissa - 1);
+ x >>= fract_bits; /* The exponent */
+ /* Largest intermediate value 0x3f8000 */
+ return mantissa << x >> fract_bits;
}
void bch_bio_map(struct bio *bio, void *base);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 08c3a9f..d60268f 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
+static void update_gc_after_writeback(struct cache_set *c)
+{
+ if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+ c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+ return;
+
+ c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
@@ -113,6 +122,9 @@
static bool set_at_max_writeback_rate(struct cache_set *c,
struct cached_dev *dc)
{
+ /* Don't set max writeback rate if gc is running */
+ if (!c->gc_mark_valid)
+ return false;
/*
* Idle_counter is increased everytime when update_writeback_rate() is
* called. If all backing devices attached to the same cache set have
@@ -191,6 +203,7 @@
if (!set_at_max_writeback_rate(c, dc)) {
down_read(&dc->writeback_lock);
__update_writeback_rate(dc);
+ update_gc_after_writeback(c);
up_read(&dc->writeback_lock);
}
}
@@ -689,6 +702,23 @@
up_write(&dc->writeback_lock);
break;
}
+
+ /*
+ * When the dirty data rate is high (e.g. 50%+), there
+ * might be heavy bucket fragmentation after writeback
+ * finishes, which hurts subsequent write performance.
+ * Users who really care about write performance may
+ * set BCH_ENABLE_AUTO_GC via sysfs; then, once
+ * BCH_DO_AUTO_GC is set, the garbage collection thread
+ * is woken up here. After moving gc, the shrunk btree
+ * and discarded free bucket space on the SSD may
+ * help subsequent write requests.
+ */
+ if (c->gc_after_writeback ==
+ (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
+ c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
+ force_wake_up_gc(c);
+ }
}
up_write(&dc->writeback_lock);
@@ -708,6 +738,10 @@
}
}
+ if (dc->writeback_write_wq) {
+ flush_workqueue(dc->writeback_write_wq);
+ destroy_workqueue(dc->writeback_write_wq);
+ }
cached_dev_put(dc);
wait_for_kthread_stop();
@@ -777,7 +811,7 @@
bch_keybuf_init(&dc->writeback_keys);
dc->writeback_metadata = true;
- dc->writeback_running = true;
+ dc->writeback_running = false;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -803,8 +837,10 @@
"bcache_writeback");
if (IS_ERR(dc->writeback_thread)) {
cached_dev_put(dc);
+ destroy_workqueue(dc->writeback_write_wq);
return PTR_ERR(dc->writeback_thread);
}
+ dc->writeback_running = true;
WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
schedule_delayed_work(&dc->writeback_rate_update,
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index d2b9fdb..4e4c681 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,12 +5,17 @@
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
+#define CUTOFF_WRITEBACK_MAX 70
+#define CUTOFF_WRITEBACK_SYNC_MAX 90
+
#define MAX_WRITEBACKS_IN_PASS 5
#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
#define WRITEBACK_RATE_UPDATE_SECS_MAX 60
#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5
+#define BCH_AUTO_GC_DIRTY_THRESHOLD 50
+
/*
* 14 (16384ths) is chosen here as something that each backing device
* should be a reasonable fraction of the share, and not to blow up
@@ -53,6 +58,9 @@
}
}
+extern unsigned int bch_cutoff_writeback;
+extern unsigned int bch_cutoff_writeback_sync;
+
static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
unsigned int cache_mode, bool would_skip)
{
@@ -60,7 +68,10 @@
if (cache_mode != CACHE_MODE_WRITEBACK ||
test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
- in_use > CUTOFF_WRITEBACK_SYNC)
+ in_use > bch_cutoff_writeback_sync)
+ return false;
+
+ if (bio_op(bio) == REQ_OP_DISCARD)
return false;
if (dc->partial_stripes_expensive &&
@@ -73,7 +84,7 @@
return (op_is_sync(bio->bi_opf) ||
bio->bi_opf & (REQ_META|REQ_PRIO) ||
- in_use <= CUTOFF_WRITEBACK);
+ in_use <= bch_cutoff_writeback);
}
static inline void bch_writeback_queue(struct cached_dev *dc)
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index dc385b7..2d519c2 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -33,7 +33,8 @@
#define DM_BUFIO_MEMORY_PERCENT 2
#define DM_BUFIO_VMALLOC_PERCENT 25
-#define DM_BUFIO_WRITEBACK_PERCENT 75
+#define DM_BUFIO_WRITEBACK_RATIO 3
+#define DM_BUFIO_LOW_WATERMARK_RATIO 16
/*
* Check buffer ages in this interval (seconds)
@@ -65,7 +66,7 @@
/*
* Linking of buffers:
- * All buffers are linked to cache_hash with their hash_list field.
+ * All buffers are linked to buffer_tree with their node field.
*
* Clean buffers that are not being written (B_WRITING not set)
* are linked to lru[LIST_CLEAN] with their lru_list field.
@@ -132,12 +133,14 @@
struct dm_buffer {
struct rb_node node;
struct list_head lru_list;
+ struct list_head global_list;
sector_t block;
void *data;
unsigned char data_mode; /* DATA_MODE_* */
unsigned char list_mode; /* LIST_* */
blk_status_t read_error;
blk_status_t write_error;
+ unsigned accessed;
unsigned hold_count;
unsigned long state;
unsigned long last_accessed;
@@ -150,7 +153,7 @@
void (*end_io)(struct dm_buffer *, blk_status_t);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
- struct stack_trace stack_trace;
+ unsigned int stack_len;
unsigned long stack_entries[MAX_STACK];
#endif
};
@@ -192,7 +195,11 @@
*/
static unsigned long dm_bufio_cache_size_latch;
-static DEFINE_SPINLOCK(param_spinlock);
+static DEFINE_SPINLOCK(global_spinlock);
+
+static LIST_HEAD(global_queue);
+
+static unsigned long global_num = 0;
/*
* Buffers are freed after this timeout
@@ -209,11 +216,6 @@
/*----------------------------------------------------------------*/
/*
- * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
- */
-static unsigned long dm_bufio_cache_size_per_client;
-
-/*
* The current number of clients.
*/
static int dm_bufio_client_count;
@@ -224,19 +226,19 @@
static LIST_HEAD(dm_bufio_all_clients);
/*
- * This mutex protects dm_bufio_cache_size_latch,
- * dm_bufio_cache_size_per_client and dm_bufio_client_count
+ * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
*/
static DEFINE_MUTEX(dm_bufio_clients_lock);
+static struct workqueue_struct *dm_bufio_wq;
+static struct delayed_work dm_bufio_cleanup_old_work;
+static struct work_struct dm_bufio_replacement_work;
+
+
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
static void buffer_record_stack(struct dm_buffer *b)
{
- b->stack_trace.nr_entries = 0;
- b->stack_trace.max_entries = MAX_STACK;
- b->stack_trace.entries = b->stack_entries;
- b->stack_trace.skip = 2;
- save_stack_trace(&b->stack_trace);
+ b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
}
#endif
@@ -289,15 +291,23 @@
/*----------------------------------------------------------------*/
-static void adjust_total_allocated(unsigned char data_mode, long diff)
+static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
{
+ unsigned char data_mode;
+ long diff;
+
static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
&dm_bufio_allocated_kmem_cache,
&dm_bufio_allocated_get_free_pages,
&dm_bufio_allocated_vmalloc,
};
- spin_lock(¶m_spinlock);
+ data_mode = b->data_mode;
+ diff = (long)b->c->block_size;
+ if (unlink)
+ diff = -diff;
+
+ spin_lock(&global_spinlock);
*class_ptr[data_mode] += diff;
@@ -306,7 +316,19 @@
if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
dm_bufio_peak_allocated = dm_bufio_current_allocated;
- spin_unlock(¶m_spinlock);
+ b->accessed = 1;
+
+ if (!unlink) {
+ list_add(&b->global_list, &global_queue);
+ global_num++;
+ if (dm_bufio_current_allocated > dm_bufio_cache_size)
+ queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
+ } else {
+ list_del(&b->global_list);
+ global_num--;
+ }
+
+ spin_unlock(&global_spinlock);
}
/*
@@ -327,9 +349,6 @@
dm_bufio_default_cache_size);
dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
}
-
- dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
- (dm_bufio_client_count ? : 1);
}
/*
@@ -435,10 +454,8 @@
return NULL;
}
- adjust_total_allocated(b->data_mode, (long)c->block_size);
-
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
- memset(&b->stack_trace, 0, sizeof(b->stack_trace));
+ b->stack_len = 0;
#endif
return b;
}
@@ -450,14 +467,12 @@
{
struct dm_bufio_client *c = b->c;
- adjust_total_allocated(b->data_mode, -(long)c->block_size);
-
free_buffer_data(c, b->data, b->data_mode);
kmem_cache_free(c->slab_buffer, b);
}
/*
- * Link buffer to the hash list and clean or dirty queue.
+ * Link buffer to the buffer tree and clean or dirty queue.
*/
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
@@ -469,10 +484,12 @@
list_add(&b->lru_list, &c->lru[dirty]);
__insert(b->c, b);
b->last_accessed = jiffies;
+
+ adjust_total_allocated(b, false);
}
/*
- * Unlink buffer from the hash list and dirty or clean queue.
+ * Unlink buffer from the buffer tree and dirty or clean queue.
*/
static void __unlink_buffer(struct dm_buffer *b)
{
@@ -483,6 +500,8 @@
c->n_buffers[b->list_mode]--;
__remove(b->c, b);
list_del(&b->lru_list);
+
+ adjust_total_allocated(b, true);
}
/*
@@ -492,6 +511,8 @@
{
struct dm_bufio_client *c = b->c;
+ b->accessed = 1;
+
BUG_ON(!c->n_buffers[b->list_mode]);
c->n_buffers[b->list_mode]--;
@@ -911,36 +932,6 @@
}
/*
- * Get writeback threshold and buffer limit for a given client.
- */
-static void __get_memory_limit(struct dm_bufio_client *c,
- unsigned long *threshold_buffers,
- unsigned long *limit_buffers)
-{
- unsigned long buffers;
-
- if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
- if (mutex_trylock(&dm_bufio_clients_lock)) {
- __cache_size_refresh();
- mutex_unlock(&dm_bufio_clients_lock);
- }
- }
-
- buffers = dm_bufio_cache_size_per_client;
- if (likely(c->sectors_per_block_bits >= 0))
- buffers >>= c->sectors_per_block_bits + SECTOR_SHIFT;
- else
- buffers /= c->block_size;
-
- if (buffers < c->minimum_buffers)
- buffers = c->minimum_buffers;
-
- *limit_buffers = buffers;
- *threshold_buffers = mult_frac(buffers,
- DM_BUFIO_WRITEBACK_PERCENT, 100);
-}
-
-/*
* Check if we're over watermark.
* If we are over threshold_buffers, start freeing buffers.
* If we're over "limit_buffers", block until we get under the limit.
@@ -948,23 +939,7 @@
static void __check_watermark(struct dm_bufio_client *c,
struct list_head *write_list)
{
- unsigned long threshold_buffers, limit_buffers;
-
- __get_memory_limit(c, &threshold_buffers, &limit_buffers);
-
- while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
- limit_buffers) {
-
- struct dm_buffer *b = __get_unclaimed_buffer(c);
-
- if (!b)
- return;
-
- __free_buffer_wake(b);
- cond_resched();
- }
-
- if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
+ if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
__write_dirty_buffers_async(c, 1, write_list);
}
@@ -993,7 +968,7 @@
/*
* We've had a period where the mutex was unlocked, so need to
- * recheck the hash table.
+ * recheck the buffer tree.
*/
b = __find(c, block);
if (b) {
@@ -1327,7 +1302,7 @@
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
/*
- * Use dm-io to send and empty barrier flush the device.
+ * Use dm-io to send an empty barrier to flush the device.
*/
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
@@ -1356,7 +1331,7 @@
* Then, we write the buffer to the original location if it was dirty.
*
* Then, if we are the only one who is holding the buffer, relink the buffer
- * in the hash queue for the new location.
+ * in the buffer tree for the new location.
*
* If there was someone else holding the buffer, we write it to the new
* location but not relink it, because that other user needs to have the buffer
@@ -1520,8 +1495,9 @@
DMERR("leaked buffer %llx, hold count %u, list %d",
(unsigned long long)b->block, b->hold_count, i);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
- print_stack_trace(&b->stack_trace, 1);
- b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */
+ stack_trace_print(b->stack_entries, b->stack_len, 1);
+ /* mark unclaimed to avoid BUG_ON below */
+ b->hold_count = 0;
#endif
}
@@ -1844,6 +1820,74 @@
dm_bufio_unlock(c);
}
+static void do_global_cleanup(struct work_struct *w)
+{
+ struct dm_bufio_client *locked_client = NULL;
+ struct dm_bufio_client *current_client;
+ struct dm_buffer *b;
+ unsigned spinlock_hold_count;
+ unsigned long threshold = dm_bufio_cache_size -
+ dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
+ unsigned long loops = global_num * 2;
+
+ mutex_lock(&dm_bufio_clients_lock);
+
+ while (1) {
+ cond_resched();
+
+ spin_lock(&global_spinlock);
+ if (unlikely(dm_bufio_current_allocated <= threshold))
+ break;
+
+ spinlock_hold_count = 0;
+get_next:
+ if (!loops--)
+ break;
+ if (unlikely(list_empty(&global_queue)))
+ break;
+ b = list_entry(global_queue.prev, struct dm_buffer, global_list);
+
+ if (b->accessed) {
+ b->accessed = 0;
+ list_move(&b->global_list, &global_queue);
+ if (likely(++spinlock_hold_count < 16))
+ goto get_next;
+ spin_unlock(&global_spinlock);
+ continue;
+ }
+
+ current_client = b->c;
+ if (unlikely(current_client != locked_client)) {
+ if (locked_client)
+ dm_bufio_unlock(locked_client);
+
+ if (!dm_bufio_trylock(current_client)) {
+ spin_unlock(&global_spinlock);
+ dm_bufio_lock(current_client);
+ locked_client = current_client;
+ continue;
+ }
+
+ locked_client = current_client;
+ }
+
+ spin_unlock(&global_spinlock);
+
+ if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
+ spin_lock(&global_spinlock);
+ list_move(&b->global_list, &global_queue);
+ spin_unlock(&global_spinlock);
+ }
+ }
+
+ spin_unlock(&global_spinlock);
+
+ if (locked_client)
+ dm_bufio_unlock(locked_client);
+
+ mutex_unlock(&dm_bufio_clients_lock);
+}
+
static void cleanup_old_buffers(void)
{
unsigned long max_age_hz = get_max_age_hz();
@@ -1859,14 +1903,11 @@
mutex_unlock(&dm_bufio_clients_lock);
}
-static struct workqueue_struct *dm_bufio_wq;
-static struct delayed_work dm_bufio_work;
-
static void work_fn(struct work_struct *w)
{
cleanup_old_buffers();
- queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
+ queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
DM_BUFIO_WORK_TIMER_SECS * HZ);
}
@@ -1887,7 +1928,7 @@
dm_bufio_allocated_vmalloc = 0;
dm_bufio_current_allocated = 0;
- mem = (__u64)mult_frac(totalram_pages - totalhigh_pages,
+ mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
if (mem > ULONG_MAX)
@@ -1908,8 +1949,9 @@
if (!dm_bufio_wq)
return -ENOMEM;
- INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
- queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
+ INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
+ INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
+ queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
DM_BUFIO_WORK_TIMER_SECS * HZ);
return 0;
@@ -1922,7 +1964,8 @@
{
int bug = 0;
- cancel_delayed_work_sync(&dm_bufio_work);
+ cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
+ flush_workqueue(dm_bufio_wq);
destroy_workqueue(dm_bufio_wq);
if (dm_bufio_client_count) {
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 6fc9383..151aa95 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1167,11 +1167,18 @@
if (r)
return r;
- for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ for (b = 0; ; b++) {
r = fn(context, cmd->discard_block_size, to_dblock(b),
dm_bitset_cursor_get_value(&c));
if (r)
break;
+
+ if (b >= (from_dblock(cmd->discard_nr_blocks) - 1))
+ break;
+
+ r = dm_bitset_cursor_next(&c);
+ if (r)
+ break;
}
dm_bitset_cursor_end(&c);
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 1b5b9ad..b61aac0 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1200,7 +1200,7 @@
struct policy_work work;
struct entry *e;
- if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
+ if (WARN_ON_ONCE(!mq->migrations_allowed))
return;
e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index b29a832..8346e6d 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -353,6 +353,7 @@
enum cache_metadata_mode mode;
enum cache_io_mode io_mode;
unsigned metadata_version;
+ bool discard_passdown:1;
};
struct cache_stats {
@@ -541,7 +542,7 @@
static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
{
- return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
+ return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
}
static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
@@ -553,9 +554,7 @@
{
struct dm_cache_migration *mg;
- mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT);
- if (!mg)
- return NULL;
+ mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);
memset(mg, 0, sizeof(*mg));
@@ -663,10 +662,6 @@
struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
- if (!cell_prealloc) {
- defer_bio(cache, bio);
- return false;
- }
build_key(oblock, end, &key);
r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
@@ -1492,11 +1487,6 @@
struct dm_bio_prison_cell_v2 *prealloc;
prealloc = alloc_prison_cell(cache);
- if (!prealloc) {
- DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
- mg_complete(mg, false);
- return -ENOMEM;
- }
/*
* Prevent writes to the block, but allow reads to continue.
@@ -1534,11 +1524,6 @@
}
mg = alloc_migration(cache);
- if (!mg) {
- policy_complete_background_work(cache->policy, op, false);
- background_work_end(cache);
- return -ENOMEM;
- }
mg->op = op;
mg->overwrite_bio = bio;
@@ -1627,10 +1612,6 @@
struct dm_bio_prison_cell_v2 *prealloc;
prealloc = alloc_prison_cell(cache);
- if (!prealloc) {
- invalidate_complete(mg, false);
- return -ENOMEM;
- }
build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
r = dm_cell_lock_v2(cache->prison, &key,
@@ -1668,10 +1649,6 @@
return -EPERM;
mg = alloc_migration(cache);
- if (!mg) {
- background_work_end(cache);
- return -ENOMEM;
- }
mg->overwrite_bio = bio;
mg->invalidate_cblock = cblock;
@@ -1899,7 +1876,11 @@
b = to_dblock(from_dblock(b) + 1);
}
- bio_endio(bio);
+ if (cache->features.discard_passdown) {
+ remap_to_origin(cache, bio);
+ generic_make_request(bio);
+ } else
+ bio_endio(bio);
return false;
}
@@ -2233,13 +2214,14 @@
cf->mode = CM_WRITE;
cf->io_mode = CM_IO_WRITEBACK;
cf->metadata_version = 1;
+ cf->discard_passdown = true;
}
static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
char **error)
{
static const struct dm_arg _args[] = {
- {0, 2, "Invalid number of cache feature arguments"},
+ {0, 3, "Invalid number of cache feature arguments"},
};
int r, mode_ctr = 0;
@@ -2274,6 +2256,9 @@
else if (!strcasecmp(arg, "metadata2"))
cf->metadata_version = 2;
+ else if (!strcasecmp(arg, "no_discard_passdown"))
+ cf->discard_passdown = false;
+
else {
*error = "Unrecognised cache feature requested";
return -EINVAL;
@@ -2496,7 +2481,6 @@
ti->num_discard_bios = 1;
ti->discards_supported = true;
- ti->split_discard_bios = false;
ti->per_io_data_size = sizeof(struct per_bio_data);
@@ -3120,6 +3104,39 @@
do_waker(&cache->waker.work);
}
+static void emit_flags(struct cache *cache, char *result,
+ unsigned maxlen, ssize_t *sz_ptr)
+{
+ ssize_t sz = *sz_ptr;
+ struct cache_features *cf = &cache->features;
+ unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1;
+
+ DMEMIT("%u ", count);
+
+ if (cf->metadata_version == 2)
+ DMEMIT("metadata2 ");
+
+ if (writethrough_mode(cache))
+ DMEMIT("writethrough ");
+
+ else if (passthrough_mode(cache))
+ DMEMIT("passthrough ");
+
+ else if (writeback_mode(cache))
+ DMEMIT("writeback ");
+
+ else {
+ DMEMIT("unknown ");
+ DMERR("%s: internal error: unknown io mode: %d",
+ cache_device_name(cache), (int) cf->io_mode);
+ }
+
+ if (!cf->discard_passdown)
+ DMEMIT("no_discard_passdown ");
+
+ *sz_ptr = sz;
+}
+
/*
* Status format:
*
@@ -3186,25 +3203,7 @@
(unsigned) atomic_read(&cache->stats.promotion),
(unsigned long) atomic_read(&cache->nr_dirty));
- if (cache->features.metadata_version == 2)
- DMEMIT("2 metadata2 ");
- else
- DMEMIT("1 ");
-
- if (writethrough_mode(cache))
- DMEMIT("writethrough ");
-
- else if (passthrough_mode(cache))
- DMEMIT("passthrough ");
-
- else if (writeback_mode(cache))
- DMEMIT("writeback ");
-
- else {
- DMERR("%s: internal error: unknown io mode: %d",
- cache_device_name(cache), (int) cache->features.io_mode);
- goto err;
- }
+ emit_flags(cache, result, maxlen, &sz);
DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
@@ -3433,14 +3432,62 @@
return r;
}
+static bool origin_dev_supports_discard(struct block_device *origin_bdev)
+{
+ struct request_queue *q = bdev_get_queue(origin_bdev);
+
+ return q && blk_queue_discard(q);
+}
+
+/*
+ * If discard_passdown was enabled verify that the origin device
+ * supports discards. Disable discard_passdown if not.
+ */
+static void disable_passdown_if_not_supported(struct cache *cache)
+{
+ struct block_device *origin_bdev = cache->origin_dev->bdev;
+ struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
+ const char *reason = NULL;
+ char buf[BDEVNAME_SIZE];
+
+ if (!cache->features.discard_passdown)
+ return;
+
+ if (!origin_dev_supports_discard(origin_bdev))
+ reason = "discard unsupported";
+
+ else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
+ reason = "max discard sectors smaller than a block";
+
+ if (reason) {
+ DMWARN("Origin device (%s) %s: Disabling discard passdown.",
+ bdevname(origin_bdev, buf), reason);
+ cache->features.discard_passdown = false;
+ }
+}
+
static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
+ struct block_device *origin_bdev = cache->origin_dev->bdev;
+ struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
+
+ if (!cache->features.discard_passdown) {
+ /* No passdown is done so setting own virtual limits */
+ limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
+ cache->origin_sectors);
+ limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
+ return;
+ }
+
/*
- * FIXME: these limits may be incompatible with the cache device
+ * cache_iterate_devices() is stacking both origin and fast device limits
+ * but discards aren't passed to fast device, so inherit origin's limits.
*/
- limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
- cache->origin_sectors);
- limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
+ limits->max_discard_sectors = origin_limits->max_discard_sectors;
+ limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
+ limits->discard_granularity = origin_limits->discard_granularity;
+ limits->discard_alignment = origin_limits->discard_alignment;
+ limits->discard_misaligned = origin_limits->discard_misaligned;
}
static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -3457,6 +3504,8 @@
blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
}
+
+ disable_passdown_if_not_supported(cache);
set_discard_limits(cache, limits);
}
@@ -3464,7 +3513,7 @@
static struct target_type cache_target = {
.name = "cache",
- .version = {2, 0, 0},
+ .version = {2, 1, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
new file mode 100644
index 0000000..6bc8c1d
--- /dev/null
+++ b/drivers/md/dm-clone-metadata.c
@@ -0,0 +1,964 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
+ */
+
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/bitops.h>
+#include <linux/bitmap.h>
+#include <linux/device-mapper.h>
+
+#include "persistent-data/dm-bitset.h"
+#include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-block-manager.h"
+#include "persistent-data/dm-transaction-manager.h"
+
+#include "dm-clone-metadata.h"
+
+#define DM_MSG_PREFIX "clone metadata"
+
+#define SUPERBLOCK_LOCATION 0
+#define SUPERBLOCK_MAGIC 0x8af27f64
+#define SUPERBLOCK_CSUM_XOR 257649492
+
+#define DM_CLONE_MAX_CONCURRENT_LOCKS 5
+
+#define UUID_LEN 16
+
+/* Min and max dm-clone metadata versions supported */
+#define DM_CLONE_MIN_METADATA_VERSION 1
+#define DM_CLONE_MAX_METADATA_VERSION 1
+
+/*
+ * On-disk metadata layout
+ */
+struct superblock_disk {
+ __le32 csum;
+ __le32 flags;
+ __le64 blocknr;
+
+ __u8 uuid[UUID_LEN];
+ __le64 magic;
+ __le32 version;
+
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+ __le64 region_size;
+ __le64 target_size;
+
+ __le64 bitset_root;
+} __packed;
+
+/*
+ * Region and Dirty bitmaps.
+ *
+ * dm-clone logically splits the source and destination devices in regions of
+ * fixed size. The destination device's regions are gradually hydrated, i.e.,
+ * we copy (clone) the source's regions to the destination device. Eventually,
+ * all regions will get hydrated and all I/O will be served from the
+ * destination device.
+ *
+ * We maintain an on-disk bitmap which tracks the state of each of the
+ * destination device's regions, i.e., whether they are hydrated or not.
+ *
+ * To save constantly doing look ups on disk we keep an in core copy of the
+ * on-disk bitmap, the region_map.
+ *
+ * To further reduce metadata I/O overhead we use a second bitmap, the dmap
+ * (dirty bitmap), which tracks the dirty words, i.e. longs, of the region_map.
+ *
+ * When a region finishes hydrating dm-clone calls
+ * dm_clone_set_region_hydrated(), or for discard requests
+ * dm_clone_cond_set_range(), which sets the corresponding bits in region_map
+ * and dmap.
+ *
+ * During a metadata commit we scan the dmap for dirty region_map words (longs)
+ * and update accordingly the on-disk metadata. Thus, we don't have to flush to
+ * disk the whole region_map. We can just flush the dirty region_map words.
+ *
+ * We use a dirty bitmap, which is smaller than the original region_map, to
+ * reduce the amount of memory accesses during a metadata commit. As dm-bitset
+ * accesses the on-disk bitmap in 64-bit word granularity, there is no
+ * significant benefit in tracking the dirty region_map bits with a smaller
+ * granularity.
+ *
+ * We could update directly the on-disk bitmap, when dm-clone calls either
+ * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but this
+ * inserts significant metadata I/O overhead in dm-clone's I/O path. Also, as
+ * these two functions don't block, we can call them in interrupt context,
+ * e.g., in a hooked overwrite bio's completion routine, and further reduce the
+ * I/O completion latency.
+ *
+ * We maintain two dirty bitmaps. During a metadata commit we atomically swap
+ * the currently used dmap with the unused one. This allows the metadata update
+ * functions to run concurrently with an ongoing commit.
+ */
+struct dirty_map {
+ unsigned long *dirty_words;
+ unsigned int changed;
+};
+
+struct dm_clone_metadata {
+ /* The metadata block device */
+ struct block_device *bdev;
+
+ sector_t target_size;
+ sector_t region_size;
+ unsigned long nr_regions;
+ unsigned long nr_words;
+
+ /* Spinlock protecting the region and dirty bitmaps. */
+ spinlock_t bitmap_lock;
+ struct dirty_map dmap[2];
+ struct dirty_map *current_dmap;
+
+ /*
+ * In core copy of the on-disk bitmap to save constantly doing look ups
+ * on disk.
+ */
+ unsigned long *region_map;
+
+ /* Protected by bitmap_lock */
+ unsigned int read_only;
+
+ struct dm_block_manager *bm;
+ struct dm_space_map *sm;
+ struct dm_transaction_manager *tm;
+
+ struct rw_semaphore lock;
+
+ struct dm_disk_bitset bitset_info;
+ dm_block_t bitset_root;
+
+ /*
+ * Reading the space map root can fail, so we read it into this
+ * buffer before the superblock is locked and updated.
+ */
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+ bool hydration_done:1;
+ bool fail_io:1;
+};
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Superblock validation.
+ */
+static void sb_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b, size_t sb_block_size)
+{
+ struct superblock_disk *sb;
+ u32 csum;
+
+ sb = dm_block_data(b);
+ sb->blocknr = cpu_to_le64(dm_block_location(b));
+
+ csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR);
+ sb->csum = cpu_to_le32(csum);
+}
+
+static int sb_check(struct dm_block_validator *v, struct dm_block *b,
+ size_t sb_block_size)
+{
+ struct superblock_disk *sb;
+ u32 csum, metadata_version;
+
+ sb = dm_block_data(b);
+
+ if (dm_block_location(b) != le64_to_cpu(sb->blocknr)) {
+ DMERR("Superblock check failed: blocknr %llu, expected %llu",
+ le64_to_cpu(sb->blocknr),
+ (unsigned long long)dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ if (le64_to_cpu(sb->magic) != SUPERBLOCK_MAGIC) {
+ DMERR("Superblock check failed: magic %llu, expected %llu",
+ le64_to_cpu(sb->magic),
+ (unsigned long long)SUPERBLOCK_MAGIC);
+ return -EILSEQ;
+ }
+
+ csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR);
+ if (sb->csum != cpu_to_le32(csum)) {
+ DMERR("Superblock check failed: checksum %u, expected %u",
+ csum, le32_to_cpu(sb->csum));
+ return -EILSEQ;
+ }
+
+ /* Check metadata version */
+ metadata_version = le32_to_cpu(sb->version);
+ if (metadata_version < DM_CLONE_MIN_METADATA_VERSION ||
+ metadata_version > DM_CLONE_MAX_METADATA_VERSION) {
+ DMERR("Clone metadata version %u found, but only versions between %u and %u supported.",
+ metadata_version, DM_CLONE_MIN_METADATA_VERSION,
+ DM_CLONE_MAX_METADATA_VERSION);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct dm_block_validator sb_validator = {
+ .name = "superblock",
+ .prepare_for_write = sb_prepare_for_write,
+ .check = sb_check
+};
+
+/*
+ * Check if the superblock is formatted or not. We consider the superblock to
+ * be formatted in case we find non-zero bytes in it.
+ */
+static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *formatted)
+{
+ int r;
+ unsigned int i, nr_words;
+ struct dm_block *sblock;
+ __le64 *data_le, zero = cpu_to_le64(0);
+
+ /*
+ * We don't use a validator here because the superblock could be all
+ * zeroes.
+ */
+ r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &sblock);
+ if (r) {
+ DMERR("Failed to read_lock superblock");
+ return r;
+ }
+
+ data_le = dm_block_data(sblock);
+ *formatted = false;
+
+ /* This assumes that the block size is a multiple of 8 bytes */
+ BUG_ON(dm_bm_block_size(bm) % sizeof(__le64));
+ nr_words = dm_bm_block_size(bm) / sizeof(__le64);
+ for (i = 0; i < nr_words; i++) {
+ if (data_le[i] != zero) {
+ *formatted = true;
+ break;
+ }
+ }
+
+ dm_bm_unlock(sblock);
+
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Low-level metadata handling.
+ */
+static inline int superblock_read_lock(struct dm_clone_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_read_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
+}
+
+static inline int superblock_write_lock(struct dm_clone_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
+}
+
+static inline int superblock_write_lock_zero(struct dm_clone_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock_zero(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
+}
+
+static int __copy_sm_root(struct dm_clone_metadata *cmd)
+{
+ int r;
+ size_t root_size;
+
+ r = dm_sm_root_size(cmd->sm, &root_size);
+ if (r)
+ return r;
+
+ return dm_sm_copy_root(cmd->sm, &cmd->metadata_space_map_root, root_size);
+}
+
+/* Save dm-clone metadata in superblock */
+static void __prepare_superblock(struct dm_clone_metadata *cmd,
+ struct superblock_disk *sb)
+{
+ sb->flags = cpu_to_le32(0UL);
+
+ /* FIXME: UUID is currently unused */
+ memset(sb->uuid, 0, sizeof(sb->uuid));
+
+ sb->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
+ sb->version = cpu_to_le32(DM_CLONE_MAX_METADATA_VERSION);
+
+ /* Save the metadata space_map root */
+ memcpy(&sb->metadata_space_map_root, &cmd->metadata_space_map_root,
+ sizeof(cmd->metadata_space_map_root));
+
+ sb->region_size = cpu_to_le64(cmd->region_size);
+ sb->target_size = cpu_to_le64(cmd->target_size);
+ sb->bitset_root = cpu_to_le64(cmd->bitset_root);
+}
+
+static int __open_metadata(struct dm_clone_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct superblock_disk *sb;
+
+ r = superblock_read_lock(cmd, &sblock);
+
+ if (r) {
+ DMERR("Failed to read_lock superblock");
+ return r;
+ }
+
+ sb = dm_block_data(sblock);
+
+ /* Verify that target_size and region_size haven't changed. */
+ if (cmd->region_size != le64_to_cpu(sb->region_size) ||
+ cmd->target_size != le64_to_cpu(sb->target_size)) {
+ DMERR("Region and/or target size don't match the ones in metadata");
+ r = -EINVAL;
+ goto out_with_lock;
+ }
+
+ r = dm_tm_open_with_sm(cmd->bm, SUPERBLOCK_LOCATION,
+ sb->metadata_space_map_root,
+ sizeof(sb->metadata_space_map_root),
+ &cmd->tm, &cmd->sm);
+
+ if (r) {
+ DMERR("dm_tm_open_with_sm failed");
+ goto out_with_lock;
+ }
+
+ dm_disk_bitset_init(cmd->tm, &cmd->bitset_info);
+ cmd->bitset_root = le64_to_cpu(sb->bitset_root);
+
+out_with_lock:
+ dm_bm_unlock(sblock);
+
+ return r;
+}
+
+static int __format_metadata(struct dm_clone_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct superblock_disk *sb;
+
+ r = dm_tm_create_with_sm(cmd->bm, SUPERBLOCK_LOCATION, &cmd->tm, &cmd->sm);
+ if (r) {
+ DMERR("Failed to create transaction manager");
+ return r;
+ }
+
+ dm_disk_bitset_init(cmd->tm, &cmd->bitset_info);
+
+ r = dm_bitset_empty(&cmd->bitset_info, &cmd->bitset_root);
+ if (r) {
+ DMERR("Failed to create empty on-disk bitset");
+ goto err_with_tm;
+ }
+
+ r = dm_bitset_resize(&cmd->bitset_info, cmd->bitset_root, 0,
+ cmd->nr_regions, false, &cmd->bitset_root);
+ if (r) {
+ DMERR("Failed to resize on-disk bitset to %lu entries", cmd->nr_regions);
+ goto err_with_tm;
+ }
+
+ /* Flush to disk all blocks, except the superblock */
+ r = dm_tm_pre_commit(cmd->tm);
+ if (r) {
+ DMERR("dm_tm_pre_commit failed");
+ goto err_with_tm;
+ }
+
+ r = __copy_sm_root(cmd);
+ if (r) {
+ DMERR("__copy_sm_root failed");
+ goto err_with_tm;
+ }
+
+ r = superblock_write_lock_zero(cmd, &sblock);
+ if (r) {
+ DMERR("Failed to write_lock superblock");
+ goto err_with_tm;
+ }
+
+ sb = dm_block_data(sblock);
+ __prepare_superblock(cmd, sb);
+ r = dm_tm_commit(cmd->tm, sblock);
+ if (r) {
+ DMERR("Failed to commit superblock");
+ goto err_with_tm;
+ }
+
+ return 0;
+
+err_with_tm:
+ dm_sm_destroy(cmd->sm);
+ dm_tm_destroy(cmd->tm);
+
+ return r;
+}
+
+static int __open_or_format_metadata(struct dm_clone_metadata *cmd, bool may_format_device)
+{
+ int r;
+ bool formatted = false;
+
+ r = __superblock_all_zeroes(cmd->bm, &formatted);
+ if (r)
+ return r;
+
+ if (!formatted)
+ return may_format_device ? __format_metadata(cmd) : -EPERM;
+
+ return __open_metadata(cmd);
+}
+
+static int __create_persistent_data_structures(struct dm_clone_metadata *cmd,
+ bool may_format_device)
+{
+ int r;
+
+ /* Create block manager */
+ cmd->bm = dm_block_manager_create(cmd->bdev,
+ DM_CLONE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
+ DM_CLONE_MAX_CONCURRENT_LOCKS);
+ if (IS_ERR(cmd->bm)) {
+ DMERR("Failed to create block manager");
+ return PTR_ERR(cmd->bm);
+ }
+
+ r = __open_or_format_metadata(cmd, may_format_device);
+ if (r)
+ dm_block_manager_destroy(cmd->bm);
+
+ return r;
+}
+
+static void __destroy_persistent_data_structures(struct dm_clone_metadata *cmd)
+{
+ dm_sm_destroy(cmd->sm);
+ dm_tm_destroy(cmd->tm);
+ dm_block_manager_destroy(cmd->bm);
+}
+
+/*---------------------------------------------------------------------------*/
+
+static size_t bitmap_size(unsigned long nr_bits)
+{
+ return BITS_TO_LONGS(nr_bits) * sizeof(long);
+}
+
+static int dirty_map_init(struct dm_clone_metadata *cmd)
+{
+ cmd->dmap[0].changed = 0;
+ cmd->dmap[0].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
+
+ if (!cmd->dmap[0].dirty_words) {
+ DMERR("Failed to allocate dirty bitmap");
+ return -ENOMEM;
+ }
+
+ cmd->dmap[1].changed = 0;
+ cmd->dmap[1].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
+
+ if (!cmd->dmap[1].dirty_words) {
+ DMERR("Failed to allocate dirty bitmap");
+ kvfree(cmd->dmap[0].dirty_words);
+ return -ENOMEM;
+ }
+
+ cmd->current_dmap = &cmd->dmap[0];
+
+ return 0;
+}
+
+static void dirty_map_exit(struct dm_clone_metadata *cmd)
+{
+ kvfree(cmd->dmap[0].dirty_words);
+ kvfree(cmd->dmap[1].dirty_words);
+}
+
+static int __load_bitset_in_core(struct dm_clone_metadata *cmd)
+{
+ int r;
+ unsigned long i;
+ struct dm_bitset_cursor c;
+
+ /* Flush bitset cache */
+ r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root);
+ if (r)
+ return r;
+
+ r = dm_bitset_cursor_begin(&cmd->bitset_info, cmd->bitset_root, cmd->nr_regions, &c);
+ if (r)
+ return r;
+
+ for (i = 0; ; i++) {
+ if (dm_bitset_cursor_get_value(&c))
+ __set_bit(i, cmd->region_map);
+ else
+ __clear_bit(i, cmd->region_map);
+
+ if (i >= (cmd->nr_regions - 1))
+ break;
+
+ r = dm_bitset_cursor_next(&c);
+
+ if (r)
+ break;
+ }
+
+ dm_bitset_cursor_end(&c);
+
+ return r;
+}
+
+struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev,
+ sector_t target_size,
+ sector_t region_size)
+{
+ int r;
+ struct dm_clone_metadata *cmd;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd) {
+ DMERR("Failed to allocate memory for dm-clone metadata");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ cmd->bdev = bdev;
+ cmd->target_size = target_size;
+ cmd->region_size = region_size;
+ cmd->nr_regions = dm_sector_div_up(cmd->target_size, cmd->region_size);
+ cmd->nr_words = BITS_TO_LONGS(cmd->nr_regions);
+
+ init_rwsem(&cmd->lock);
+ spin_lock_init(&cmd->bitmap_lock);
+ cmd->read_only = 0;
+ cmd->fail_io = false;
+ cmd->hydration_done = false;
+
+ cmd->region_map = kvmalloc(bitmap_size(cmd->nr_regions), GFP_KERNEL);
+ if (!cmd->region_map) {
+ DMERR("Failed to allocate memory for region bitmap");
+ r = -ENOMEM;
+ goto out_with_md;
+ }
+
+ r = __create_persistent_data_structures(cmd, true);
+ if (r)
+ goto out_with_region_map;
+
+ r = __load_bitset_in_core(cmd);
+ if (r) {
+ DMERR("Failed to load on-disk region map");
+ goto out_with_pds;
+ }
+
+ r = dirty_map_init(cmd);
+ if (r)
+ goto out_with_pds;
+
+ if (bitmap_full(cmd->region_map, cmd->nr_regions))
+ cmd->hydration_done = true;
+
+ return cmd;
+
+out_with_pds:
+ __destroy_persistent_data_structures(cmd);
+
+out_with_region_map:
+ kvfree(cmd->region_map);
+
+out_with_md:
+ kfree(cmd);
+
+ return ERR_PTR(r);
+}
+
+void dm_clone_metadata_close(struct dm_clone_metadata *cmd)
+{
+ if (!cmd->fail_io)
+ __destroy_persistent_data_structures(cmd);
+
+ dirty_map_exit(cmd);
+ kvfree(cmd->region_map);
+ kfree(cmd);
+}
+
+bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd)
+{
+ return cmd->hydration_done;
+}
+
+bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr)
+{
+ return dm_clone_is_hydration_done(cmd) || test_bit(region_nr, cmd->region_map);
+}
+
+bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
+ unsigned long start, unsigned long nr_regions)
+{
+ unsigned long bit;
+
+ if (dm_clone_is_hydration_done(cmd))
+ return true;
+
+ bit = find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
+
+ return (bit >= (start + nr_regions));
+}
+
+unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
+{
+ return bitmap_weight(cmd->region_map, cmd->nr_regions);
+}
+
+unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd,
+ unsigned long start)
+{
+ return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
+}
+
+static int __update_metadata_word(struct dm_clone_metadata *cmd, unsigned long word)
+{
+ int r;
+ unsigned long index = word * BITS_PER_LONG;
+ unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG);
+
+ while (index < max_index) {
+ if (test_bit(index, cmd->region_map)) {
+ r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root,
+ index, &cmd->bitset_root);
+
+ if (r) {
+ DMERR("dm_bitset_set_bit failed");
+ return r;
+ }
+ }
+ index++;
+ }
+
+ return 0;
+}
+
+static int __metadata_commit(struct dm_clone_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct superblock_disk *sb;
+
+ /* Flush bitset cache */
+ r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root);
+ if (r) {
+ DMERR("dm_bitset_flush failed");
+ return r;
+ }
+
+ /* Flush to disk all blocks, except the superblock */
+ r = dm_tm_pre_commit(cmd->tm);
+ if (r) {
+ DMERR("dm_tm_pre_commit failed");
+ return r;
+ }
+
+ /* Save the space map root in cmd->metadata_space_map_root */
+ r = __copy_sm_root(cmd);
+ if (r) {
+ DMERR("__copy_sm_root failed");
+ return r;
+ }
+
+ /* Lock the superblock */
+ r = superblock_write_lock_zero(cmd, &sblock);
+ if (r) {
+ DMERR("Failed to write_lock superblock");
+ return r;
+ }
+
+ /* Save the metadata in superblock */
+ sb = dm_block_data(sblock);
+ __prepare_superblock(cmd, sb);
+
+ /* Unlock superblock and commit it to disk */
+ r = dm_tm_commit(cmd->tm, sblock);
+ if (r) {
+ DMERR("Failed to commit superblock");
+ return r;
+ }
+
+ /*
+ * FIXME: Find a more efficient way to check if the hydration is done.
+ */
+ if (bitmap_full(cmd->region_map, cmd->nr_regions))
+ cmd->hydration_done = true;
+
+ return 0;
+}
+
+static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
+{
+ int r;
+ unsigned long word, flags;
+
+ word = 0;
+ do {
+ word = find_next_bit(dmap->dirty_words, cmd->nr_words, word);
+
+ if (word == cmd->nr_words)
+ break;
+
+ r = __update_metadata_word(cmd, word);
+
+ if (r)
+ return r;
+
+ __clear_bit(word, dmap->dirty_words);
+ word++;
+ } while (word < cmd->nr_words);
+
+ r = __metadata_commit(cmd);
+
+ if (r)
+ return r;
+
+ /* Update the changed flag */
+ spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ dmap->changed = 0;
+ spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+
+ return 0;
+}
+
+int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
+{
+ int r = -EPERM;
+ unsigned long flags;
+ struct dirty_map *dmap, *next_dmap;
+
+ down_write(&cmd->lock);
+
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+ goto out;
+
+ /* Get current dirty bitmap */
+ dmap = cmd->current_dmap;
+
+ /* Get next dirty bitmap */
+ next_dmap = (dmap == &cmd->dmap[0]) ? &cmd->dmap[1] : &cmd->dmap[0];
+
+ /*
+ * The last commit failed, so we don't have a clean dirty-bitmap to
+ * use.
+ */
+ if (WARN_ON(next_dmap->changed)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ /* Swap dirty bitmaps */
+ spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ cmd->current_dmap = next_dmap;
+ spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+
+ /*
+ * No one is accessing the old dirty bitmap anymore, so we can flush
+ * it.
+ */
+ r = __flush_dmap(cmd, dmap);
+out:
+ up_write(&cmd->lock);
+
+ return r;
+}
+
+int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr)
+{
+ int r = 0;
+ struct dirty_map *dmap;
+ unsigned long word, flags;
+
+ word = region_nr / BITS_PER_LONG;
+
+ spin_lock_irqsave(&cmd->bitmap_lock, flags);
+
+ if (cmd->read_only) {
+ r = -EPERM;
+ goto out;
+ }
+
+ dmap = cmd->current_dmap;
+
+ __set_bit(word, dmap->dirty_words);
+ __set_bit(region_nr, cmd->region_map);
+ dmap->changed = 1;
+
+out:
+ spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+
+ return r;
+}
+
+int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
+ unsigned long nr_regions)
+{
+ int r = 0;
+ struct dirty_map *dmap;
+ unsigned long word, region_nr, flags;
+
+ spin_lock_irqsave(&cmd->bitmap_lock, flags);
+
+ if (cmd->read_only) {
+ r = -EPERM;
+ goto out;
+ }
+
+ dmap = cmd->current_dmap;
+ for (region_nr = start; region_nr < (start + nr_regions); region_nr++) {
+ if (!test_bit(region_nr, cmd->region_map)) {
+ word = region_nr / BITS_PER_LONG;
+ __set_bit(word, dmap->dirty_words);
+ __set_bit(region_nr, cmd->region_map);
+ dmap->changed = 1;
+ }
+ }
+out:
+ spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+
+ return r;
+}
+
+/*
+ * WARNING: This must not be called concurrently with either
+ * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it changes
+ * cmd->region_map without taking the cmd->bitmap_lock spinlock. The only
+ * exception is after setting the metadata to read-only mode, using
+ * dm_clone_metadata_set_read_only().
+ *
+ * We don't take the spinlock because __load_bitset_in_core() does I/O, so it
+ * may block.
+ */
+int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd)
+{
+ int r = -EINVAL;
+
+ down_write(&cmd->lock);
+
+ if (cmd->fail_io)
+ goto out;
+
+ r = __load_bitset_in_core(cmd);
+out:
+ up_write(&cmd->lock);
+
+ return r;
+}
+
+bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd)
+{
+ bool r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ r = cmd->dmap[0].changed || cmd->dmap[1].changed;
+ spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+
+ return r;
+}
+
+int dm_clone_metadata_abort(struct dm_clone_metadata *cmd)
+{
+ int r = -EPERM;
+
+ down_write(&cmd->lock);
+
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+ goto out;
+
+ __destroy_persistent_data_structures(cmd);
+
+ r = __create_persistent_data_structures(cmd, false);
+ if (r) {
+ /* If something went wrong we can neither write nor read the metadata */
+ cmd->fail_io = true;
+ }
+out:
+ up_write(&cmd->lock);
+
+ return r;
+}
+
+void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd)
+{
+ unsigned long flags;
+
+ down_write(&cmd->lock);
+
+ spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ cmd->read_only = 1;
+ spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+
+ if (!cmd->fail_io)
+ dm_bm_set_read_only(cmd->bm);
+
+ up_write(&cmd->lock);
+}
+
+void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd)
+{
+ unsigned long flags;
+
+ down_write(&cmd->lock);
+
+ spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ cmd->read_only = 0;
+ spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+
+ if (!cmd->fail_io)
+ dm_bm_set_read_write(cmd->bm);
+
+ up_write(&cmd->lock);
+}
+
+int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&cmd->lock);
+
+ if (!cmd->fail_io)
+ r = dm_sm_get_nr_free(cmd->sm, result);
+
+ up_read(&cmd->lock);
+
+ return r;
+}
+
+int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&cmd->lock);
+
+ if (!cmd->fail_io)
+ r = dm_sm_get_nr_blocks(cmd->sm, result);
+
+ up_read(&cmd->lock);
+
+ return r;
+}
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h
new file mode 100644
index 0000000..434bff0
--- /dev/null
+++ b/drivers/md/dm-clone-metadata.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
+ */
+
+#ifndef DM_CLONE_METADATA_H
+#define DM_CLONE_METADATA_H
+
+#include "persistent-data/dm-block-manager.h"
+#include "persistent-data/dm-space-map-metadata.h"
+
+#define DM_CLONE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
+
+/*
+ * The metadata device is currently limited in size.
+ */
+#define DM_CLONE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
+
+/*
+ * A metadata device larger than 16GB triggers a warning.
+ */
+#define DM_CLONE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
+
+#define SPACE_MAP_ROOT_SIZE 128
+
+/* dm-clone metadata */
+struct dm_clone_metadata;
+
+/*
+ * Set region status to hydrated.
+ *
+ * @cmd: The dm-clone metadata
+ * @region_nr: The region number
+ *
+ * This function doesn't block, so it's safe to call it from interrupt context.
+ */
+int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr);
+
+/*
+ * Set status of all regions in the provided range to hydrated, if not already
+ * hydrated.
+ *
+ * @cmd: The dm-clone metadata
+ * @start: Starting region number
+ * @nr_regions: Number of regions in the range
+ *
+ * This function doesn't block, so it's safe to call it from interrupt context.
+ */
+int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
+ unsigned long nr_regions);
+
+/*
+ * Read existing or create fresh metadata.
+ *
+ * @bdev: The device storing the metadata
+ * @target_size: The target size
+ * @region_size: The region size
+ *
+ * @returns: The dm-clone metadata
+ *
+ * This function reads the superblock of @bdev and checks if it's all zeroes.
+ * If it is, it formats @bdev and creates fresh metadata. If it isn't, it
+ * validates the metadata stored in @bdev.
+ */
+struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev,
+ sector_t target_size,
+ sector_t region_size);
+
+/*
+ * Free the resources related to metadata management.
+ */
+void dm_clone_metadata_close(struct dm_clone_metadata *cmd);
+
+/*
+ * Commit dm-clone metadata to disk.
+ */
+int dm_clone_metadata_commit(struct dm_clone_metadata *cmd);
+
+/*
+ * Reload the in core copy of the on-disk bitmap.
+ *
+ * This should be used after aborting a metadata transaction and setting the
+ * metadata to read-only, to invalidate the in-core cache and make it match the
+ * on-disk metadata.
+ *
+ * WARNING: It must not be called concurrently with either
+ * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it updates
+ * the region bitmap without taking the relevant spinlock. We don't take the
+ * spinlock because dm_clone_reload_in_core_bitset() does I/O, so it may block.
+ *
+ * But, it's safe to use it after calling dm_clone_metadata_set_read_only(),
+ * because the latter sets the metadata to read-only mode. Both
+ * dm_clone_set_region_hydrated() and dm_clone_cond_set_range() refuse to touch
+ * the region bitmap, after calling dm_clone_metadata_set_read_only().
+ */
+int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd);
+
+/*
+ * Check whether dm-clone's metadata changed this transaction.
+ */
+bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd);
+
+/*
+ * Abort current metadata transaction and rollback metadata to the last
+ * committed transaction.
+ */
+int dm_clone_metadata_abort(struct dm_clone_metadata *cmd);
+
+/*
+ * Switches metadata to a read only mode. Once read-only mode has been entered
+ * the following functions will return -EPERM:
+ *
+ * dm_clone_metadata_commit()
+ * dm_clone_set_region_hydrated()
+ * dm_clone_cond_set_range()
+ * dm_clone_metadata_abort()
+ */
+void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd);
+void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd);
+
+/*
+ * Returns true if the hydration of the destination device is finished.
+ */
+bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd);
+
+/*
+ * Returns true if region @region_nr is hydrated.
+ */
+bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr);
+
+/*
+ * Returns true if all the regions in the range are hydrated.
+ */
+bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
+ unsigned long start, unsigned long nr_regions);
+
+/*
+ * Returns the number of hydrated regions.
+ */
+unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
+
+/*
+ * Returns the first unhydrated region with region_nr >= @start
+ */
+unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd,
+ unsigned long start);
+
+/*
+ * Get the number of free metadata blocks.
+ */
+int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd, dm_block_t *result);
+
+/*
+ * Get the total number of metadata blocks.
+ */
+int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd, dm_block_t *result);
+
+#endif /* DM_CLONE_METADATA_H */
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
new file mode 100644
index 0000000..4ca8f19
--- /dev/null
+++ b/drivers/md/dm-clone-target.c
@@ -0,0 +1,2191 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
+ */
+
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/err.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/dm-io.h>
+#include <linux/mutex.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/kdev_t.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/mempool.h>
+#include <linux/spinlock.h>
+#include <linux/blk_types.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/workqueue.h>
+#include <linux/backing-dev.h>
+#include <linux/device-mapper.h>
+
+#include "dm.h"
+#include "dm-clone-metadata.h"
+
+#define DM_MSG_PREFIX "clone"
+
+/*
+ * Minimum and maximum allowed region sizes
+ */
+#define MIN_REGION_SIZE (1 << 3) /* 4KB */
+#define MAX_REGION_SIZE (1 << 21) /* 1GB */
+
+#define MIN_HYDRATIONS 256 /* Size of hydration mempool */
+#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
+#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */
+
+#define COMMIT_PERIOD HZ /* 1 sec */
+
+/*
+ * Hydration hash table size: 1 << HASH_TABLE_BITS
+ */
+#define HASH_TABLE_BITS 15
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
+ "A percentage of time allocated for hydrating regions");
+
+/* Slab cache for struct dm_clone_region_hydration */
+static struct kmem_cache *_hydration_cache;
+
+/*
+ * dm-clone metadata modes.
+ *
+ * The declaration order is significant: callers test for "read-only or
+ * failed" with `get_clone_mode(clone) >= CM_READ_ONLY', and
+ * __set_clone_mode() uses the value as an index into its table of mode names.
+ */
+enum clone_metadata_mode {
+ CM_WRITE, /* metadata may be changed */
+ CM_READ_ONLY, /* metadata may not be changed */
+ CM_FAIL, /* all metadata I/O fails */
+};
+
+struct hash_table_bucket;
+
+struct clone { /* Per-target state; reached through ti->private */
+ struct dm_target *ti;
+ struct dm_target_callbacks callbacks;
+
+ struct dm_dev *metadata_dev;
+ struct dm_dev *dest_dev;
+ struct dm_dev *source_dev;
+
+ unsigned long nr_regions;
+ sector_t region_size; /* in sectors; compared against bio_sectors() */
+ unsigned int region_shift; /* sector <-> region conversions shift by this */
+
+ /*
+ * A metadata commit and the actions taken in case it fails should run
+ * as a single atomic step.
+ */
+ struct mutex commit_lock;
+
+ struct dm_clone_metadata *cmd;
+
+ /* Region hydration hash table */
+ struct hash_table_bucket *ht;
+
+ atomic_t ios_in_flight; /* background hydration backs off while non-zero */
+
+ wait_queue_head_t hydration_stopped; /* woken when hydrations_in_flight hits 0 */
+
+ mempool_t hydration_pool; /* backs alloc_hydration()/free_hydration() */
+
+ unsigned long last_commit_jiffies; /* updated after each successful commit */
+
+ /*
+ * We defer incoming WRITE bios for regions that are not hydrated,
+ * until after these regions have been hydrated.
+ *
+ * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
+ * metadata have been committed.
+ *
+ * The lock below protects the four deferred bio lists that follow it.
+ */
+ spinlock_t lock;
+ struct bio_list deferred_bios;
+ struct bio_list deferred_discard_bios;
+ struct bio_list deferred_flush_bios;
+ struct bio_list deferred_flush_completions;
+
+ /* Maximum number of regions being copied during background hydration. */
+ unsigned int hydration_threshold;
+
+ /* Number of regions to batch together during background hydration. */
+ unsigned int hydration_batch_size;
+
+ /* Which region to hydrate next */
+ unsigned long hydration_offset;
+
+ atomic_t hydrations_in_flight;
+
+ /*
+ * Save a copy of the table line rather than reconstructing it for the
+ * status.
+ */
+ unsigned int nr_ctr_args;
+ const char **ctr_args;
+
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+ struct delayed_work waker;
+
+ struct dm_kcopyd_client *kcopyd_client;
+
+ enum clone_metadata_mode mode; /* read with READ_ONCE(), written with WRITE_ONCE() */
+ unsigned long flags; /* DM_CLONE_* bit numbers, see below */
+};
+
+/*
+ * dm-clone flags: bit numbers within clone->flags, tested with test_bit().
+ */
+#define DM_CLONE_DISCARD_PASSDOWN 0
+#define DM_CLONE_HYDRATION_ENABLED 1
+#define DM_CLONE_HYDRATION_SUSPENDED 2
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Metadata failure handling.
+ */
+static enum clone_metadata_mode get_clone_mode(struct clone *clone)
+{
+	/* Lockless snapshot of the current metadata mode. */
+	enum clone_metadata_mode mode = READ_ONCE(clone->mode);
+
+	return mode;
+}
+
+static const char *clone_device_name(struct clone *clone)
+{
+	/* The name is taken from the table the target belongs to. */
+	struct dm_table *table = clone->ti->table;
+
+	return dm_table_device_name(table);
+}
+
+/*
+ * Switch the clone to @new_mode and mirror the change in the metadata layer.
+ *
+ * CM_FAIL is sticky: once the clone is in fail mode, every request leaves it
+ * there regardless of @new_mode. CM_READ_ONLY and CM_FAIL both put the
+ * metadata in read-only mode; CM_WRITE makes it writable again.
+ *
+ * A table event and a log message are generated only when the mode actually
+ * changes.
+ */
+static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
+{
+	/* Constant lookup table: built once, not re-created on every call. */
+	static const char * const descs[] = {
+		[CM_WRITE]	= "read-write",
+		[CM_READ_ONLY]	= "read-only",
+		[CM_FAIL]	= "fail",
+	};
+
+	enum clone_metadata_mode old_mode = get_clone_mode(clone);
+
+	/* Never move out of fail mode */
+	if (old_mode == CM_FAIL)
+		new_mode = CM_FAIL;
+
+	switch (new_mode) {
+	case CM_FAIL:
+	case CM_READ_ONLY:
+		dm_clone_metadata_set_read_only(clone->cmd);
+		break;
+
+	case CM_WRITE:
+		dm_clone_metadata_set_read_write(clone->cmd);
+		break;
+	}
+
+	WRITE_ONCE(clone->mode, new_mode);
+
+	if (new_mode != old_mode) {
+		dm_table_event(clone->ti->table);
+		DMINFO("%s: Switching to %s mode", clone_device_name(clone),
+		       descs[new_mode]);
+	}
+}
+
+/*
+ * Roll back the open metadata transaction. Does nothing when the clone is
+ * already in read-only or fail mode; a failed rollback moves the clone to
+ * fail mode.
+ */
+static void __abort_transaction(struct clone *clone)
+{
+	int r;
+	const char *name = clone_device_name(clone);
+
+	if (get_clone_mode(clone) >= CM_READ_ONLY)
+		return;
+
+	DMERR("%s: Aborting current metadata transaction", name);
+
+	r = dm_clone_metadata_abort(clone->cmd);
+	if (!r)
+		return;
+
+	DMERR("%s: Failed to abort metadata transaction", name);
+	__set_clone_mode(clone, CM_FAIL);
+}
+
+/*
+ * Re-read the on-disk bitset into memory. Skipped in fail mode; if the reload
+ * itself fails the clone is moved to fail mode.
+ */
+static void __reload_in_core_bitset(struct clone *clone)
+{
+	int r;
+	const char *name = clone_device_name(clone);
+
+	if (get_clone_mode(clone) == CM_FAIL)
+		return;
+
+	DMINFO("%s: Reloading on-disk bitmap", name);
+
+	r = dm_clone_reload_in_core_bitset(clone->cmd);
+	if (!r)
+		return;
+
+	DMERR("%s: Failed to reload on-disk bitmap", name);
+	__set_clone_mode(clone, CM_FAIL);
+}
+
+/*
+ * Common reaction to a failed metadata operation @op that returned @r: log
+ * the failure, abort the transaction, drop to read-only mode and reload the
+ * in-core bitset.
+ */
+static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
+{
+	const char *name = clone_device_name(clone);
+
+	DMERR("%s: Metadata operation `%s' failed: error = %d", name, op, r);
+
+	__abort_transaction(clone);
+	__set_clone_mode(clone, CM_READ_ONLY);
+
+	/*
+	 * The reload below can race with dm_clone_set_region_hydrated() and
+	 * dm_clone_cond_set_range(). That is harmless because the metadata has
+	 * already been switched to read-only mode.
+	 */
+	__reload_in_core_bitset(clone);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* Notify every task sleeping on the clone's hydration_stopped wait queue */
+static inline void wakeup_hydration_waiters(struct clone *clone)
+{
+	wait_queue_head_t *waitq = &clone->hydration_stopped;
+
+	wake_up_all(waitq);
+}
+
+static inline void wake_worker(struct clone *clone)
+{
+	/* Queue the clone's worker item on the clone's own workqueue. */
+	struct work_struct *work = &clone->worker;
+
+	queue_work(clone->wq, work);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * bio helper functions.
+ */
+static inline void remap_to_source(struct clone *clone, struct bio *bio)
+{
+	/* Point the bio at the source device. */
+	struct block_device *src = clone->source_dev->bdev;
+
+	bio_set_dev(bio, src);
+}
+
+static inline void remap_to_dest(struct clone *clone, struct bio *bio)
+{
+	/* Point the bio at the destination device. */
+	struct block_device *dst = clone->dest_dev->bdev;
+
+	bio_set_dev(bio, dst);
+}
+
+/*
+ * A bio triggers a metadata commit when it is a flush (per op_is_flush()) and
+ * the metadata changed during the current transaction.
+ */
+static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
+{
+	if (!op_is_flush(bio->bi_opf))
+		return false;
+
+	return dm_clone_changed_this_transaction(clone->cmd);
+}
+
+/*
+ * Get the address of the region in sectors.
+ *
+ * The region number is widened to sector_t before shifting. Shifting an
+ * unsigned long first would drop the high bits of the sector address wherever
+ * unsigned long is narrower than sector_t (32-bit kernels).
+ */
+static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
+{
+	return ((sector_t)region_nr << clone->region_shift);
+}
+
+/* Get the number of the region that contains the bio's first sector */
+static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
+{
+	sector_t start = bio->bi_iter.bi_sector;
+
+	return start >> clone->region_shift;
+}
+
+/*
+ * Get the region range covered by the bio.
+ *
+ * @rs receives the bio's start sector divided by the region size, rounded up;
+ * @re receives the bio's end sector divided by the region size, rounded down.
+ */
+static void bio_region_range(struct clone *clone, struct bio *bio,
+			     unsigned long *rs, unsigned long *re)
+{
+	sector_t first_sector = bio->bi_iter.bi_sector;
+	sector_t end_sector = bio_end_sector(bio);
+
+	*rs = dm_sector_div_up(first_sector, clone->region_size);
+	*re = end_sector >> clone->region_shift;
+}
+
+/* A bio overwrites a region if it is a WRITE exactly one region long */
+static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
+{
+	if (bio_data_dir(bio) != WRITE)
+		return false;
+
+	return bio_sectors(bio) == clone->region_size;
+}
+
+/* Drain @bios, completing every bio with @status. */
+static void fail_bios(struct bio_list *bios, blk_status_t status)
+{
+	struct bio *next;
+
+	for (next = bio_list_pop(bios); next; next = bio_list_pop(bios)) {
+		next->bi_status = status;
+		bio_endio(next);
+	}
+}
+
+/* Drain @bios, submitting every bio inside a single block plug. */
+static void submit_bios(struct bio_list *bios)
+{
+	struct blk_plug plug;
+	struct bio *next;
+
+	blk_start_plug(&plug);
+
+	for (next = bio_list_pop(bios); next; next = bio_list_pop(bios))
+		generic_make_request(next);
+
+	blk_finish_plug(&plug);
+}
+
+/*
+ * Submit bio to the underlying device.
+ *
+ * A bio that triggers a commit is not submitted here: it is queued on the
+ * deferred flush list and the worker is woken, so that it is issued only after
+ * the metadata have been committed.
+ *
+ * NOTE: The bio remapping must be performed by the caller.
+ */
+static void issue_bio(struct clone *clone, struct bio *bio)
+{
+	unsigned long irq_flags;
+
+	if (bio_triggers_commit(clone, bio)) {
+		/*
+		 * In RO or FAIL mode the metadata cannot be committed, so the
+		 * bio is completed with an error.
+		 */
+		if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+			bio_io_error(bio);
+			return;
+		}
+
+		/*
+		 * Bios that trigger commits are batched together; a single
+		 * commit is issued for them in process_deferred_flush_bios().
+		 */
+		spin_lock_irqsave(&clone->lock, irq_flags);
+		bio_list_add(&clone->deferred_flush_bios, bio);
+		spin_unlock_irqrestore(&clone->lock, irq_flags);
+
+		wake_worker(clone);
+		return;
+	}
+
+	generic_make_request(bio);
+}
+
+/*
+ * Redirect bio to the destination device, then hand it to issue_bio().
+ *
+ * As with issue_bio(), a bio that triggers a commit is delayed until after the
+ * metadata have been committed.
+ */
+static void remap_and_issue(struct clone *clone, struct bio *bio)
+{
+	bio_set_dev(bio, clone->dest_dev->bdev);
+	issue_bio(clone, bio);
+}
+
+/*
+ * Issue bios that have been deferred until after their region has finished
+ * hydrating.
+ *
+ * Nothing is submitted here. The bios are sorted onto the clone's deferred
+ * lists and the worker thread is woken to do the submission, so this is safe
+ * to call from interrupt context.
+ */
+static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
+{
+	struct bio *next;
+	unsigned long irq_flags;
+	struct bio_list need_commit = BIO_EMPTY_LIST;
+	struct bio_list plain = BIO_EMPTY_LIST;
+
+	if (bio_list_empty(bios))
+		return;
+
+	/* Sort the bios by whether they have to wait for a metadata commit. */
+	for (next = bio_list_pop(bios); next; next = bio_list_pop(bios)) {
+		struct bio_list *target;
+
+		target = bio_triggers_commit(clone, next) ? &need_commit : &plain;
+		bio_list_add(target, next);
+	}
+
+	spin_lock_irqsave(&clone->lock, irq_flags);
+	bio_list_merge(&clone->deferred_bios, &plain);
+	bio_list_merge(&clone->deferred_flush_bios, &need_commit);
+	spin_unlock_irqrestore(&clone->lock, irq_flags);
+
+	wake_worker(clone);
+}
+
+/*
+ * Signal the completion of a bio that overwrote a whole region.
+ *
+ * Without REQ_FUA the bio is completed right away. With REQ_FUA its completion
+ * is deferred to the worker, which commits the metadata first.
+ */
+static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
+{
+	unsigned long irq_flags;
+	bool is_fua = bio->bi_opf & REQ_FUA;
+
+	/*
+	 * Only a REQ_FUA bio requires the metadata to be committed before its
+	 * completion is signalled.
+	 *
+	 * This function is only called by hydration_complete(), after the
+	 * metadata has been updated successfully. There is therefore no need
+	 * to ask dm_clone_changed_this_transaction() whether the metadata
+	 * changed, which saves taking the metadata spin lock.
+	 */
+	if (!is_fua) {
+		bio_endio(bio);
+		return;
+	}
+
+	/*
+	 * In RO or FAIL mode the metadata cannot be committed, so the bio is
+	 * completed with an error.
+	 */
+	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+		bio_io_error(bio);
+		return;
+	}
+
+	/*
+	 * Completions that need a commit are batched; a single commit is
+	 * issued for them in process_deferred_flush_bios().
+	 */
+	spin_lock_irqsave(&clone->lock, irq_flags);
+	bio_list_add(&clone->deferred_flush_completions, bio);
+	spin_unlock_irqrestore(&clone->lock, irq_flags);
+
+	wake_worker(clone);
+}
+
+/* Make @bio start at @sector and span @len sectors. */
+static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
+{
+	struct bvec_iter *iter = &bio->bi_iter;
+
+	iter->bi_sector = sector;
+	iter->bi_size = to_bytes(len);
+}
+
+/*
+ * Finish processing a discard bio.
+ *
+ * If discard passdown is enabled and @success is true, the bio is remapped to
+ * the destination device, trimmed to the whole regions it covers and passed
+ * down. Otherwise it is completed immediately.
+ */
+static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
+{
+	unsigned long rs, re;
+
+	/*
+	 * If the destination device supports discards, remap and trim the
+	 * discard bio and pass it down. Otherwise complete the bio
+	 * immediately.
+	 */
+	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
+		remap_to_dest(clone, bio);
+		bio_region_range(clone, bio, &rs, &re);
+		/*
+		 * Compute the start sector in sector_t: shifting the unsigned
+		 * long region number directly can overflow where unsigned long
+		 * is narrower than sector_t.
+		 */
+		trim_bio(bio, (sector_t)rs << clone->region_shift,
+			 (re - rs) << clone->region_shift);
+		generic_make_request(bio);
+	} else
+		bio_endio(bio);
+}
+
+/*
+ * Handle an incoming discard bio.
+ *
+ * Only regions that the bio covers completely are affected, because
+ * bio_region_range() rounds the start up and the end down to region
+ * boundaries. A discard covering no whole region is completed at once. One
+ * covering only hydrated regions is finished via complete_discard_bio().
+ * Anything else is queued for the worker, which updates the metadata.
+ */
+static void process_discard_bio(struct clone *clone, struct bio *bio)
+{
+	unsigned long rs, re, flags;
+
+	bio_region_range(clone, bio, &rs, &re);
+	BUG_ON(re > clone->nr_regions);
+
+	/*
+	 * No whole region is covered. Note that rs can be greater than re: a
+	 * discard lying strictly inside one region, e.g. sectors [1, 5) with
+	 * 8-sector regions, gives rs = 1 and re = 0. Testing only rs == re
+	 * would let such a bio through and `re - rs' below would wrap around
+	 * to a huge region count.
+	 */
+	if (unlikely(rs >= re)) {
+		bio_endio(bio);
+		return;
+	}
+
+	/*
+	 * The covered regions are already hydrated so we just need to pass
+	 * down the discard.
+	 */
+	if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) {
+		complete_discard_bio(clone, bio, true);
+		return;
+	}
+
+	/*
+	 * If the metadata mode is RO or FAIL we won't be able to update the
+	 * metadata for the regions covered by the discard so we just ignore
+	 * it.
+	 */
+	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+		bio_endio(bio);
+		return;
+	}
+
+	/*
+	 * Defer discard processing.
+	 */
+	spin_lock_irqsave(&clone->lock, flags);
+	bio_list_add(&clone->deferred_discard_bios, bio);
+	spin_unlock_irqrestore(&clone->lock, flags);
+
+	wake_worker(clone);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * dm-clone region hydrations.
+ *
+ * One descriptor tracks the hydration of a single region, from the moment it
+ * is inserted into the hydration hash table until hydration_complete() frees
+ * it.
+ */
+struct dm_clone_region_hydration {
+ struct clone *clone;
+ unsigned long region_nr;
+
+ struct bio *overwrite_bio; /* set when a full-region WRITE replaces the copy */
+ bio_end_io_t *overwrite_bio_end_io; /* original bi_end_io, restored in overwrite_endio() */
+
+ struct bio_list deferred_bios; /* bios waiting for this region to hydrate */
+
+ blk_status_t status; /* outcome of the copy or of the overwrite bio */
+
+ /* Used by hydration batching */
+ struct list_head list;
+
+ /* Used by hydration hash table */
+ struct hlist_node h;
+};
+
+/*
+ * Hydration hash table implementation.
+ *
+ * Ideally we would like to use list_bl, which uses bit spin locks and employs
+ * the least significant bit of the list head to lock the corresponding bucket,
+ * reducing the memory overhead for the locks. But, currently, list_bl and bit
+ * spin locks don't support IRQ safe versions. Since we have to take the lock
+ * in both process and interrupt context, we must fall back to using regular
+ * spin locks; one per hash table bucket.
+ */
+struct hash_table_bucket {
+ struct hlist_head head; /* hydration descriptors hashing to this bucket */
+
+ /* Spinlock protecting the bucket */
+ spinlock_t lock;
+};
+
+#define bucket_lock_irqsave(bucket, flags) \
+ spin_lock_irqsave(&(bucket)->lock, flags)
+
+#define bucket_unlock_irqrestore(bucket, flags) \
+ spin_unlock_irqrestore(&(bucket)->lock, flags)
+
+/*
+ * Allocate the hydration hash table (1 << HASH_TABLE_BITS buckets) and
+ * initialize the list head and lock of every bucket.
+ *
+ * Returns 0 on success or -ENOMEM if the table cannot be allocated.
+ */
+static int hash_table_init(struct clone *clone)
+{
+	unsigned int i, sz;
+	struct hash_table_bucket *bucket;
+
+	sz = 1 << HASH_TABLE_BITS;
+
+	/* kvmalloc_array() guards the count * size multiplication. */
+	clone->ht = kvmalloc_array(sz, sizeof(*clone->ht), GFP_KERNEL);
+	if (!clone->ht)
+		return -ENOMEM;
+
+	for (i = 0; i < sz; i++) {
+		bucket = clone->ht + i;
+
+		INIT_HLIST_HEAD(&bucket->head);
+		spin_lock_init(&bucket->lock);
+	}
+
+	return 0;
+}
+
+static void hash_table_exit(struct clone *clone)
+{
+	/* Release the bucket array allocated by hash_table_init(). */
+	struct hash_table_bucket *table = clone->ht;
+
+	kvfree(table);
+}
+
+static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
+						       unsigned long region_nr)
+{
+	/* The bucket index is the region number hashed to HASH_TABLE_BITS bits. */
+	unsigned long idx = hash_long(region_nr, HASH_TABLE_BITS);
+
+	return clone->ht + idx;
+}
+
+/*
+ * Look up the hydration of region @region_nr in @bucket.
+ *
+ * Returns the matching descriptor, or NULL if the bucket holds none.
+ *
+ * NOTE: Must be called with the bucket lock held
+ */
+static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
+						     unsigned long region_nr)
+{
+	struct dm_clone_region_hydration *cur;
+	struct dm_clone_region_hydration *found = NULL;
+
+	hlist_for_each_entry(cur, &bucket->head, h) {
+		if (cur->region_nr == region_nr) {
+			found = cur;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * Insert a hydration into the hash table, at the head of @bucket's list.
+ *
+ * No check is made for an existing entry for the same region; use
+ * __find_or_insert_region_hydration() when one may already be present.
+ *
+ * NOTE: Must be called with the bucket lock held.
+ */
+static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
+ struct dm_clone_region_hydration *hd)
+{
+ hlist_add_head(&hd->h, &bucket->head);
+}
+
+/*
+ * Insert @hd into @bucket unless the bucket already holds a hydration for the
+ * same region.
+ *
+ * Returns @hd if it was inserted, or the descriptor that was already present
+ * for the region.
+ *
+ * NOTE: Must be called with the bucket lock held.
+ */
+static struct dm_clone_region_hydration *
+__find_or_insert_region_hydration(struct hash_table_bucket *bucket,
+				  struct dm_clone_region_hydration *hd)
+{
+	struct dm_clone_region_hydration *existing;
+
+	existing = __hash_find(bucket, hd->region_nr);
+	if (!existing) {
+		__insert_region_hydration(bucket, hd);
+		existing = hd;
+	}
+
+	return existing;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Allocate a hydration descriptor bound to @clone.
+ *
+ * The descriptor comes from the clone's hydration mempool. The allocation
+ * might block but it can't fail.
+ */
+static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
+{
+	struct dm_clone_region_hydration *desc =
+		mempool_alloc(&clone->hydration_pool, GFP_NOIO);
+
+	desc->clone = clone;
+
+	return desc;
+}
+
+static inline void free_hydration(struct dm_clone_region_hydration *hd)
+{
+	/* Return the descriptor to the mempool of the clone that owns it. */
+	struct clone *clone = hd->clone;
+
+	mempool_free(hd, &clone->hydration_pool);
+}
+
+/*
+ * Reset @hd so that it describes a fresh hydration of region @region_nr.
+ * The clone pointer set by alloc_hydration() is left untouched.
+ */
+static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
+{
+	INIT_HLIST_NODE(&hd->h);
+	INIT_LIST_HEAD(&hd->list);
+
+	bio_list_init(&hd->deferred_bios);
+	hd->overwrite_bio = NULL;
+
+	hd->status = 0;
+	hd->region_nr = region_nr;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Record the outcome of a finished hydration and drop it from the hash table.
+ *
+ * The region is marked as hydrated only if the hydration succeeded and the
+ * metadata is writable. The descriptor is removed from its bucket in every
+ * case.
+ *
+ * Returns -EPERM in read-only or fail mode, the result of
+ * dm_clone_set_region_hydrated() for a successful hydration, and 0 otherwise.
+ */
+static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
+{
+	int r;
+	unsigned long irq_flags;
+	struct clone *clone = hd->clone;
+	struct hash_table_bucket *bucket;
+
+	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
+		r = -EPERM;
+	else if (hd->status == BLK_STS_OK)
+		r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);
+	else
+		r = 0;
+
+	bucket = get_hash_table_bucket(clone, hd->region_nr);
+
+	/* Unlink the descriptor from its bucket */
+	bucket_lock_irqsave(bucket, irq_flags);
+	hlist_del(&hd->h);
+	bucket_unlock_irqrestore(bucket, irq_flags);
+
+	return r;
+}
+
+/*
+ * Complete a region's hydration:
+ *
+ *	1. Update dm-clone's metadata.
+ *	2. Remove hydration from hash table.
+ *	3. Complete overwrite bio.
+ *	4. Issue deferred bios.
+ *	5. If this was the last hydration, wake up anyone waiting for
+ *	   hydrations to finish.
+ *
+ * If either the hydration or the metadata update failed, steps 3 and 4 are
+ * replaced by failing the overwrite bio and all deferred bios.
+ */
+static void hydration_complete(struct dm_clone_region_hydration *hd)
+{
+	struct clone *clone = hd->clone;
+	int r = hydration_update_metadata(hd);
+
+	if (hd->status != BLK_STS_OK || unlikely(r)) {
+		/* A metadata failure is reported to the bios as an I/O error. */
+		blk_status_t status = r ? BLK_STS_IOERR : hd->status;
+
+		if (hd->overwrite_bio)
+			bio_list_add(&hd->deferred_bios, hd->overwrite_bio);
+
+		fail_bios(&hd->deferred_bios, status);
+	} else {
+		if (hd->overwrite_bio)
+			complete_overwrite_bio(clone, hd->overwrite_bio);
+
+		issue_deferred_bios(clone, &hd->deferred_bios);
+	}
+
+	free_hydration(hd);
+
+	if (atomic_dec_and_test(&clone->hydrations_in_flight))
+		wakeup_hydration_waiters(clone);
+}
+
+/*
+ * kcopyd completion callback for a hydration copy issued by hydration_copy().
+ *
+ * @context is the head hydration of the copy. The head and every hydration
+ * batched behind it are completed with the same status: BLK_STS_IOERR if
+ * either @read_err or @write_err is set, BLK_STS_OK otherwise.
+ */
+static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
+{
+	struct dm_clone_region_hydration *head = context;
+	struct dm_clone_region_hydration *pos, *next;
+	struct clone *clone = head->clone;
+	blk_status_t status = BLK_STS_OK;
+
+	LIST_HEAD(batched_hydrations);
+
+	if (read_err || write_err) {
+		DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
+		status = BLK_STS_IOERR;
+	}
+
+	/* Detach the rest of the batch before the head is completed and freed */
+	list_splice_tail(&head->list, &batched_hydrations);
+
+	head->status = status;
+	hydration_complete(head);
+
+	/* Complete batched hydrations */
+	list_for_each_entry_safe(pos, next, &batched_hydrations, list) {
+		pos->status = status;
+		hydration_complete(pos);
+	}
+
+	/* Continue background hydration, if there is no I/O in-flight */
+	if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
+	    !atomic_read(&clone->ios_in_flight))
+		wake_worker(clone);
+}
+
+/*
+ * Copy @nr_regions consecutive regions, starting at @hd's region, from the
+ * source device to the same offset on the destination device.
+ *
+ * The copy is issued through kcopyd and completes asynchronously in
+ * hydration_kcopyd_callback(), which receives @hd as its context.
+ * hydrations_in_flight is raised by @nr_regions before the copy is issued.
+ */
+static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
+{
+	unsigned long region_start, region_end;
+	sector_t tail_size, region_size, total_size;
+	struct dm_io_region from, to;
+	struct clone *clone = hd->clone;
+
+	region_size = clone->region_size;
+	region_start = hd->region_nr;
+	region_end = region_start + nr_regions - 1;
+
+	/*
+	 * Widen to sector_t before shifting: evaluated in unsigned int, the
+	 * shift overflows once the batch spans 2^32 sectors or more.
+	 */
+	total_size = (sector_t)(nr_regions - 1) << clone->region_shift;
+
+	if (region_end == clone->nr_regions - 1) {
+		/*
+		 * The last region of the target might be smaller than
+		 * region_size.
+		 */
+		tail_size = clone->ti->len & (region_size - 1);
+		if (!tail_size)
+			tail_size = region_size;
+	} else {
+		tail_size = region_size;
+	}
+
+	total_size += tail_size;
+
+	from.bdev = clone->source_dev->bdev;
+	from.sector = region_to_sector(clone, region_start);
+	from.count = total_size;
+
+	to.bdev = clone->dest_dev->bdev;
+	to.sector = from.sector;
+	to.count = from.count;
+
+	/* Issue copy */
+	atomic_add(nr_regions, &clone->hydrations_in_flight);
+	dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
+		       hydration_kcopyd_callback, hd);
+}
+
+/*
+ * Completion handler installed on an overwrite bio by hydration_overwrite().
+ * It restores the bio's original bi_end_io, records the bio's status in the
+ * hydration and completes the hydration.
+ */
+static void overwrite_endio(struct bio *bio)
+{
+	struct dm_clone_region_hydration *hydration = bio->bi_private;
+
+	hydration->status = bio->bi_status;
+	bio->bi_end_io = hydration->overwrite_bio_end_io;
+
+	hydration_complete(hydration);
+}
+
+/*
+ * Hydrate @hd's region by submitting @bio, which overwrites the whole region,
+ * instead of copying the region from the source device. The bio's completion
+ * is intercepted by overwrite_endio().
+ */
+static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
+{
+	struct clone *clone = hd->clone;
+
+	/*
+	 * Only bi_end_io is saved for later restoration. bi_private needs no
+	 * such treatment because device mapper core generates a new bio for
+	 * us to use, with clean bi_private.
+	 */
+	hd->overwrite_bio_end_io = bio->bi_end_io;
+	hd->overwrite_bio = bio;
+
+	bio->bi_private = hd;
+	bio->bi_end_io = overwrite_endio;
+
+	atomic_inc(&clone->hydrations_in_flight);
+	generic_make_request(bio);
+}
+
+/*
+ * Hydrate bio's region.
+ *
+ * This function starts the hydration of the bio's region and puts the bio in
+ * the list of deferred bios for this region. If, by the time this function is
+ * called, the region has already finished hydrating, the bio is instead
+ * submitted right away through issue_bio().
+ *
+ * If a hydration of the region is already in progress, the bio is simply
+ * added to that hydration's list of deferred bios. If the metadata is in
+ * read-only or fail mode, no hydration is started and the bio is completed
+ * with an error.
+ *
+ * NOTE: The bio remapping must be performed by the caller.
+ */
+static void hydrate_bio_region(struct clone *clone, struct bio *bio)
+{
+ unsigned long flags;
+ unsigned long region_nr;
+ struct hash_table_bucket *bucket;
+ struct dm_clone_region_hydration *hd, *hd2;
+
+ region_nr = bio_to_region(clone, bio);
+ bucket = get_hash_table_bucket(clone, region_nr);
+
+ bucket_lock_irqsave(bucket, flags);
+
+ hd = __hash_find(bucket, region_nr);
+ if (hd) {
+ /* Someone else is hydrating the region */
+ bio_list_add(&hd->deferred_bios, bio);
+ bucket_unlock_irqrestore(bucket, flags);
+ return;
+ }
+
+ if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
+ /* The region has been hydrated */
+ bucket_unlock_irqrestore(bucket, flags);
+ issue_bio(clone, bio);
+ return;
+ }
+
+ /*
+ * We must allocate a hydration descriptor and start the hydration of
+ * the corresponding region.
+ *
+ * The bucket lock is dropped first because the allocation might
+ * block. The state of the region is therefore re-checked once the
+ * lock has been taken again.
+ */
+ bucket_unlock_irqrestore(bucket, flags);
+
+ hd = alloc_hydration(clone);
+ hydration_init(hd, region_nr);
+
+ bucket_lock_irqsave(bucket, flags);
+
+ /* Check if the region has been hydrated in the meantime. */
+ if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
+ bucket_unlock_irqrestore(bucket, flags);
+ free_hydration(hd);
+ issue_bio(clone, bio);
+ return;
+ }
+
+ hd2 = __find_or_insert_region_hydration(bucket, hd);
+ if (hd2 != hd) {
+ /* Someone else started the region's hydration. */
+ bio_list_add(&hd2->deferred_bios, bio);
+ bucket_unlock_irqrestore(bucket, flags);
+ free_hydration(hd);
+ return;
+ }
+
+ /*
+ * If the metadata mode is RO or FAIL then there is no point starting a
+ * hydration, since we will not be able to update the metadata when the
+ * hydration finishes.
+ */
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+ hlist_del(&hd->h);
+ bucket_unlock_irqrestore(bucket, flags);
+ free_hydration(hd);
+ bio_io_error(bio);
+ return;
+ }
+
+ /*
+ * Start region hydration.
+ *
+ * If a bio overwrites a region, i.e., its size is equal to the
+ * region's size, then we don't need to copy the region from the source
+ * to the destination device.
+ */
+ if (is_overwrite_bio(clone, bio)) {
+ bucket_unlock_irqrestore(bucket, flags);
+ hydration_overwrite(hd, bio);
+ } else {
+ bio_list_add(&hd->deferred_bios, bio);
+ bucket_unlock_irqrestore(bucket, flags);
+ hydration_copy(hd, 1);
+ }
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Background hydrations.
+ */
+
+/*
+ * Batch region hydrations.
+ *
+ * To better utilize device bandwidth we batch together the hydration of
+ * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
+ * is good for small, random write performance (because of the overwriting of
+ * un-hydrated regions) and at the same time issue big copy requests to kcopyd
+ * to achieve high hydration bandwidth.
+ */
+struct batch_info {
+ struct dm_clone_region_hydration *head; /* first hydration of the batch, NULL if empty */
+ unsigned int nr_batched_regions; /* number of regions batched so far */
+};
+
+static void __batch_hydration(struct batch_info *batch,
+ struct dm_clone_region_hydration *hd)
+{
+ struct clone *clone = hd->clone;
+ unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);
+
+ if (batch->head) {
+ /*
+ * Try to extend the current batch: hd joins it only if the batch
+ * is not full and hd's region directly follows the last region of
+ * the batch. On success hd is set to NULL to mark it as consumed.
+ */
+ if (batch->nr_batched_regions < max_batch_size &&
+ (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
+ list_add_tail(&hd->list, &batch->head->list);
+ batch->nr_batched_regions++;
+ hd = NULL;
+ }
+
+ /*
+ * Check if we should issue the current batch: either it is full,
+ * or hd could not be added to it (hd is still non-NULL).
+ */
+ if (batch->nr_batched_regions >= max_batch_size || hd) {
+ hydration_copy(batch->head, batch->nr_batched_regions);
+ batch->head = NULL;
+ batch->nr_batched_regions = 0;
+ }
+ }
+
+ if (!hd)
+ return;
+
+ /* We treat max batch sizes of zero and one equivalently */
+ if (max_batch_size <= 1) {
+ hydration_copy(hd, 1);
+ return;
+ }
+
+ /* Start a new batch */
+ BUG_ON(!list_empty(&hd->list));
+ batch->head = hd;
+ batch->nr_batched_regions = 1;
+}
+
+/*
+ * Find the next region at or after @offset that is neither hydrated nor being
+ * hydrated, register a hydration for it and hand it to __batch_hydration().
+ *
+ * Returns the number of the region following the one that was picked, or the
+ * offset at which the scan stopped if no region was picked.
+ */
+static unsigned long __start_next_hydration(struct clone *clone,
+					    unsigned long offset,
+					    struct batch_info *batch)
+{
+	unsigned long flags;
+	struct hash_table_bucket *bucket;
+	struct dm_clone_region_hydration *hd;
+	unsigned long nr_regions = clone->nr_regions;
+
+	/* Allocated up front, outside the bucket lock, as this might block. */
+	hd = alloc_hydration(clone);
+
+	/* Try to find a region to hydrate. */
+	do {
+		offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
+		if (offset == nr_regions)
+			break;
+
+		bucket = get_hash_table_bucket(clone, offset);
+		bucket_lock_irqsave(bucket, flags);
+
+		if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
+		    !__hash_find(bucket, offset)) {
+			hydration_init(hd, offset);
+			__insert_region_hydration(bucket, hd);
+			bucket_unlock_irqrestore(bucket, flags);
+
+			/* Batch hydration */
+			__batch_hydration(batch, hd);
+
+			return (offset + 1);
+		}
+
+		bucket_unlock_irqrestore(bucket, flags);
+
+	} while (++offset < nr_regions);
+
+	/*
+	 * No region was picked. alloc_hydration() never returns NULL, so the
+	 * unused descriptor is released unconditionally.
+	 */
+	free_hydration(hd);
+
+	return offset;
+}
+
+/*
+ * This function searches for regions that still reside in the source device
+ * and starts their hydration.
+ *
+ * The search resumes at clone->hydration_offset and goes on until hydration
+ * is suspended or disabled, user I/O is in flight, the number of in-flight
+ * plus batched hydrations exceeds clone->hydration_threshold, or the end of
+ * the device is reached. The offset to resume from is stored back in
+ * clone->hydration_offset, wrapping to 0 at the end of the device.
+ *
+ * Nothing is done in read-only or fail mode, or once hydration is done.
+ */
+static void do_hydration(struct clone *clone)
+{
+ unsigned int current_volume;
+ unsigned long offset, nr_regions = clone->nr_regions;
+
+ struct batch_info batch = {
+ .head = NULL,
+ .nr_batched_regions = 0,
+ };
+
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
+ return;
+
+ if (dm_clone_is_hydration_done(clone->cmd))
+ return;
+
+ /*
+ * Avoid race with device suspension.
+ */
+ atomic_inc(&clone->hydrations_in_flight);
+
+ /*
+ * Make sure atomic_inc() is ordered before test_bit(), otherwise we
+ * might race with clone_postsuspend() and start a region hydration
+ * after the target has been suspended.
+ *
+ * This is paired with the smp_mb__after_atomic() in
+ * clone_postsuspend().
+ */
+ smp_mb__after_atomic();
+
+ offset = clone->hydration_offset;
+ while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
+ !atomic_read(&clone->ios_in_flight) &&
+ test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
+ offset < nr_regions) {
+ current_volume = atomic_read(&clone->hydrations_in_flight);
+ current_volume += batch.nr_batched_regions;
+
+ if (current_volume > READ_ONCE(clone->hydration_threshold))
+ break;
+
+ offset = __start_next_hydration(clone, offset, &batch);
+ }
+
+ if (batch.head)
+ hydration_copy(batch.head, batch.nr_batched_regions);
+
+ if (offset >= nr_regions)
+ offset = 0;
+
+ clone->hydration_offset = offset;
+
+ if (atomic_dec_and_test(&clone->hydrations_in_flight))
+ wakeup_hydration_waiters(clone);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Returns true once jiffies has left the window of COMMIT_PERIOD that starts
+ * at the last commit.
+ */
+static bool need_commit_due_to_time(struct clone *clone)
+{
+	unsigned long window_start = clone->last_commit_jiffies;
+	unsigned long window_end = window_start + COMMIT_PERIOD;
+
+	return !time_in_range(jiffies, window_start, window_end);
+}
+
+/*
+ * Commit the metadata, if it changed during the current transaction.
+ *
+ * The commit and the handling of its failure run under clone->commit_lock.
+ * A table event is raised when a successful commit finds hydration done.
+ *
+ * A non-zero return indicates read-only or fail mode.
+ */
+static int commit_metadata(struct clone *clone)
+{
+	int r = 0;
+
+	mutex_lock(&clone->commit_lock);
+
+	if (dm_clone_changed_this_transaction(clone->cmd)) {
+		if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+			r = -EPERM;
+		} else {
+			r = dm_clone_metadata_commit(clone->cmd);
+
+			if (unlikely(r))
+				__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
+			else if (dm_clone_is_hydration_done(clone->cmd))
+				dm_table_event(clone->ti->table);
+		}
+	}
+
+	mutex_unlock(&clone->commit_lock);
+
+	return r;
+}
+
+/*
+ * Process the discards queued by process_discard_bio(): mark the regions they
+ * cover as hydrated in the metadata, then finish every bio through
+ * complete_discard_bio(). The bios are finished as successful only if the
+ * metadata was writable and every update succeeded.
+ */
+static void process_deferred_discards(struct clone *clone)
+{
+	int r = -EPERM;
+	struct bio *bio;
+	struct blk_plug plug;
+	unsigned long rs, re, irq_flags;
+	struct bio_list pending = BIO_EMPTY_LIST;
+
+	/* Take over the queued discards under the lock */
+	spin_lock_irqsave(&clone->lock, irq_flags);
+	bio_list_merge(&pending, &clone->deferred_discard_bios);
+	bio_list_init(&clone->deferred_discard_bios);
+	spin_unlock_irqrestore(&clone->lock, irq_flags);
+
+	if (bio_list_empty(&pending))
+		return;
+
+	if (likely(get_clone_mode(clone) < CM_READ_ONLY)) {
+		/* Update the metadata */
+		bio_list_for_each(bio, &pending) {
+			bio_region_range(clone, bio, &rs, &re);
+			/*
+			 * Some of the covered regions may already be
+			 * hydrated; the metadata needs no update for those.
+			 */
+			r = dm_clone_cond_set_range(clone->cmd, rs, re - rs);
+
+			if (unlikely(r))
+				break;
+		}
+	}
+
+	blk_start_plug(&plug);
+	for (bio = bio_list_pop(&pending); bio; bio = bio_list_pop(&pending))
+		complete_discard_bio(clone, bio, r == 0);
+	blk_finish_plug(&plug);
+}
+
+/* Submit the bios queued on clone->deferred_bios. */
+static void process_deferred_bios(struct clone *clone)
+{
+	unsigned long irq_flags;
+	struct bio_list pending = BIO_EMPTY_LIST;
+
+	/* Take over the queued bios under the lock */
+	spin_lock_irqsave(&clone->lock, irq_flags);
+	bio_list_merge(&pending, &clone->deferred_bios);
+	bio_list_init(&clone->deferred_bios);
+	spin_unlock_irqrestore(&clone->lock, irq_flags);
+
+	if (!bio_list_empty(&pending))
+		submit_bios(&pending);
+}
+
+/*
+ * Commit the metadata and then release the bios that were waiting for the
+ * commit: deferred completions are ended and deferred flush bios are
+ * submitted. If the commit fails, all of them are completed with an error.
+ */
+static void process_deferred_flush_bios(struct clone *clone)
+{
+	struct bio *bio;
+	unsigned long irq_flags;
+	struct bio_list flushes = BIO_EMPTY_LIST;
+	struct bio_list completions = BIO_EMPTY_LIST;
+
+	/*
+	 * Take over both lists. If either holds a bio, the metadata must be
+	 * committed before the bio is issued or its completion is signalled.
+	 */
+	spin_lock_irqsave(&clone->lock, irq_flags);
+	bio_list_merge(&flushes, &clone->deferred_flush_bios);
+	bio_list_init(&clone->deferred_flush_bios);
+
+	bio_list_merge(&completions, &clone->deferred_flush_completions);
+	bio_list_init(&clone->deferred_flush_completions);
+	spin_unlock_irqrestore(&clone->lock, irq_flags);
+
+	/*
+	 * With nothing queued, a commit is due only if the metadata changed
+	 * and the commit period has elapsed.
+	 */
+	if (bio_list_empty(&flushes) && bio_list_empty(&completions)) {
+		if (!dm_clone_changed_this_transaction(clone->cmd) ||
+		    !need_commit_due_to_time(clone))
+			return;
+	}
+
+	if (commit_metadata(clone)) {
+		bio_list_merge(&flushes, &completions);
+
+		for (bio = bio_list_pop(&flushes); bio; bio = bio_list_pop(&flushes))
+			bio_io_error(bio);
+
+		return;
+	}
+
+	clone->last_commit_jiffies = jiffies;
+
+	for (bio = bio_list_pop(&completions); bio; bio = bio_list_pop(&completions))
+		bio_endio(bio);
+
+	for (bio = bio_list_pop(&flushes); bio; bio = bio_list_pop(&flushes))
+		generic_make_request(bio);
+}
+
+/*
+ * Worker function: process the deferred bio lists, commit the metadata and
+ * continue background hydration, in that order.
+ */
+static void do_worker(struct work_struct *work)
+{
+	struct clone *clone = container_of(work, struct clone, worker);
+
+	process_deferred_bios(clone);
+	process_deferred_discards(clone);
+
+	/*
+	 * process_deferred_flush_bios() does three things:
+	 *
+	 *   - Commits the metadata
+	 *
+	 *   - Processes deferred REQ_FUA completions
+	 *
+	 *   - Processes deferred REQ_PREFLUSH bios
+	 */
+	process_deferred_flush_bios(clone);
+
+	/* Background hydration */
+	do_hydration(clone);
+}
+
+/*
+ * Periodic wake-up. Kicking the worker makes sure the metadata is committed
+ * regularly, so that not too much unwritten data builds up, and restarts
+ * background hydration if in-flight I/O had stopped it. The delayed work
+ * re-arms itself to fire again after COMMIT_PERIOD.
+ */
+static void do_waker(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct clone *clone = container_of(dwork, struct clone, waker);
+
+	wake_worker(clone);
+	queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Target methods
+ */
+/*
+ * Map an incoming bio.
+ *
+ * Returns DM_MAPIO_KILL when the clone is in CM_FAIL mode, DM_MAPIO_REMAPPED
+ * for reads of not yet hydrated regions (redirected to the source device) and
+ * DM_MAPIO_SUBMITTED in every other case, where this function has taken
+ * charge of the bio itself.
+ *
+ * NOTE(review): ios_in_flight is incremented before the CM_FAIL check and is
+ * only decremented in clone_endio(). Confirm that the DM core still calls the
+ * end_io hook for DM_MAPIO_KILL, otherwise the counter is never dropped on
+ * that path.
+ */
+static int clone_map(struct dm_target *ti, struct bio *bio)
+{
+ struct clone *clone = ti->private;
+ unsigned long region_nr;
+
+ atomic_inc(&clone->ios_in_flight);
+
+ if (unlikely(get_clone_mode(clone) == CM_FAIL))
+ return DM_MAPIO_KILL;
+
+ /*
+ * REQ_PREFLUSH bios carry no data:
+ *
+ * - Commit metadata, if changed
+ *
+ * - Pass down to destination device
+ */
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ remap_and_issue(clone, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /* Make the sector relative to the start of this target. */
+ bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+ /*
+ * dm-clone interprets discards and performs a fast hydration of the
+ * discarded regions, i.e., we skip the copy from the source device and
+ * just mark the regions as hydrated.
+ */
+ if (bio_op(bio) == REQ_OP_DISCARD) {
+ process_discard_bio(clone, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /*
+ * If the bio's region is hydrated, redirect it to the destination
+ * device.
+ *
+ * If the region is not hydrated and the bio is a READ, redirect it to
+ * the source device.
+ *
+ * Else, defer WRITE bio until after its region has been hydrated and
+ * start the region's hydration immediately.
+ */
+ region_nr = bio_to_region(clone, bio);
+ if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
+ remap_and_issue(clone, bio);
+ return DM_MAPIO_SUBMITTED;
+ } else if (bio_data_dir(bio) == READ) {
+ remap_to_source(clone, bio);
+ return DM_MAPIO_REMAPPED;
+ }
+
+ remap_to_dest(clone, bio);
+ hydrate_bio_region(clone, bio);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * End-of-I/O hook: a bio that was counted in clone_map() has finished, so
+ * drop the in-flight counter. The completion status is not modified.
+ */
+static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
+{
+	struct clone *c = ti->private;
+
+	atomic_dec(&c->ios_in_flight);
+	return DM_ENDIO_DONE;
+}
+
+/*
+ * Append "<#features> <features>*" to the status line.
+ *
+ * @result/@maxlen: output buffer and its size, used by DMEMIT().
+ * @sz_ptr:         in/out write offset into @result.
+ *
+ * Each flag is sampled exactly once. The flags can be toggled concurrently
+ * (e.g. by the enable_hydration/disable_hydration messages), and testing
+ * them separately for the count and for the names could emit a count that
+ * does not match the number of feature words that follow it.
+ */
+static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
+		       ssize_t *sz_ptr)
+{
+	ssize_t sz = *sz_ptr;
+	const bool no_hydration = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
+	const bool no_passdown = !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
+	unsigned int count = no_hydration;
+
+	count += no_passdown;
+
+	DMEMIT("%u ", count);
+
+	if (no_hydration)
+		DMEMIT("no_hydration ");
+
+	if (no_passdown)
+		DMEMIT("no_discard_passdown ");
+
+	*sz_ptr = sz;
+}
+
+/*
+ * Append "<#core args> <core args>*" to the status line: the word count
+ * followed by the hydration_threshold and hydration_batch_size pairs.
+ * @sz_ptr is the in/out write offset into @result.
+ */
+static void emit_core_args(struct clone *clone, char *result,
+			   unsigned int maxlen, ssize_t *sz_ptr)
+{
+	const unsigned int threshold = READ_ONCE(clone->hydration_threshold);
+	const unsigned int batch_size = READ_ONCE(clone->hydration_batch_size);
+	ssize_t sz = *sz_ptr;
+
+	/* Two key/value pairs, i.e. four words follow the count. */
+	DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", 4U,
+	       threshold, batch_size);
+
+	*sz_ptr = sz;
+}
+
+/*
+ * Report the target status into @result (at most @maxlen bytes).
+ *
+ * STATUSTYPE_INFO format:
+ *
+ * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
+ * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
+ * <#features> <features>* <#core args> <core args>* <clone metadata mode>
+ *
+ * In CM_FAIL mode only "Fail" is emitted. If a metadata query fails, "Error"
+ * is emitted.
+ *
+ * STATUSTYPE_TABLE emits the metadata, destination and source devices
+ * followed by the constructor arguments saved in clone->ctr_args.
+ */
+static void clone_status(struct dm_target *ti, status_type_t type,
+ unsigned int status_flags, char *result,
+ unsigned int maxlen)
+{
+ int r;
+ unsigned int i;
+ ssize_t sz = 0;
+ dm_block_t nr_free_metadata_blocks = 0;
+ dm_block_t nr_metadata_blocks = 0;
+ char buf[BDEVNAME_SIZE];
+ struct clone *clone = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ if (get_clone_mode(clone) == CM_FAIL) {
+ DMEMIT("Fail");
+ break;
+ }
+
+ /* Commit to ensure statistics aren't out-of-date */
+ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+ (void) commit_metadata(clone);
+
+ r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
+
+ if (r) {
+ DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
+ clone_device_name(clone), r);
+ goto error;
+ }
+
+ r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);
+
+ if (r) {
+ DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
+ clone_device_name(clone), r);
+ goto error;
+ }
+
+ /* Used metadata blocks are reported as total minus free. */
+ DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
+ DM_CLONE_METADATA_BLOCK_SIZE,
+ (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
+ (unsigned long long)nr_metadata_blocks,
+ (unsigned long long)clone->region_size,
+ dm_clone_nr_of_hydrated_regions(clone->cmd),
+ clone->nr_regions,
+ atomic_read(&clone->hydrations_in_flight));
+
+ emit_flags(clone, result, maxlen, &sz);
+ emit_core_args(clone, result, maxlen, &sz);
+
+ switch (get_clone_mode(clone)) {
+ case CM_WRITE:
+ DMEMIT("rw");
+ break;
+ case CM_READ_ONLY:
+ DMEMIT("ro");
+ break;
+ case CM_FAIL:
+ DMEMIT("Fail");
+ }
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
+ DMEMIT("%s ", buf);
+
+ format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
+ DMEMIT("%s ", buf);
+
+ format_dev_t(buf, clone->source_dev->bdev->bd_dev);
+ DMEMIT("%s", buf);
+
+ /* The remaining table arguments are replayed as they were saved. */
+ for (i = 0; i < clone->nr_ctr_args; i++)
+ DMEMIT(" %s", clone->ctr_args[i]);
+ }
+
+ return;
+
+error:
+ DMEMIT("Error");
+}
+
+/*
+ * Congestion callback: combine the congestion state of the destination and
+ * the source device queues for the requested @bdi_bits. Both devices are
+ * always queried and the results are OR-ed together.
+ */
+static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+	struct clone *clone = container_of(cb, struct clone, callbacks);
+	struct request_queue *src_q = bdev_get_queue(clone->source_dev->bdev);
+	struct request_queue *dst_q = bdev_get_queue(clone->dest_dev->bdev);
+	int congested;
+
+	congested = bdi_congested(dst_q->backing_dev_info, bdi_bits);
+	congested |= bdi_congested(src_q->backing_dev_info, bdi_bits);
+
+	return congested;
+}
+
+/* Size of @dev in sectors, derived from the byte size of its inode. */
+static sector_t get_dev_size(struct dm_dev *dev)
+{
+	loff_t nr_bytes = i_size_read(dev->bdev->bd_inode);
+
+	return nr_bytes >> SECTOR_SHIFT;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Construct a clone device mapping:
+ *
+ * clone <metadata dev> <destination dev> <source dev> <region size>
+ * [<#feature args> [<feature arg>]* [<#core args> [key value]*]]
+ *
+ * metadata dev: Fast device holding the persistent metadata
+ * destination dev: The destination device, which will become a clone of the
+ * source device
+ * source dev: The read-only source device that gets cloned
+ * region size: dm-clone unit size in sectors
+ *
+ * #feature args: Number of feature arguments passed
+ * feature args: E.g. no_hydration, no_discard_passdown
+ *
+ * #core arguments: An even number of core arguments
+ * core arguments: Key/value pairs for tuning the core
+ * E.g. 'hydration_threshold 256'
+ */
+/*
+ * Parse the optional "<#feature args> [<feature arg>]*" group.
+ *
+ * Recognised features are "no_hydration" and "no_discard_passdown"; each one
+ * clears the matching bit in clone->flags. Returns 0 on success (including
+ * when no arguments are left) or a negative errno with ti->error set.
+ */
+static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
+{
+	const struct dm_arg group = {
+		.min = 0,
+		.max = 2,
+		.error = "Invalid number of feature arguments"
+	};
+	struct dm_target *ti = clone->ti;
+	const char *feature;
+	unsigned int nr_features;
+	int r;
+
+	/* No feature arguments supplied */
+	if (!as->argc)
+		return 0;
+
+	r = dm_read_arg_group(&group, as, &nr_features, &ti->error);
+	if (r)
+		return r;
+
+	for (; nr_features; nr_features--) {
+		feature = dm_shift_arg(as);
+
+		if (!strcasecmp(feature, "no_hydration")) {
+			__clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
+			continue;
+		}
+
+		if (!strcasecmp(feature, "no_discard_passdown")) {
+			__clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
+			continue;
+		}
+
+		ti->error = "Invalid feature argument";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse the optional "<#core args> [key value]*" group.
+ *
+ * The defaults for hydration_batch_size and hydration_threshold are always
+ * installed first, so they apply when the group is absent. The number of
+ * words in the group must be even. Returns 0 on success or a negative errno
+ * with ti->error set.
+ */
+static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
+{
+	const struct dm_arg group = {
+		.min = 0,
+		.max = 4,
+		.error = "Invalid number of core arguments"
+	};
+	struct dm_target *ti = clone->ti;
+	const char *key;
+	unsigned int nr_words;
+	unsigned int value;
+	int r;
+
+	/* Initialize core arguments */
+	clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
+	clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;
+
+	/* No core arguments supplied */
+	if (!as->argc)
+		return 0;
+
+	r = dm_read_arg_group(&group, as, &nr_words, &ti->error);
+	if (r)
+		return r;
+
+	if (nr_words & 1) {
+		ti->error = "Number of core arguments must be even";
+		return -EINVAL;
+	}
+
+	/* Each iteration consumes one key and its value. */
+	for (; nr_words; nr_words -= 2) {
+		key = dm_shift_arg(as);
+
+		if (!strcasecmp(key, "hydration_threshold")) {
+			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
+				ti->error = "Invalid value for argument `hydration_threshold'";
+				return -EINVAL;
+			}
+			clone->hydration_threshold = value;
+			continue;
+		}
+
+		if (!strcasecmp(key, "hydration_batch_size")) {
+			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
+				ti->error = "Invalid value for argument `hydration_batch_size'";
+				return -EINVAL;
+			}
+			clone->hydration_batch_size = value;
+			continue;
+		}
+
+		ti->error = "Invalid core argument";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse and validate the <region size> constructor argument (in sectors).
+ *
+ * The value must lie within [MIN_REGION_SIZE, MAX_REGION_SIZE], be a power
+ * of two, and be a multiple of the logical block size of both the source and
+ * the destination device. On success it is stored in clone->region_size.
+ *
+ * Returns 0 on success, or a negative errno with *error pointing to a
+ * description of the problem.
+ */
+static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
+{
+	int r;
+	unsigned int region_size;
+	struct dm_arg arg;
+
+	arg.min = MIN_REGION_SIZE;
+	arg.max = MAX_REGION_SIZE;
+	arg.error = "Invalid region size";
+
+	r = dm_read_arg(&arg, as, &region_size, error);
+	if (r)
+		return r;
+
+	/* Check region size is a power of 2 */
+	if (!is_power_of_2(region_size)) {
+		*error = "Region size is not a power of 2";
+		return -EINVAL;
+	}
+
+	/*
+	 * Validate the region size against the device logical block size.
+	 * The block sizes are in bytes, so convert them to sectors first.
+	 */
+	if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> SECTOR_SHIFT) ||
+	    region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> SECTOR_SHIFT)) {
+		*error = "Region size is not a multiple of device logical block size";
+		return -EINVAL;
+	}
+
+	clone->region_size = region_size;
+
+	return 0;
+}
+
+/*
+ * Reject region counts above 2^31. dm_bitset restricts us to 2^32 regions,
+ * and test_bit & co. restrict us further to 2^31 regions.
+ *
+ * Returns 0 if @n is acceptable, otherwise -EINVAL with *error set.
+ */
+static int validate_nr_regions(unsigned long n, char **error)
+{
+	if (n <= (1UL << 31))
+		return 0;
+
+	*error = "Too many regions. Consider increasing the region size";
+	return -EINVAL;
+}
+
+/*
+ * Open the metadata device named by the next constructor argument for
+ * reading and writing and store it in clone->metadata_dev. An oversized
+ * device only triggers a warning. Returns 0, or the dm_get_device() error
+ * with *error set.
+ */
+static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
+{
+	char name[BDEVNAME_SIZE];
+	int r;
+
+	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+			  &clone->metadata_dev);
+	if (r) {
+		*error = "Error opening metadata device";
+		return r;
+	}
+
+	if (get_dev_size(clone->metadata_dev) > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
+		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+		       bdevname(clone->metadata_dev->bdev, name), DM_CLONE_METADATA_MAX_SECTORS);
+
+	return 0;
+}
+
+/*
+ * Open the destination device named by the next constructor argument for
+ * reading and writing and store it in clone->dest_dev. The device must be at
+ * least as large as the target (ti->len); if it is not, the device reference
+ * is dropped again here and -EINVAL is returned.
+ */
+static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
+{
+	int r;
+
+	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+			  &clone->dest_dev);
+	if (r) {
+		*error = "Error opening destination device";
+		return r;
+	}
+
+	if (get_dev_size(clone->dest_dev) >= clone->ti->len)
+		return 0;
+
+	dm_put_device(clone->ti, clone->dest_dev);
+	*error = "Device size larger than destination device";
+	return -EINVAL;
+}
+
+/*
+ * Open the source device named by the next constructor argument read-only
+ * and store it in clone->source_dev. The device must be at least as large as
+ * the target (ti->len); if it is not, the device reference is dropped again
+ * here and -EINVAL is returned.
+ */
+static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
+{
+	int r;
+
+	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
+			  &clone->source_dev);
+	if (r) {
+		*error = "Error opening source device";
+		return r;
+	}
+
+	if (get_dev_size(clone->source_dev) >= clone->ti->len)
+		return 0;
+
+	dm_put_device(clone->ti, clone->source_dev);
+	*error = "Device size larger than source device";
+	return -EINVAL;
+}
+
+/*
+ * Keep a private copy of @argc constructor arguments from @argv in
+ * clone->ctr_args / clone->nr_ctr_args, so that they can be emitted again
+ * later. Returns 0 on success; on allocation failure everything allocated
+ * here is released, *error is set and -ENOMEM is returned.
+ */
+static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
+{
+	const char **dup;
+	unsigned int n;
+
+	dup = kcalloc(argc, sizeof(*dup), GFP_KERNEL);
+	if (!dup)
+		goto err_nomem;
+
+	for (n = 0; n < argc; n++) {
+		dup[n] = kstrdup(argv[n], GFP_KERNEL);
+		if (!dup[n])
+			goto err_free;
+	}
+
+	clone->nr_ctr_args = argc;
+	clone->ctr_args = dup;
+	return 0;
+
+err_free:
+	/* Release the strings duplicated so far, newest first. */
+	while (n--)
+		kfree(dup[n]);
+	kfree(dup);
+err_nomem:
+	*error = "Failed to allocate memory for table line";
+	return -ENOMEM;
+}
+
+/*
+ * Target constructor.
+ *
+ * Expects at least four arguments: <metadata dev> <destination dev>
+ * <source dev> <region size>, optionally followed by the feature and core
+ * argument groups. On success ti->private points to the new clone and 0 is
+ * returned. On failure everything acquired so far is released in reverse
+ * order, ti->error describes the problem and a negative errno is returned.
+ */
+static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	int r;
+	struct clone *clone;
+	struct dm_arg_set as;
+
+	if (argc < 4) {
+		ti->error = "Invalid number of arguments";
+		return -EINVAL;
+	}
+
+	as.argc = argc;
+	as.argv = argv;
+
+	clone = kzalloc(sizeof(*clone), GFP_KERNEL);
+	if (!clone) {
+		ti->error = "Failed to allocate clone structure";
+		return -ENOMEM;
+	}
+
+	clone->ti = ti;
+
+	/* Initialize dm-clone flags */
+	__set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
+	__set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
+	__set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
+
+	r = parse_metadata_dev(clone, &as, &ti->error);
+	if (r)
+		goto out_with_clone;
+
+	/*
+	 * parse_dest_dev() and parse_source_dev() drop their own device
+	 * reference when they fail, hence the labels one step further down.
+	 */
+	r = parse_dest_dev(clone, &as, &ti->error);
+	if (r)
+		goto out_with_meta_dev;
+
+	r = parse_source_dev(clone, &as, &ti->error);
+	if (r)
+		goto out_with_dest_dev;
+
+	r = parse_region_size(clone, &as, &ti->error);
+	if (r)
+		goto out_with_source_dev;
+
+	/* The region size is a power of two, so its lowest set bit is log2. */
+	clone->region_shift = __ffs(clone->region_size);
+	clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+
+	r = validate_nr_regions(clone->nr_regions, &ti->error);
+	if (r)
+		goto out_with_source_dev;
+
+	r = dm_set_target_max_io_len(ti, clone->region_size);
+	if (r) {
+		ti->error = "Failed to set max io len";
+		goto out_with_source_dev;
+	}
+
+	r = parse_feature_args(&as, clone);
+	if (r)
+		goto out_with_source_dev;
+
+	r = parse_core_args(&as, clone);
+	if (r)
+		goto out_with_source_dev;
+
+	/* Load metadata */
+	clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
+					    clone->region_size);
+	if (IS_ERR(clone->cmd)) {
+		ti->error = "Failed to load metadata";
+		r = PTR_ERR(clone->cmd);
+		goto out_with_source_dev;
+	}
+
+	__set_clone_mode(clone, CM_WRITE);
+
+	if (get_clone_mode(clone) != CM_WRITE) {
+		ti->error = "Unable to get write access to metadata, please check/repair metadata";
+		r = -EPERM;
+		goto out_with_metadata;
+	}
+
+	clone->last_commit_jiffies = jiffies;
+
+	/* Allocate hydration hash table */
+	r = hash_table_init(clone);
+	if (r) {
+		ti->error = "Failed to allocate hydration hash table";
+		goto out_with_metadata;
+	}
+
+	atomic_set(&clone->ios_in_flight, 0);
+	init_waitqueue_head(&clone->hydration_stopped);
+	spin_lock_init(&clone->lock);
+	bio_list_init(&clone->deferred_bios);
+	bio_list_init(&clone->deferred_discard_bios);
+	bio_list_init(&clone->deferred_flush_bios);
+	bio_list_init(&clone->deferred_flush_completions);
+	clone->hydration_offset = 0;
+	atomic_set(&clone->hydrations_in_flight, 0);
+
+	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
+	if (!clone->wq) {
+		ti->error = "Failed to allocate workqueue";
+		r = -ENOMEM;
+		goto out_with_ht;
+	}
+
+	INIT_WORK(&clone->worker, do_worker);
+	INIT_DELAYED_WORK(&clone->waker, do_waker);
+
+	clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+	if (IS_ERR(clone->kcopyd_client)) {
+		/* Report a reason here too, like every other failure path. */
+		ti->error = "Failed to create kcopyd client";
+		r = PTR_ERR(clone->kcopyd_client);
+		goto out_with_wq;
+	}
+
+	r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
+				   _hydration_cache);
+	if (r) {
+		ti->error = "Failed to create dm_clone_region_hydration memory pool";
+		goto out_with_kcopyd;
+	}
+
+	/* Save a copy of the table line, minus the three device arguments */
+	r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
+	if (r)
+		goto out_with_mempool;
+
+	mutex_init(&clone->commit_lock);
+	clone->callbacks.congested_fn = clone_is_congested;
+	dm_table_add_target_callbacks(ti->table, &clone->callbacks);
+
+	/* Enable flushes */
+	ti->num_flush_bios = 1;
+	ti->flush_supported = true;
+
+	/* Enable discards */
+	ti->discards_supported = true;
+	ti->num_discard_bios = 1;
+
+	ti->private = clone;
+
+	return 0;
+
+out_with_mempool:
+	mempool_exit(&clone->hydration_pool);
+out_with_kcopyd:
+	dm_kcopyd_client_destroy(clone->kcopyd_client);
+out_with_wq:
+	destroy_workqueue(clone->wq);
+out_with_ht:
+	hash_table_exit(clone);
+out_with_metadata:
+	dm_clone_metadata_close(clone->cmd);
+out_with_source_dev:
+	dm_put_device(ti, clone->source_dev);
+out_with_dest_dev:
+	dm_put_device(ti, clone->dest_dev);
+out_with_meta_dev:
+	dm_put_device(ti, clone->metadata_dev);
+out_with_clone:
+	kfree(clone);
+
+	return r;
+}
+
+/*
+ * Target destructor: release everything clone_ctr() set up, ending with the
+ * clone structure itself.
+ */
+static void clone_dtr(struct dm_target *ti)
+{
+	struct clone *clone = ti->private;
+	unsigned int i = 0;
+
+	mutex_destroy(&clone->commit_lock);
+
+	/* Saved copy of the table line. */
+	while (i < clone->nr_ctr_args)
+		kfree(clone->ctr_args[i++]);
+	kfree(clone->ctr_args);
+
+	mempool_exit(&clone->hydration_pool);
+	dm_kcopyd_client_destroy(clone->kcopyd_client);
+	destroy_workqueue(clone->wq);
+	hash_table_exit(clone);
+	dm_clone_metadata_close(clone->cmd);
+
+	dm_put_device(ti, clone->source_dev);
+	dm_put_device(ti, clone->dest_dev);
+	dm_put_device(ti, clone->metadata_dev);
+
+	kfree(clone);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Postsuspend hook: quiesce all background activity of the clone and commit
+ * the metadata. The result of the final commit is deliberately ignored.
+ */
+static void clone_postsuspend(struct dm_target *ti)
+{
+ struct clone *clone = ti->private;
+
+ /*
+ * To successfully suspend the device:
+ *
+ * - We cancel the delayed work for periodic commits and wait for
+ * it to finish.
+ *
+ * - We stop the background hydration, i.e. we prevent new region
+ * hydrations from starting.
+ *
+ * - We wait for any in-flight hydrations to finish.
+ *
+ * - We flush the workqueue.
+ *
+ * - We commit the metadata.
+ */
+ cancel_delayed_work_sync(&clone->waker);
+
+ set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
+
+ /*
+ * Make sure set_bit() is ordered before atomic_read(), otherwise we
+ * might race with do_hydration() and miss some started region
+ * hydrations.
+ *
+ * This is paired with smp_mb__after_atomic() in do_hydration().
+ */
+ smp_mb__after_atomic();
+
+ wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
+ flush_workqueue(clone->wq);
+
+ (void) commit_metadata(clone);
+}
+
+/*
+ * Resume hook: allow background hydration again and run the waker body once
+ * directly, which wakes the worker and re-arms the periodic delayed work.
+ */
+static void clone_resume(struct dm_target *ti)
+{
+	struct clone *c = ti->private;
+
+	clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &c->flags);
+	do_waker(&c->waker.work);
+}
+
+/* True if @bdev has a request queue and that queue has discards enabled. */
+static bool bdev_supports_discards(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (!q)
+		return false;
+
+	return blk_queue_discard(q);
+}
+
+/*
+ * If discard passdown is enabled, verify that the destination device can
+ * take the discards: it must support discards at all and accept discards of
+ * at least one region. Otherwise warn and disable discard passdown.
+ */
+static void disable_passdown_if_not_supported(struct clone *clone)
+{
+	struct block_device *dest_bdev = clone->dest_dev->bdev;
+	const char *why;
+	char name[BDEVNAME_SIZE];
+
+	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
+		return;
+
+	if (!bdev_supports_discards(dest_bdev))
+		why = "discard unsupported";
+	else if (bdev_get_queue(dest_bdev)->limits.max_discard_sectors < clone->region_size)
+		why = "max discard sectors smaller than a region";
+	else
+		return;
+
+	DMWARN("Destination device (%s) %s: Disabling discard passdown.",
+	       bdevname(dest_bdev, name), why);
+	clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
+}
+
+/*
+ * Fill in the discard related fields of @limits.
+ *
+ * With discard passdown enabled the destination device's limits are
+ * inherited: clone_iterate_devices() stacks both the source and destination
+ * device limits, but discards aren't passed to the source device. Without
+ * passdown we set our own virtual limits based on the region size.
+ */
+static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
+{
+	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
+		const struct queue_limits *dest =
+			&bdev_get_queue(clone->dest_dev->bdev)->limits;
+
+		limits->max_discard_sectors = dest->max_discard_sectors;
+		limits->max_hw_discard_sectors = dest->max_hw_discard_sectors;
+		limits->discard_granularity = dest->discard_granularity;
+		limits->discard_alignment = dest->discard_alignment;
+		limits->discard_misaligned = dest->discard_misaligned;
+		limits->max_discard_segments = dest->max_discard_segments;
+		return;
+	}
+
+	/* No passdown is done so we set our own virtual limits */
+	limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
+	limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
+}
+
+/*
+ * Adjust the queue limits of the clone device.
+ *
+ * The stacked io_opt is kept when it is a whole multiple of the region size.
+ * Otherwise both io_min and io_opt are set to one region. The discard limits
+ * are then settled.
+ */
+static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct clone *clone = ti->private;
+	u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+	bool compatible;
+
+	/* do_div() returns the remainder, so zero means "exact multiple". */
+	compatible = io_opt_sectors >= clone->region_size &&
+		     !do_div(io_opt_sectors, clone->region_size);
+
+	if (!compatible) {
+		blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
+		blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
+	}
+
+	disable_passdown_if_not_supported(clone);
+	set_discard_limits(clone, limits);
+}
+
+/*
+ * Call @fn for the source device and then for the destination device, each
+ * covering the whole target. Stops at, and returns, the first non-zero
+ * result.
+ */
+static int clone_iterate_devices(struct dm_target *ti,
+				 iterate_devices_callout_fn fn, void *data)
+{
+	struct clone *clone = ti->private;
+	int ret;
+
+	ret = fn(ti, clone->source_dev, 0, ti->len, data);
+	if (ret)
+		return ret;
+
+	return fn(ti, clone->dest_dev, 0, ti->len, data);
+}
+
+/*
+ * dm-clone message functions.
+ */
+/*
+ * Publish a new hydration threshold and wake the worker.
+ *
+ * The wake-up matters when user space had set the threshold to zero, which
+ * stops the hydration, and raises it again later: the hydration process has
+ * to be restarted by waking up the worker.
+ */
+static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
+{
+	WRITE_ONCE(clone->hydration_threshold, nr_regions);
+	wake_worker(clone);
+}
+
+/* Publish a new hydration batch size; paired with READ_ONCE() on the read side. */
+static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
+{
+ WRITE_ONCE(clone->hydration_batch_size, nr_regions);
+}
+
+/*
+ * Turn background hydration on. The worker is only woken when the flag was
+ * previously clear, i.e. when this call actually changed the state.
+ */
+static void enable_hydration(struct clone *clone)
+{
+	if (test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
+		return;
+
+	wake_worker(clone);
+}
+
+/* Turn background hydration off by clearing DM_CLONE_HYDRATION_ENABLED. */
+static void disable_hydration(struct clone *clone)
+{
+ clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
+}
+
+/*
+ * Message handler. Supported messages:
+ *
+ *   enable_hydration
+ *   disable_hydration
+ *   hydration_threshold <value>
+ *   hydration_batch_size <value>
+ *
+ * Returns 0 on success and -EINVAL for an empty, malformed or unsupported
+ * message.
+ */
+static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
+			 char *result, unsigned int maxlen)
+{
+	struct clone *clone = ti->private;
+	unsigned int value;
+	bool is_threshold;
+
+	if (!argc)
+		return -EINVAL;
+
+	if (!strcasecmp(argv[0], "enable_hydration")) {
+		enable_hydration(clone);
+		return 0;
+	}
+
+	if (!strcasecmp(argv[0], "disable_hydration")) {
+		disable_hydration(clone);
+		return 0;
+	}
+
+	/* Everything below is a "<key> <value>" message. */
+	if (argc != 2)
+		return -EINVAL;
+
+	is_threshold = !strcasecmp(argv[0], "hydration_threshold");
+	if (is_threshold || !strcasecmp(argv[0], "hydration_batch_size")) {
+		if (kstrtouint(argv[1], 10, &value))
+			return -EINVAL;
+
+		if (is_threshold)
+			set_hydration_threshold(clone, value);
+		else
+			set_hydration_batch_size(clone, value);
+
+		return 0;
+	}
+
+	DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
+	return -EINVAL;
+}
+
+/*
+ * Target type descriptor for the "clone" target; registered in
+ * dm_clone_init() and unregistered in dm_clone_exit().
+ */
+static struct target_type clone_target = {
+ .name = "clone",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = clone_ctr,
+ .dtr = clone_dtr,
+ .map = clone_map,
+ .end_io = clone_endio,
+ .postsuspend = clone_postsuspend,
+ .resume = clone_resume,
+ .status = clone_status,
+ .message = clone_message,
+ .io_hints = clone_io_hints,
+ .iterate_devices = clone_iterate_devices,
+};
+
+/*---------------------------------------------------------------------------*/
+
+/* Module functions */
+/*
+ * Module init: create the slab cache used for the hydration memory pools and
+ * register the "clone" target.
+ *
+ * Returns 0 on success, -ENOMEM if the cache cannot be created, or the error
+ * from dm_register_target().
+ */
+static int __init dm_clone_init(void)
+{
+	int r;
+
+	_hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
+	if (!_hydration_cache)
+		return -ENOMEM;
+
+	r = dm_register_target(&clone_target);
+	if (r < 0) {
+		DMERR("Failed to register clone target");
+		/*
+		 * dm_clone_exit() is not run when init fails, so the cache
+		 * has to be released here or it is leaked.
+		 */
+		kmem_cache_destroy(_hydration_cache);
+		_hydration_cache = NULL;
+		return r;
+	}
+
+	return 0;
+}
+
+/*
+ * Module exit: unregister the target first, then destroy the hydration slab
+ * cache and clear the stale pointer.
+ */
+static void __exit dm_clone_exit(void)
+{
+	dm_unregister_target(&clone_target);
+	kmem_cache_destroy(_hydration_cache);
+	_hydration_cache = NULL;
+}
+
+/* Module hooks */
+module_init(dm_clone_init);
+module_exit(dm_clone_exit);
+
+MODULE_DESCRIPTION(DM_NAME " clone target");
+MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 7d480c9..c4ef1fc 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -65,7 +65,6 @@
*/
struct work_struct work;
wait_queue_head_t wait;
- atomic_t pending[2];
spinlock_t deferred_lock;
struct bio_list deferred;
@@ -107,29 +106,16 @@
struct block_device *bdev;
- /* zero-length flush that will be cloned and submitted to targets */
- struct bio flush_bio;
-
struct dm_stats stats;
- struct kthread_worker kworker;
- struct task_struct *kworker_task;
-
- /* for request-based merge heuristic in dm_request_fn() */
- unsigned seq_rq_merge_deadline_usecs;
- int last_rq_rw;
- sector_t last_rq_pos;
- ktime_t last_rq_start_time;
-
/* for blk-mq request-based DM support */
struct blk_mq_tag_set *tag_set;
- bool use_blk_mq:1;
bool init_tio_pdu:1;
struct srcu_struct io_barrier;
};
-int md_in_flight(struct mapped_device *md);
+void disable_discard(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0481223..eb9782f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -49,7 +49,7 @@
struct bio *bio_out;
struct bvec_iter iter_in;
struct bvec_iter iter_out;
- sector_t cc_sector;
+ u64 cc_sector;
atomic_t cc_pending;
union {
struct skcipher_request *req;
@@ -81,7 +81,7 @@
struct convert_context *ctx;
struct scatterlist sg_in[4];
struct scatterlist sg_out[4];
- sector_t iv_sector;
+ u64 iv_sector;
};
struct crypt_config;
@@ -98,11 +98,6 @@
struct dm_crypt_request *dmreq);
};
-struct iv_essiv_private {
- struct crypto_shash *hash_tfm;
- u8 *salt;
-};
-
struct iv_benbi_private {
int shift;
};
@@ -148,25 +143,21 @@
struct task_struct *write_thread;
struct rb_root write_tree;
- char *cipher;
char *cipher_string;
char *cipher_auth;
char *key_string;
const struct crypt_iv_operations *iv_gen_ops;
union {
- struct iv_essiv_private essiv;
struct iv_benbi_private benbi;
struct iv_lmk_private lmk;
struct iv_tcw_private tcw;
} iv_gen_private;
- sector_t iv_offset;
+ u64 iv_offset;
unsigned int iv_size;
unsigned short int sector_size;
unsigned char sector_shift;
- /* ESSIV: struct crypto_cipher *essiv_tfm */
- void *iv_private;
union {
struct crypto_skcipher **tfms;
struct crypto_aead **tfms_aead;
@@ -291,8 +282,9 @@
* Note that this encryption scheme is vulnerable to watermarking attacks
* and should be used for old compatible containers access only.
*
- * plumb: unimplemented, see:
- * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
+ * eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode)
+ * The IV is encrypted little-endian byte-offset (with the same key
+ * and cipher as the volume).
*/
static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
@@ -323,158 +315,15 @@
return 0;
}
-/* Initialise ESSIV - compute salt but no local memory allocations */
-static int crypt_iv_essiv_init(struct crypt_config *cc)
-{
- struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
- SHASH_DESC_ON_STACK(desc, essiv->hash_tfm);
- struct crypto_cipher *essiv_tfm;
- int err;
-
- desc->tfm = essiv->hash_tfm;
- desc->flags = 0;
-
- err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt);
- shash_desc_zero(desc);
- if (err)
- return err;
-
- essiv_tfm = cc->iv_private;
-
- err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
- crypto_shash_digestsize(essiv->hash_tfm));
- if (err)
- return err;
-
- return 0;
-}
-
-/* Wipe salt and reset key derived from volume key */
-static int crypt_iv_essiv_wipe(struct crypt_config *cc)
-{
- struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
- unsigned salt_size = crypto_shash_digestsize(essiv->hash_tfm);
- struct crypto_cipher *essiv_tfm;
- int r, err = 0;
-
- memset(essiv->salt, 0, salt_size);
-
- essiv_tfm = cc->iv_private;
- r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
- if (r)
- err = r;
-
- return err;
-}
-
-/* Allocate the cipher for ESSIV */
-static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc,
- struct dm_target *ti,
- const u8 *salt,
- unsigned int saltsize)
-{
- struct crypto_cipher *essiv_tfm;
- int err;
-
- /* Setup the essiv_tfm with the given salt */
- essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(essiv_tfm)) {
- ti->error = "Error allocating crypto tfm for ESSIV";
- return essiv_tfm;
- }
-
- if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) {
- ti->error = "Block size of ESSIV cipher does "
- "not match IV size of block cipher";
- crypto_free_cipher(essiv_tfm);
- return ERR_PTR(-EINVAL);
- }
-
- err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
- if (err) {
- ti->error = "Failed to set key for ESSIV cipher";
- crypto_free_cipher(essiv_tfm);
- return ERR_PTR(err);
- }
-
- return essiv_tfm;
-}
-
-static void crypt_iv_essiv_dtr(struct crypt_config *cc)
-{
- struct crypto_cipher *essiv_tfm;
- struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
-
- crypto_free_shash(essiv->hash_tfm);
- essiv->hash_tfm = NULL;
-
- kzfree(essiv->salt);
- essiv->salt = NULL;
-
- essiv_tfm = cc->iv_private;
-
- if (essiv_tfm)
- crypto_free_cipher(essiv_tfm);
-
- cc->iv_private = NULL;
-}
-
-static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
- const char *opts)
-{
- struct crypto_cipher *essiv_tfm = NULL;
- struct crypto_shash *hash_tfm = NULL;
- u8 *salt = NULL;
- int err;
-
- if (!opts) {
- ti->error = "Digest algorithm missing for ESSIV mode";
- return -EINVAL;
- }
-
- /* Allocate hash algorithm */
- hash_tfm = crypto_alloc_shash(opts, 0, 0);
- if (IS_ERR(hash_tfm)) {
- ti->error = "Error initializing ESSIV hash";
- err = PTR_ERR(hash_tfm);
- goto bad;
- }
-
- salt = kzalloc(crypto_shash_digestsize(hash_tfm), GFP_KERNEL);
- if (!salt) {
- ti->error = "Error kmallocing salt storage in ESSIV";
- err = -ENOMEM;
- goto bad;
- }
-
- cc->iv_gen_private.essiv.salt = salt;
- cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
-
- essiv_tfm = alloc_essiv_cipher(cc, ti, salt,
- crypto_shash_digestsize(hash_tfm));
- if (IS_ERR(essiv_tfm)) {
- crypt_iv_essiv_dtr(cc);
- return PTR_ERR(essiv_tfm);
- }
- cc->iv_private = essiv_tfm;
-
- return 0;
-
-bad:
- if (hash_tfm && !IS_ERR(hash_tfm))
- crypto_free_shash(hash_tfm);
- kfree(salt);
- return err;
-}
-
static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
- struct crypto_cipher *essiv_tfm = cc->iv_private;
-
+ /*
+ * ESSIV encryption of the IV is now handled by the crypto API,
+ * so just pass the plain sector number here.
+ */
memset(iv, 0, cc->iv_size);
*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
- crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
return 0;
}
@@ -606,7 +455,6 @@
int i, r;
desc->tfm = lmk->hash_tfm;
- desc->flags = 0;
r = crypto_shash_init(desc);
if (r)
@@ -768,7 +616,6 @@
/* calculate crc32 for every 32bit part and xor it */
desc->tfm = tcw->crc32_tfm;
- desc->flags = 0;
for (i = 0; i < 4; i++) {
r = crypto_shash_init(desc);
if (r)
@@ -844,6 +691,49 @@
return 0;
}
+/*
+ * Constructor for the "eboiv" IV mode.
+ *
+ * Rejects AEAD configurations and ciphers whose block size differs from
+ * cc->iv_size, storing the reason in ti->error.  @opts is not used.
+ *
+ * Returns 0 when the configuration is usable, -EINVAL otherwise.
+ */
+static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti,
+			      const char *opts)
+{
+	char *reason = NULL;
+
+	/* The block size is only examined once AEAD has been ruled out. */
+	if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
+		reason = "AEAD transforms not supported for EBOIV";
+	else if (crypto_skcipher_blocksize(any_tfm(cc)) != cc->iv_size)
+		reason = "Block size of EBOIV cipher does not match IV size of block cipher";
+
+	if (!reason)
+		return 0;
+
+	ti->error = reason;
+	return -EINVAL;
+}
+
+/*
+ * Generate the EBOIV initialization vector for one request.
+ *
+ * A zero-filled block of cc->iv_size bytes is encrypted with the data
+ * cipher, using the little-endian value (iv_sector * sector_size) as the
+ * IV of that encryption; the resulting ciphertext is written to @iv.
+ *
+ * Returns 0 on success, -ENOMEM if the request cannot be allocated, or the
+ * error produced by the cipher.
+ */
+static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
+			      struct dm_crypt_request *dmreq)
+{
+	u8 buf[MAX_CIPHER_BLOCKSIZE] __aligned(__alignof__(__le64));
+	struct skcipher_request *req;
+	struct scatterlist src, dst;
+	/* Must be initialized: crypto_wait_req() waits on this object. */
+	DECLARE_CRYPTO_WAIT(wait);
+	int err;
+
+	/*
+	 * GFP_KERNEL | GFP_NOFS is simply GFP_KERNEL.  IV generation is part
+	 * of request processing, so the allocation must not trigger further
+	 * I/O from reclaim: use GFP_NOIO.
+	 */
+	req = skcipher_request_alloc(any_tfm(cc), GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	/* IV for the helper encryption: byte offset of the sector, LE64. */
+	memset(buf, 0, cc->iv_size);
+	*(__le64 *)buf = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
+
+	/* Plaintext is the zero page; the ciphertext becomes the sector IV. */
+	sg_init_one(&src, page_address(ZERO_PAGE(0)), cc->iv_size);
+	sg_init_one(&dst, iv, cc->iv_size);
+	skcipher_request_set_crypt(req, &src, &dst, cc->iv_size, buf);
+	skcipher_request_set_callback(req, 0, crypto_req_done, &wait);
+	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+	skcipher_request_free(req);
+
+	return err;
+}
+
static const struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -857,10 +747,6 @@
};
static const struct crypt_iv_operations crypt_iv_essiv_ops = {
- .ctr = crypt_iv_essiv_ctr,
- .dtr = crypt_iv_essiv_dtr,
- .init = crypt_iv_essiv_init,
- .wipe = crypt_iv_essiv_wipe,
.generator = crypt_iv_essiv_gen
};
@@ -896,6 +782,11 @@
.generator = crypt_iv_random_gen
};
+/*
+ * IV operations for the "eboiv" mode.  Declared const like the other
+ * crypt_iv_*_ops tables: it is only ever read through cc->iv_gen_ops.
+ */
+static const struct crypt_iv_operations crypt_iv_eboiv_ops = {
+	.ctr	   = crypt_iv_eboiv_ctr,
+	.generator = crypt_iv_eboiv_gen
+};
+
/*
* Integrity extensions
*/
@@ -932,7 +823,7 @@
if (IS_ERR(bip))
return PTR_ERR(bip);
- tag_len = io->cc->on_disk_tag_size * bio_sectors(bio);
+ tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift);
bip->bip_iter.bi_size = tag_len;
bip->bip_iter.bi_sector = io->cc->start + io->sector;
@@ -949,6 +840,7 @@
{
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk);
+ struct mapped_device *md = dm_table_get_md(ti->table);
/* From now we require underlying device with our integrity profile */
if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) {
@@ -968,7 +860,7 @@
if (crypt_integrity_aead(cc)) {
cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size;
- DMINFO("Integrity AEAD, tag size %u, IV size %u.",
+ DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md),
cc->integrity_tag_size, cc->integrity_iv_size);
if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) {
@@ -976,7 +868,7 @@
return -EINVAL;
}
} else if (cc->integrity_iv_size)
- DMINFO("Additional per-sector space %u bytes for IV.",
+ DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md),
cc->integrity_iv_size);
if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) {
@@ -1034,11 +926,11 @@
return iv_of_dmreq(cc, dmreq) + cc->iv_size;
}
-static uint64_t *org_sector_of_dmreq(struct crypt_config *cc,
+static __le64 *org_sector_of_dmreq(struct crypt_config *cc,
struct dm_crypt_request *dmreq)
{
u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size;
- return (uint64_t*) ptr;
+ return (__le64 *) ptr;
}
static unsigned int *org_tag_of_dmreq(struct crypt_config *cc,
@@ -1074,7 +966,7 @@
struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
struct dm_crypt_request *dmreq;
u8 *iv, *org_iv, *tag_iv, *tag;
- uint64_t *sector;
+ __le64 *sector;
int r = 0;
BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
@@ -1146,9 +1038,11 @@
r = crypto_aead_decrypt(req);
}
- if (r == -EBADMSG)
- DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+ if (r == -EBADMSG) {
+ char b[BDEVNAME_SIZE];
+ DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b),
(unsigned long long)le64_to_cpu(*sector));
+ }
if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
@@ -1169,7 +1063,7 @@
struct scatterlist *sg_in, *sg_out;
struct dm_crypt_request *dmreq;
u8 *iv, *org_iv, *tag_iv;
- uint64_t *sector;
+ __le64 *sector;
int r = 0;
/* Reject unexpected unaligned bio. */
@@ -1445,10 +1339,10 @@
static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
{
- unsigned int i;
struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, clone, i) {
+ bio_for_each_segment_all(bv, clone, iter_all) {
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, &cc->page_pool);
}
@@ -1791,7 +1685,8 @@
error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq);
if (error == -EBADMSG) {
- DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+ char b[BDEVNAME_SIZE];
+ DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b),
(unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
io->error = BLK_STS_PROTECTION;
} else if (error < 0)
@@ -1885,6 +1780,13 @@
}
}
+ /*
+ * dm-crypt performance can vary greatly depending on which crypto
+ * algorithm implementation is used. Help people debug performance
+ * problems by logging the ->cra_driver_name.
+ */
+ DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode,
+ crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name);
return 0;
}
@@ -1903,6 +1805,8 @@
return err;
}
+ DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode,
+ crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name);
return 0;
}
@@ -2148,6 +2052,14 @@
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
get_random_bytes(&cc->key, cc->key_size);
+
+ /* Wipe IV private keys */
+ if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
+ r = cc->iv_gen_ops->wipe(cc);
+ if (r)
+ return r;
+ }
+
kzfree(cc->key_string);
cc->key_string = NULL;
r = crypt_setkey(cc);
@@ -2158,7 +2070,7 @@
static void crypt_calculate_pages_per_client(void)
{
- unsigned long pages = (totalram_pages - totalhigh_pages) * DM_CRYPT_MEMORY_PERCENT / 100;
+ unsigned long pages = (totalram_pages() - totalhigh_pages()) * DM_CRYPT_MEMORY_PERCENT / 100;
if (!dm_crypt_clients_n)
return;
@@ -2227,7 +2139,6 @@
if (cc->dev)
dm_put_device(ti, cc->dev);
- kzfree(cc->cipher);
kzfree(cc->cipher_string);
kzfree(cc->key_string);
kzfree(cc->cipher_auth);
@@ -2278,6 +2189,8 @@
cc->iv_gen_ops = &crypt_iv_benbi_ops;
else if (strcmp(ivmode, "null") == 0)
cc->iv_gen_ops = &crypt_iv_null_ops;
+ else if (strcmp(ivmode, "eboiv") == 0)
+ cc->iv_gen_ops = &crypt_iv_eboiv_ops;
else if (strcmp(ivmode, "lmk") == 0) {
cc->iv_gen_ops = &crypt_iv_lmk_ops;
/*
@@ -2307,52 +2220,6 @@
}
/*
- * Workaround to parse cipher algorithm from crypto API spec.
- * The cc->cipher is currently used only in ESSIV.
- * This should be probably done by crypto-api calls (once available...)
- */
-static int crypt_ctr_blkdev_cipher(struct crypt_config *cc)
-{
- const char *alg_name = NULL;
- char *start, *end;
-
- if (crypt_integrity_aead(cc)) {
- alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc)));
- if (!alg_name)
- return -EINVAL;
- if (crypt_integrity_hmac(cc)) {
- alg_name = strchr(alg_name, ',');
- if (!alg_name)
- return -EINVAL;
- }
- alg_name++;
- } else {
- alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc)));
- if (!alg_name)
- return -EINVAL;
- }
-
- start = strchr(alg_name, '(');
- end = strchr(alg_name, ')');
-
- if (!start && !end) {
- cc->cipher = kstrdup(alg_name, GFP_KERNEL);
- return cc->cipher ? 0 : -ENOMEM;
- }
-
- if (!start || !end || ++start >= end)
- return -EINVAL;
-
- cc->cipher = kzalloc(end - start + 1, GFP_KERNEL);
- if (!cc->cipher)
- return -ENOMEM;
-
- strncpy(cc->cipher, start, end - start);
-
- return 0;
-}
-
-/*
* Workaround to parse HMAC algorithm from AEAD crypto API spec.
* The HMAC is needed to calculate tag size (HMAC digest size).
* This should be probably done by crypto-api calls (once available...)
@@ -2395,7 +2262,7 @@
char **ivmode, char **ivopts)
{
struct crypt_config *cc = ti->private;
- char *tmp, *cipher_api;
+ char *tmp, *cipher_api, buf[CRYPTO_MAX_ALG_NAME];
int ret = -EINVAL;
cc->tfms_count = 1;
@@ -2405,13 +2272,48 @@
* capi:cipher_api_spec-iv:ivopts
*/
tmp = &cipher_in[strlen("capi:")];
- cipher_api = strsep(&tmp, "-");
- *ivmode = strsep(&tmp, ":");
- *ivopts = tmp;
+
+ /* Separate IV options if present, it can contain another '-' in hash name */
+ *ivopts = strrchr(tmp, ':');
+ if (*ivopts) {
+ **ivopts = '\0';
+ (*ivopts)++;
+ }
+ /* Parse IV mode */
+ *ivmode = strrchr(tmp, '-');
+ if (*ivmode) {
+ **ivmode = '\0';
+ (*ivmode)++;
+ }
+ /* The rest is crypto API spec */
+ cipher_api = tmp;
+
+ /* Alloc AEAD, can be used only in new format. */
+ if (crypt_integrity_aead(cc)) {
+ ret = crypt_ctr_auth_cipher(cc, cipher_api);
+ if (ret < 0) {
+ ti->error = "Invalid AEAD cipher spec";
+ return -ENOMEM;
+ }
+ }
if (*ivmode && !strcmp(*ivmode, "lmk"))
cc->tfms_count = 64;
+ if (*ivmode && !strcmp(*ivmode, "essiv")) {
+ if (!*ivopts) {
+ ti->error = "Digest algorithm missing for ESSIV mode";
+ return -EINVAL;
+ }
+ ret = snprintf(buf, CRYPTO_MAX_ALG_NAME, "essiv(%s,%s)",
+ cipher_api, *ivopts);
+ if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) {
+ ti->error = "Cannot allocate cipher string";
+ return -ENOMEM;
+ }
+ cipher_api = buf;
+ }
+
cc->key_parts = cc->tfms_count;
/* Allocate cipher */
@@ -2421,23 +2323,11 @@
return ret;
}
- /* Alloc AEAD, can be used only in new format. */
- if (crypt_integrity_aead(cc)) {
- ret = crypt_ctr_auth_cipher(cc, cipher_api);
- if (ret < 0) {
- ti->error = "Invalid AEAD cipher spec";
- return -ENOMEM;
- }
+ if (crypt_integrity_aead(cc))
cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
- } else
+ else
cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
- ret = crypt_ctr_blkdev_cipher(cc);
- if (ret < 0) {
- ti->error = "Cannot allocate cipher string";
- return -ENOMEM;
- }
-
return 0;
}
@@ -2472,16 +2362,9 @@
}
cc->key_parts = cc->tfms_count;
- cc->cipher = kstrdup(cipher, GFP_KERNEL);
- if (!cc->cipher)
- goto bad_mem;
-
chainmode = strsep(&tmp, "-");
- *ivopts = strsep(&tmp, "-");
- *ivmode = strsep(&*ivopts, ":");
-
- if (tmp)
- DMWARN("Ignoring unexpected additional cipher options");
+ *ivmode = strsep(&tmp, ":");
+ *ivopts = tmp;
/*
* For compatibility with the original dm-crypt mapping format, if
@@ -2501,9 +2384,19 @@
if (!cipher_api)
goto bad_mem;
- ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
- "%s(%s)", chainmode, cipher);
- if (ret < 0) {
+ if (*ivmode && !strcmp(*ivmode, "essiv")) {
+ if (!*ivopts) {
+ ti->error = "Digest algorithm missing for ESSIV mode";
+ kfree(cipher_api);
+ return -EINVAL;
+ }
+ ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
+ "essiv(%s(%s),%s)", chainmode, cipher, *ivopts);
+ } else {
+ ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
+ "%s(%s)", chainmode, cipher);
+ }
+ if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) {
kfree(cipher_api);
goto bad_mem;
}
@@ -2661,6 +2554,7 @@
static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct crypt_config *cc;
+ const char *devname = dm_table_device_name(ti->table);
int key_size;
unsigned int align_mask;
unsigned long long tmpll;
@@ -2679,7 +2573,7 @@
return -EINVAL;
}
- cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
+ cc = kzalloc(struct_size(cc, key, key_size), GFP_KERNEL);
if (!cc) {
ti->error = "Cannot allocate encryption context";
return -ENOMEM;
@@ -2780,7 +2674,7 @@
}
ret = -EINVAL;
- if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
+ if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
ti->error = "Invalid device sector";
goto bad;
}
@@ -2806,18 +2700,19 @@
}
ret = -ENOMEM;
- cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
+ 1, devname);
else
- cc->crypt_queue = alloc_workqueue("kcryptd",
- WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
- num_online_cpus());
+ cc->crypt_queue = alloc_workqueue("kcryptd/%s",
+ WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ num_online_cpus(), devname);
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
goto bad;
@@ -2826,7 +2721,7 @@
spin_lock_init(&cc->write_thread_lock);
cc->write_tree = RB_ROOT;
- cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
+ cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write/%s", devname);
if (IS_ERR(cc->write_thread)) {
ret = PTR_ERR(cc->write_thread);
cc->write_thread = NULL;
@@ -3026,14 +2921,8 @@
memset(cc->key, 0, cc->key_size * sizeof(u8));
return ret;
}
- if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
- if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
- ret = cc->iv_gen_ops->wipe(cc);
- if (ret)
- return ret;
- }
+ if (argc == 2 && !strcasecmp(argv[1], "wipe"))
return crypt_wipe_key(cc);
- }
}
error:
@@ -3070,7 +2959,7 @@
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 18, 1},
+ .version = {1, 19, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 2fb7bb4..f496213 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -121,7 +121,8 @@
{
struct delay_c *dc = ti->private;
- destroy_workqueue(dc->kdelayd_wq);
+ if (dc->kdelayd_wq)
+ destroy_workqueue(dc->kdelayd_wq);
if (dc->read.dev)
dm_put_device(ti, dc->read.dev);
@@ -141,7 +142,7 @@
unsigned long long tmpll;
char dummy;
- if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
+ if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
ti->error = "Invalid device sector";
return -EINVAL;
}
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
new file mode 100644
index 0000000..8288887
--- /dev/null
+++ b/drivers/md/dm-dust.c
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * This is a test "dust" device, which fails reads on specified
+ * sectors, emulating the behavior of a hard disk drive sending
+ * a "Read Medium Error" sense.
+ *
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+
+#define DM_MSG_PREFIX "dust"
+
+/* One entry of a dust device's bad block tree. */
+struct badblock {
+ struct rb_node node; /* linkage in the red-black tree rooted at dust_device.badblocklist */
+ sector_t bb; /* block number; the key the tree is ordered by */
+};
+
+/* Per-target state of a "dust" device, allocated by dust_ctr(). */
+struct dust_device {
+ struct dm_dev *dev; /* underlying device, obtained with dm_get_device() */
+ struct rb_root badblocklist; /* tree of struct badblock, keyed by block number */
+ unsigned long long badblock_count; /* number of entries in badblocklist */
+ spinlock_t dust_lock; /* taken around accesses to badblocklist and badblock_count */
+ unsigned int blksz; /* block size in bytes, as passed to the constructor */
+ int sect_per_block_shift; /* __ffs(sect_per_block); shifts a sector to a block number */
+ unsigned int sect_per_block; /* blksz >> SECTOR_SHIFT */
+ sector_t start; /* offset into dev, taken from the <offset> argument */
+ bool fail_read_on_bb:1; /* set by the "enable" message, cleared by "disable" */
+ bool quiet_mode:1; /* toggled by the "quiet" message; silences per-block log lines */
+};
+
+/*
+ * Look up block number @blk in the bad block tree @root.
+ *
+ * Returns the matching entry, or NULL if @blk is not in the tree.
+ */
+static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk)
+{
+	struct rb_node *cur;
+
+	for (cur = root->rb_node; cur; ) {
+		struct badblock *entry = rb_entry(cur, struct badblock, node);
+
+		if (entry->bb == blk)
+			return entry;
+
+		/* Smaller keys are on the left, larger ones on the right. */
+		cur = (entry->bb > blk) ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert @new into the bad block tree @root, keyed by new->bb.
+ *
+ * Returns true if the entry was linked into the tree, false if an entry
+ * with the same block number already exists (the tree is left unchanged
+ * and @new still belongs to the caller).
+ */
+static bool dust_rb_insert(struct rb_root *root, struct badblock *new)
+{
+	struct rb_node **slot = &root->rb_node;
+	struct rb_node *parent = NULL;
+	const sector_t key = new->bb;
+
+	/* Walk down to the empty child slot where the new key belongs. */
+	while (*slot) {
+		struct badblock *cur;
+
+		parent = *slot;
+		cur = rb_entry(parent, struct badblock, node);
+
+		if (cur->bb == key)
+			return false;
+
+		slot = (cur->bb > key) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, slot);
+	rb_insert_color(&new->node, root);
+
+	return true;
+}
+
+/*
+ * Remove @block from the bad block tree of @dd.
+ *
+ * Returns 0 if the block was found and removed, -EINVAL if it was not in
+ * the tree.  Log messages are suppressed in quiet mode.
+ */
+static int dust_remove_block(struct dust_device *dd, unsigned long long block)
+{
+	struct badblock *entry;
+	unsigned long irq_flags;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&dd->dust_lock, irq_flags);
+
+	entry = dust_rb_search(&dd->badblocklist, block);
+	if (entry) {
+		rb_erase(&entry->node, &dd->badblocklist);
+		dd->badblock_count--;
+		if (!dd->quiet_mode)
+			DMINFO("%s: badblock removed at block %llu", __func__, block);
+		kfree(entry);
+		ret = 0;
+	} else if (!dd->quiet_mode) {
+		DMERR("%s: block %llu not found in badblocklist",
+		      __func__, block);
+	}
+
+	spin_unlock_irqrestore(&dd->dust_lock, irq_flags);
+
+	return ret;
+}
+
+/*
+ * Add @block to the bad block tree of @dd.
+ *
+ * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or
+ * -EINVAL if the block is already in the tree.  Log messages are
+ * suppressed in quiet mode.
+ */
+static int dust_add_block(struct dust_device *dd, unsigned long long block)
+{
+	struct badblock *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	unsigned long irq_flags;
+	bool inserted;
+
+	if (!entry) {
+		if (!dd->quiet_mode)
+			DMERR("%s: badblock allocation failed", __func__);
+		return -ENOMEM;
+	}
+
+	spin_lock_irqsave(&dd->dust_lock, irq_flags);
+	entry->bb = block;
+	inserted = dust_rb_insert(&dd->badblocklist, entry);
+	if (inserted) {
+		dd->badblock_count++;
+		if (!dd->quiet_mode)
+			DMINFO("%s: badblock added at block %llu", __func__, block);
+	} else if (!dd->quiet_mode) {
+		DMERR("%s: block %llu already in badblocklist",
+		      __func__, block);
+	}
+	spin_unlock_irqrestore(&dd->dust_lock, irq_flags);
+
+	if (inserted)
+		return 0;
+
+	/* Duplicate key: the tree never took the entry, so release it here. */
+	kfree(entry);
+	return -EINVAL;
+}
+
+/*
+ * Log whether @block is currently in the bad block tree of @dd.
+ *
+ * The answer is only written to the log; the function always returns 0.
+ */
+static int dust_query_block(struct dust_device *dd, unsigned long long block)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&dd->dust_lock, irq_flags);
+	if (!dust_rb_search(&dd->badblocklist, block))
+		DMINFO("%s: block %llu not found in badblocklist", __func__, block);
+	else
+		DMINFO("%s: block %llu found in badblocklist", __func__, block);
+	spin_unlock_irqrestore(&dd->dust_lock, irq_flags);
+
+	return 0;
+}
+
+/*
+ * Decide the fate of a read of block number @thisblock: DM_MAPIO_KILL if
+ * the block is in the bad block tree, DM_MAPIO_REMAPPED otherwise.
+ * dust_map_read() calls this with dd->dust_lock held.
+ */
+static int __dust_map_read(struct dust_device *dd, sector_t thisblock)
+{
+	return dust_rb_search(&dd->badblocklist, thisblock) ?
+		DM_MAPIO_KILL : DM_MAPIO_REMAPPED;
+}
+
+/*
+ * Map a read that starts at sector @thisblock.
+ *
+ * With @fail_read_on_bb clear the read is always remapped.  Otherwise the
+ * sector is converted to a block number and looked up under dd->dust_lock;
+ * a hit yields DM_MAPIO_KILL, a miss DM_MAPIO_REMAPPED.
+ */
+static int dust_map_read(struct dust_device *dd, sector_t thisblock,
+			 bool fail_read_on_bb)
+{
+	unsigned long irq_flags;
+	int verdict;
+
+	if (!fail_read_on_bb)
+		return DM_MAPIO_REMAPPED;
+
+	thisblock >>= dd->sect_per_block_shift;
+
+	spin_lock_irqsave(&dd->dust_lock, irq_flags);
+	verdict = __dust_map_read(dd, thisblock);
+	spin_unlock_irqrestore(&dd->dust_lock, irq_flags);
+
+	return verdict;
+}
+
+/*
+ * Handle a write to block number @thisblock: if the block is in the bad
+ * block tree, remove and free its entry (a write "repairs" the block) and,
+ * unless quiet mode is on, log the removal.
+ * dust_map_write() calls this with dd->dust_lock held.
+ */
+static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
+{
+	struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);
+
+	if (!bblk)
+		return;
+
+	rb_erase(&bblk->node, &dd->badblocklist);
+	dd->badblock_count--;
+	kfree(bblk);
+
+	/*
+	 * thisblock is already a block number (the caller shifted the sector
+	 * by sect_per_block_shift), so it is logged as-is.  Dividing it by
+	 * sect_per_block again would report the wrong block whenever the
+	 * block size is larger than one sector.
+	 */
+	if (!dd->quiet_mode)
+		DMINFO("block %llu removed from badblocklist by write",
+		       (unsigned long long)thisblock);
+}
+
+/*
+ * Map a write that starts at sector @thisblock.
+ *
+ * When @fail_read_on_bb is set, the sector is converted to a block number
+ * and any bad block entry for it is dropped under dd->dust_lock.  The
+ * write itself is always remapped (DM_MAPIO_REMAPPED).
+ */
+static int dust_map_write(struct dust_device *dd, sector_t thisblock,
+			  bool fail_read_on_bb)
+{
+	unsigned long irq_flags;
+
+	if (!fail_read_on_bb)
+		return DM_MAPIO_REMAPPED;
+
+	thisblock >>= dd->sect_per_block_shift;
+
+	spin_lock_irqsave(&dd->dust_lock, irq_flags);
+	__dust_map_write(dd, thisblock);
+	spin_unlock_irqrestore(&dd->dust_lock, irq_flags);
+
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * Map callback: redirect @bio to the underlying device at the sector
+ * offset by dd->start, then let the read or write handler decide the
+ * result.  Every bio that is not a READ takes the write path.
+ */
+static int dust_map(struct dm_target *ti, struct bio *bio)
+{
+	struct dust_device *dd = ti->private;
+	sector_t dev_sector;
+
+	bio_set_dev(bio, dd->dev->bdev);
+	dev_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
+	bio->bi_iter.bi_sector = dev_sector;
+
+	if (bio_data_dir(bio) != READ)
+		return dust_map_write(dd, dev_sector, dd->fail_read_on_bb);
+
+	return dust_map_read(dd, dev_sector, dd->fail_read_on_bb);
+}
+
+/*
+ * Erase and free every entry of the bad block tree @tree.
+ *
+ * @count is the number of entries the caller believes the tree holds; a
+ * mismatch with the number actually freed triggers BUG_ON().
+ *
+ * Returns false if the tree was already empty, true if entries were freed.
+ */
+static bool __dust_clear_badblocks(struct rb_root *tree,
+				   unsigned long long count)
+{
+	struct rb_node *node, *next = rb_first(tree);
+
+	if (next == NULL) {
+		BUG_ON(count != 0);
+		return false;
+	}
+
+	while (next) {
+		node = next;
+		/* Fetch the successor before the current node is unlinked. */
+		next = rb_next(node);
+		rb_erase(node, tree);
+		count--;
+		/*
+		 * Free the containing entry rather than the embedded node, so
+		 * this stays correct wherever 'node' sits in struct badblock.
+		 */
+		kfree(rb_entry(node, struct badblock, node));
+	}
+	BUG_ON(count != 0);
+	BUG_ON(tree->rb_node != NULL);
+
+	return true;
+}
+
+/*
+ * Drop every bad block of @dd and log whether any were present.
+ *
+ * The tree root and its entry count are copied out and reset while
+ * dd->dust_lock is held; the copied tree is freed after the lock has been
+ * released.  Always returns 0.
+ */
+static int dust_clear_badblocks(struct dust_device *dd)
+{
+	struct rb_root detached;
+	unsigned long long detached_count;
+	unsigned long irq_flags;
+	bool had_entries;
+
+	spin_lock_irqsave(&dd->dust_lock, irq_flags);
+	detached = dd->badblocklist;
+	detached_count = dd->badblock_count;
+	dd->badblocklist = RB_ROOT;
+	dd->badblock_count = 0;
+	spin_unlock_irqrestore(&dd->dust_lock, irq_flags);
+
+	had_entries = __dust_clear_badblocks(&detached, detached_count);
+	if (had_entries)
+		DMINFO("%s: badblocks cleared", __func__);
+	else
+		DMINFO("%s: no badblocks found", __func__);
+
+	return 0;
+}
+
+/*
+ * Target parameters:
+ *
+ * <device_path> <offset> <blksz>
+ *
+ * device_path: path to the block device
+ * offset: offset to data area from start of device_path
+ * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2)
+ *
+ * Returns 0 on success, -EINVAL for invalid arguments or a failed device
+ * lookup, -ENOMEM if the context cannot be allocated, or the error returned
+ * by dm_set_target_max_io_len().  On failure the context and the device
+ * reference taken here are released again.
+ */
+static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct dust_device *dd;
+	unsigned long long tmp;
+	char dummy;
+	unsigned int blksz;
+	unsigned int sect_per_block;
+	sector_t DUST_MAX_BLKSZ_SECTORS = 2097152;
+	sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS);
+	int r;
+
+	if (argc != 3) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	if (kstrtouint(argv[2], 10, &blksz) || !blksz) {
+		ti->error = "Invalid block size parameter";
+		return -EINVAL;
+	}
+
+	if (blksz < 512) {
+		ti->error = "Block size must be at least 512";
+		return -EINVAL;
+	}
+
+	if (!is_power_of_2(blksz)) {
+		ti->error = "Block size must be a power of 2";
+		return -EINVAL;
+	}
+
+	/* A block may not exceed the target length or the fixed maximum. */
+	if (to_sector(blksz) > max_block_sectors) {
+		ti->error = "Block size is too large";
+		return -EINVAL;
+	}
+
+	sect_per_block = (blksz >> SECTOR_SHIFT);
+
+	/* The trailing %c rejects input with characters after the number. */
+	if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) {
+		ti->error = "Invalid device offset sector";
+		return -EINVAL;
+	}
+
+	dd = kzalloc(sizeof(*dd), GFP_KERNEL);
+	if (dd == NULL) {
+		ti->error = "Cannot allocate context";
+		return -ENOMEM;
+	}
+
+	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) {
+		ti->error = "Device lookup failed";
+		kfree(dd);
+		return -EINVAL;
+	}
+
+	dd->sect_per_block = sect_per_block;
+	dd->blksz = blksz;
+	dd->start = tmp;
+
+	/* blksz is a power of 2, so the lowest set bit gives log2. */
+	dd->sect_per_block_shift = __ffs(sect_per_block);
+
+	/*
+	 * Whether to fail a read on a "bad" block.
+	 * Defaults to false; enabled later by message.
+	 */
+	dd->fail_read_on_bb = false;
+
+	/*
+	 * Initialize bad block list rbtree.
+	 */
+	dd->badblocklist = RB_ROOT;
+	dd->badblock_count = 0;
+	spin_lock_init(&dd->dust_lock);
+
+	dd->quiet_mode = false;
+
+	/*
+	 * Fail the constructor instead of crashing the kernel with BUG_ON()
+	 * if the maximum I/O length is rejected.
+	 */
+	r = dm_set_target_max_io_len(ti, dd->sect_per_block);
+	if (r) {
+		dm_put_device(ti, dd->dev);
+		kfree(dd);
+		return r;
+	}
+
+	ti->num_discard_bios = 1;
+	ti->num_flush_bios = 1;
+	ti->private = dd;
+
+	return 0;
+}
+
+/*
+ * Destructor: frees every remaining bad block entry, releases the
+ * underlying device and frees the per-target context.
+ */
+static void dust_dtr(struct dm_target *ti)
+{
+ struct dust_device *dd = ti->private;
+
+ /* Return value (whether any entries existed) is deliberately unused. */
+ __dust_clear_badblocks(&dd->badblocklist, dd->badblock_count);
+ dm_put_device(ti, dd->dev);
+ kfree(dd);
+}
+
+/*
+ * Message callback.
+ *
+ * One-argument messages: "disable", "enable", "countbadblocks",
+ * "clearbadblocks", "quiet".
+ * Two-argument messages (<message> <block>): "addbadblock",
+ * "removebadblock", "queryblock".
+ *
+ * Returns 0 (or the handler's result) for a recognized message and -EINVAL
+ * for an unknown message, a wrong argument count, an unparsable block
+ * number or a block number that is out of range.  @result_buf and @maxlen
+ * are not used; all output goes to the kernel log.
+ */
+static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
+ char *result_buf, unsigned int maxlen)
+{
+ struct dust_device *dd = ti->private;
+ /* size of the underlying device in sectors */
+ sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+ bool invalid_msg = false;
+ int result = -EINVAL;
+ unsigned long long tmp, block;
+ unsigned long flags;
+ char dummy;
+
+ if (argc == 1) {
+ /* Block messages sent without a block number: log and return -EINVAL. */
+ if (!strcasecmp(argv[0], "addbadblock") ||
+ !strcasecmp(argv[0], "removebadblock") ||
+ !strcasecmp(argv[0], "queryblock")) {
+ DMERR("%s requires an additional argument", argv[0]);
+ } else if (!strcasecmp(argv[0], "disable")) {
+ DMINFO("disabling read failures on bad sectors");
+ dd->fail_read_on_bb = false;
+ result = 0;
+ } else if (!strcasecmp(argv[0], "enable")) {
+ DMINFO("enabling read failures on bad sectors");
+ dd->fail_read_on_bb = true;
+ result = 0;
+ } else if (!strcasecmp(argv[0], "countbadblocks")) {
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ DMINFO("countbadblocks: %llu badblock(s) found",
+ dd->badblock_count);
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+ result = 0;
+ } else if (!strcasecmp(argv[0], "clearbadblocks")) {
+ result = dust_clear_badblocks(dd);
+ } else if (!strcasecmp(argv[0], "quiet")) {
+ /* "quiet" is a toggle: each message flips the current mode. */
+ if (!dd->quiet_mode)
+ dd->quiet_mode = true;
+ else
+ dd->quiet_mode = false;
+ result = 0;
+ } else {
+ invalid_msg = true;
+ }
+ } else if (argc == 2) {
+ /* The trailing %c makes input with extra characters fail the parse. */
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
+ return result;
+
+ block = tmp;
+ /* Convert the device size from sectors to whole blocks. */
+ sector_div(size, dd->sect_per_block);
+ /*
+ * NOTE(review): block == size is accepted here; confirm whether that
+ * is intended (e.g. to cover a partial trailing block).
+ */
+ if (block > size) {
+ DMERR("selected block value out of range");
+ return result;
+ }
+
+ if (!strcasecmp(argv[0], "addbadblock"))
+ result = dust_add_block(dd, block);
+ else if (!strcasecmp(argv[0], "removebadblock"))
+ result = dust_remove_block(dd, block);
+ else if (!strcasecmp(argv[0], "queryblock"))
+ result = dust_query_block(dd, block);
+ else
+ invalid_msg = true;
+
+ } else
+ DMERR("invalid number of arguments '%d'", argc);
+
+ if (invalid_msg)
+ DMERR("unrecognized message '%s' received", argv[0]);
+
+ return result;
+}
+
+/*
+ * Status callback.
+ *
+ * STATUSTYPE_INFO emits "<device> <fail_read_on_bad_block|bypass>
+ * <quiet|verbose>"; STATUSTYPE_TABLE emits the constructor arguments
+ * "<device> <offset> <blksz>".  @status_flags is not used.
+ */
+static void dust_status(struct dm_target *ti, status_type_t type,
+ unsigned int status_flags, char *result, unsigned int maxlen)
+{
+ struct dust_device *dd = ti->private;
+ /* presumably used by DMEMIT() together with result/maxlen - confirm before renaming */
+ unsigned int sz = 0;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%s %s %s", dd->dev->name,
+ dd->fail_read_on_bb ? "fail_read_on_bad_block" : "bypass",
+ dd->quiet_mode ? "quiet" : "verbose");
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %llu %u", dd->dev->name,
+ (unsigned long long)dd->start, dd->blksz);
+ break;
+ }
+}
+
+/*
+ * Hand the underlying block device back through @bdev for ioctl handling.
+ *
+ * Returns 0 when the target starts at offset 0 and is exactly as long as
+ * the underlying device, 1 otherwise.
+ */
+static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+{
+	struct dust_device *dd = ti->private;
+	struct block_device *lower = dd->dev->bdev;
+
+	*bdev = lower;
+
+	/*
+	 * Only pass ioctls through if the device sizes match exactly.
+	 */
+	if (!dd->start &&
+	    ti->len == i_size_read(lower->bd_inode) >> SECTOR_SHIFT)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Invoke @fn once for the single underlying device, describing the range
+ * of ti->len sectors that starts at dd->start, and return its result.
+ */
+static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn,
+				void *data)
+{
+	struct dust_device *dd = ti->private;
+	struct dm_dev *lower = dd->dev;
+	sector_t offset = dd->start;
+
+	return fn(ti, lower, offset, ti->len, data);
+}
+
+/*
+ * Target type for the "dust" target; registered in dm_dust_init() and
+ * unregistered in dm_dust_exit().
+ */
+static struct target_type dust_target = {
+ .name = "dust",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = dust_ctr,
+ .dtr = dust_dtr,
+ .iterate_devices = dust_iterate_devices,
+ .map = dust_map,
+ .message = dust_message,
+ .status = dust_status,
+ .prepare_ioctl = dust_prepare_ioctl,
+};
+
+/*
+ * Module init: register the "dust" target type.
+ * Returns the result of dm_register_target(); a negative result is logged.
+ */
+static int __init dm_dust_init(void)
+{
+	int err;
+
+	err = dm_register_target(&dust_target);
+	if (err >= 0)
+		return err;
+
+	DMERR("dm_register_target failed %d", err);
+	return err;
+}
+
+/* Module exit: unregister the "dust" target type. */
+static void __exit dm_dust_exit(void)
+{
+ dm_unregister_target(&dust_target);
+}
+
+module_init(dm_dust_init);
+module_exit(dm_dust_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dust test target");
+MODULE_AUTHOR("Bryan Gurney <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 8e48920..bdb84b8 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include "dm.h"
#include "persistent-data/dm-transaction-manager.h"
#include "persistent-data/dm-bitset.h"
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 12b5216..3f4139a 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -11,6 +11,7 @@
#define _LINUX_DM_EXCEPTION_STORE
#include <linux/blkdev.h>
+#include <linux/list_bl.h>
#include <linux/device-mapper.h>
/*
@@ -27,7 +28,7 @@
* chunk within the device.
*/
struct dm_exception {
- struct list_head hash_list;
+ struct hlist_bl_node hash_list;
chunk_t old_chunk;
chunk_t new_chunk;
@@ -135,9 +136,8 @@
/*
* Funtions to manipulate consecutive chunks
*/
-# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
-# define DM_CHUNK_CONSECUTIVE_BITS 8
-# define DM_CHUNK_NUMBER_BITS 56
+#define DM_CHUNK_CONSECUTIVE_BITS 8
+#define DM_CHUNK_NUMBER_BITS 56
static inline chunk_t dm_chunk_number(chunk_t chunk)
{
@@ -163,29 +163,6 @@
e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
}
-# else
-# define DM_CHUNK_CONSECUTIVE_BITS 0
-
-static inline chunk_t dm_chunk_number(chunk_t chunk)
-{
- return chunk;
-}
-
-static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
-{
- return 0;
-}
-
-static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
-{
-}
-
-static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
-{
-}
-
-# endif
-
/*
* Return the number of sectors in the device.
*/
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 32aabe2..2900fbd 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -213,7 +213,7 @@
devname = dm_shift_arg(&as);
r = -EINVAL;
- if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
+ if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
ti->error = "Invalid device sector";
goto bad;
}
@@ -287,20 +287,31 @@
static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
{
- unsigned bio_bytes = bio_cur_bytes(bio);
- char *data = bio_data(bio);
+ unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1;
+
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+
+ if (!bio_has_data(bio))
+ return;
/*
- * Overwrite the Nth byte of the data returned.
+ * Overwrite the Nth byte of the bio's data, on whichever page
+ * it falls.
*/
- if (data && bio_bytes >= fc->corrupt_bio_byte) {
- data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
-
- DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
- "(rw=%c bi_opf=%u bi_sector=%llu cur_bytes=%u)\n",
- bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
- (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
- (unsigned long long)bio->bi_iter.bi_sector, bio_bytes);
+ bio_for_each_segment(bvec, bio, iter) {
+ if (bio_iter_len(bio, iter) > corrupt_bio_byte) {
+ char *segment = (page_address(bio_iter_page(bio, iter))
+ + bio_iter_offset(bio, iter));
+ segment[corrupt_bio_byte] = fc->corrupt_bio_value;
+ DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+ "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n",
+ bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+ (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
+ (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size);
+ break;
+ }
+ corrupt_bio_byte -= bio_iter_len(bio, iter);
}
}
@@ -315,10 +326,6 @@
if (bio_op(bio) == REQ_OP_ZONE_RESET)
goto map_bio;
- /* We need to remap reported zones, so remember the BIO iter */
- if (bio_op(bio) == REQ_OP_ZONE_REPORT)
- goto map_bio;
-
/* Are we alive ? */
elapsed = (jiffies - fc->start_time) / HZ;
if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
@@ -380,11 +387,6 @@
if (bio_op(bio) == REQ_OP_ZONE_RESET)
return DM_ENDIO_DONE;
- if (bio_op(bio) == REQ_OP_ZONE_REPORT) {
- dm_remap_zone_report(ti, bio, fc->start);
- return DM_ENDIO_DONE;
- }
-
if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) &&
all_corrupt_bio_flags_match(bio, fc)) {
@@ -457,6 +459,25 @@
return 0;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+/*
+ * Report zones of the underlying device, starting at the sector that
+ * flakey_map_sector() yields for @sector, and pass any returned zones to
+ * dm_remap_zone_report().
+ *
+ * Returns 0 on success or the error from blkdev_report_zones().
+ */
+static int flakey_report_zones(struct dm_target *ti, sector_t sector,
+			       struct blk_zone *zones, unsigned int *nr_zones)
+{
+	struct flakey_c *fc = ti->private;
+	sector_t dev_sector = flakey_map_sector(ti, sector);
+	int err;
+
+	/* Do report and remap it */
+	err = blkdev_report_zones(fc->dev->bdev, dev_sector, zones, nr_zones);
+	if (err)
+		return err;
+
+	/* Nothing to remap when no zones were reported. */
+	if (*nr_zones != 0)
+		dm_remap_zone_report(ti, fc->start, zones, nr_zones);
+
+	return 0;
+}
+#endif
+
static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
{
struct flakey_c *fc = ti->private;
@@ -469,6 +490,7 @@
.version = {1, 5, 0},
#ifdef CONFIG_BLK_DEV_ZONED
.features = DM_TARGET_ZONED_HM,
+ .report_zones = flakey_report_zones,
#endif
.module = THIS_MODULE,
.ctr = flakey_ctr,
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
new file mode 100644
index 0000000..b869316
--- /dev/null
+++ b/drivers/md/dm-init.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * dm-init.c
+ * Copyright (C) 2017 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/device-mapper.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+
+#define DM_MSG_PREFIX "init"
+#define DM_MAX_DEVICES 256
+#define DM_MAX_TARGETS 256
+#define DM_MAX_STR_SIZE 4096
+
+static char *create;
+
+/*
+ * Format: dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+]
+ * Table format: <start_sector> <num_sectors> <target_type> <target_args>
+ *
+ * See Documentation/admin-guide/device-mapper/dm-init.rst for dm-mod.create="..." format
+ * details.
+ */
+
+struct dm_device {
+ struct dm_ioctl dmi;
+ struct dm_target_spec *table[DM_MAX_TARGETS];
+ char *target_args_array[DM_MAX_TARGETS];
+ struct list_head list;
+};
+
+const char * const dm_allowed_targets[] __initconst = {
+ "crypt",
+ "delay",
+ "linear",
+ "snapshot-origin",
+ "striped",
+ "verity",
+};
+
+static int __init dm_verify_target_type(const char *target)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(dm_allowed_targets); i++) {
+ if (!strcmp(dm_allowed_targets[i], target))
+ return 0;
+ }
+ return -EINVAL;
+}
+
+static void __init dm_setup_cleanup(struct list_head *devices)
+{
+ struct dm_device *dev, *tmp;
+ unsigned int i;
+
+ list_for_each_entry_safe(dev, tmp, devices, list) {
+ list_del(&dev->list);
+ for (i = 0; i < dev->dmi.target_count; i++) {
+ kfree(dev->table[i]);
+ kfree(dev->target_args_array[i]);
+ }
+ kfree(dev);
+ }
+}
+
+/**
+ * str_field_delimit - delimit a string based on a separator char.
+ * @str: the pointer to the string to delimit.
+ * @separator: char that delimits the field
+ *
+ * Find a @separator and replace it by '\0'.
+ * Remove leading and trailing spaces.
+ * Return the string after the @separator, or NULL if there is none.
+ */
+static char __init *str_field_delimit(char **str, char separator)
+{
+ char *s;
+
+ /* TODO: add support for escaped characters */
+ *str = skip_spaces(*str);
+ s = strchr(*str, separator);
+ /* Delimit the field and remove trailing spaces */
+ if (s)
+ *s = '\0';
+ *str = strim(*str);
+ return s ? ++s : NULL;
+}
+
+/**
+ * dm_parse_table_entry - parse a table entry
+ * @dev: device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ * <start_sector> <num_sectors> <target_type> <target_args>[, ...]
+ *
+ * Return the remainder string after the table entry, i.e., after the comma
+ * which delimits the entry, or NULL if the end of the string was reached.
+ */
+static char __init *dm_parse_table_entry(struct dm_device *dev, char *str)
+{
+ const unsigned int n = dev->dmi.target_count - 1;
+ struct dm_target_spec *sp;
+ unsigned int i;
+ /* fields: */
+ char *field[4];
+ char *next;
+
+ field[0] = str;
+ /* Delimit first 3 fields that are separated by space */
+ for (i = 0; i < ARRAY_SIZE(field) - 1; i++) {
+ field[i + 1] = str_field_delimit(&field[i], ' ');
+ if (!field[i + 1])
+ return ERR_PTR(-EINVAL);
+ }
+ /* Delimit last field that can be terminated by comma */
+ next = str_field_delimit(&field[i], ',');
+
+ sp = kzalloc(sizeof(*sp), GFP_KERNEL);
+ if (!sp)
+ return ERR_PTR(-ENOMEM);
+ dev->table[n] = sp;
+
+ /* start_sector */
+ if (kstrtoull(field[0], 0, &sp->sector_start))
+ return ERR_PTR(-EINVAL);
+ /* num_sector */
+ if (kstrtoull(field[1], 0, &sp->length))
+ return ERR_PTR(-EINVAL);
+ /* target_type */
+ strscpy(sp->target_type, field[2], sizeof(sp->target_type));
+ if (dm_verify_target_type(sp->target_type)) {
+ DMERR("invalid type \"%s\"", sp->target_type);
+ return ERR_PTR(-EINVAL);
+ }
+ /* target_args */
+ dev->target_args_array[n] = kstrndup(field[3], DM_MAX_STR_SIZE,
+ GFP_KERNEL);
+ if (!dev->target_args_array[n])
+ return ERR_PTR(-ENOMEM);
+
+ return next;
+}
+
+/**
+ * dm_parse_table - parse "dm-mod.create=" table field
+ * @dev: device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ * <table>[,<table>+]
+ */
+static int __init dm_parse_table(struct dm_device *dev, char *str)
+{
+ char *table_entry = str;
+
+ while (table_entry) {
+ DMDEBUG("parsing table \"%s\"", str);
+ if (++dev->dmi.target_count > DM_MAX_TARGETS) {
+ DMERR("too many targets %u > %d",
+ dev->dmi.target_count, DM_MAX_TARGETS);
+ return -EINVAL;
+ }
+ table_entry = dm_parse_table_entry(dev, table_entry);
+ if (IS_ERR(table_entry)) {
+ DMERR("couldn't parse table");
+ return PTR_ERR(table_entry);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * dm_parse_device_entry - parse a device entry
+ * @dev: device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ * name,uuid,minor,flags,table[; ...]
+ *
+ * Return the remainder string after the device entry, i.e., after the
+ * semicolon which delimits the entry, or NULL if at the end of the string.
+ */
+static char __init *dm_parse_device_entry(struct dm_device *dev, char *str)
+{
+ /* There are 5 fields: name,uuid,minor,flags,table; */
+ char *field[5];
+ unsigned int i;
+ char *next;
+
+ field[0] = str;
+ /* Delimit first 4 fields that are separated by comma */
+ for (i = 0; i < ARRAY_SIZE(field) - 1; i++) {
+ field[i+1] = str_field_delimit(&field[i], ',');
+ if (!field[i+1])
+ return ERR_PTR(-EINVAL);
+ }
+ /* Delimit last field that can be delimited by semi-colon */
+ next = str_field_delimit(&field[i], ';');
+
+ /* name */
+ strscpy(dev->dmi.name, field[0], sizeof(dev->dmi.name));
+ /* uuid */
+ strscpy(dev->dmi.uuid, field[1], sizeof(dev->dmi.uuid));
+ /* minor */
+ if (strlen(field[2])) {
+ if (kstrtoull(field[2], 0, &dev->dmi.dev))
+ return ERR_PTR(-EINVAL);
+ dev->dmi.flags |= DM_PERSISTENT_DEV_FLAG;
+ }
+ /* flags */
+ if (!strcmp(field[3], "ro"))
+ dev->dmi.flags |= DM_READONLY_FLAG;
+ else if (strcmp(field[3], "rw"))
+ return ERR_PTR(-EINVAL);
+ /* table */
+ if (dm_parse_table(dev, field[4]))
+ return ERR_PTR(-EINVAL);
+
+ return next;
+}
+
+/**
+ * dm_parse_devices - parse "dm-mod.create=" argument
+ * @devices: list of struct dm_device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ * <device>[;<device>+]
+ */
+static int __init dm_parse_devices(struct list_head *devices, char *str)
+{
+ unsigned long ndev = 0;
+ struct dm_device *dev;
+ char *device = str;
+
+ DMDEBUG("parsing \"%s\"", str);
+ while (device) {
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ list_add_tail(&dev->list, devices);
+
+ if (++ndev > DM_MAX_DEVICES) {
+ DMERR("too many devices %lu > %d",
+ ndev, DM_MAX_DEVICES);
+ return -EINVAL;
+ }
+
+ device = dm_parse_device_entry(dev, device);
+ if (IS_ERR(device)) {
+ DMERR("couldn't parse device");
+ return PTR_ERR(device);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * dm_init_init - parse "dm-mod.create=" argument and configure drivers
+ */
+static int __init dm_init_init(void)
+{
+ struct dm_device *dev;
+ LIST_HEAD(devices);
+ char *str;
+ int r;
+
+ if (!create)
+ return 0;
+
+ if (strlen(create) >= DM_MAX_STR_SIZE) {
+ DMERR("Argument is too big. Limit is %d", DM_MAX_STR_SIZE);
+ return -EINVAL;
+ }
+ str = kstrndup(create, DM_MAX_STR_SIZE, GFP_KERNEL);
+ if (!str)
+ return -ENOMEM;
+
+ r = dm_parse_devices(&devices, str);
+ if (r)
+ goto out;
+
+ DMINFO("waiting for all devices to be available before creating mapped devices");
+ wait_for_device_probe();
+
+ list_for_each_entry(dev, &devices, list) {
+ if (dm_early_create(&dev->dmi, dev->table,
+ dev->target_args_array))
+ break;
+ }
+out:
+ kfree(str);
+ dm_setup_cleanup(&devices);
+ return r;
+}
+
+late_initcall(dm_init_init);
+
+module_param(create, charp, 0);
+MODULE_PARM_DESC(create, "Create a mapped device in early boot");
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index e1fa6ba..dab4446 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -15,6 +15,7 @@
#include <linux/rbtree.h>
#include <linux/delay.h>
#include <linux/random.h>
+#include <linux/reboot.h>
#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/async_tx.h>
@@ -24,6 +25,7 @@
#define DEFAULT_INTERLEAVE_SECTORS 32768
#define DEFAULT_JOURNAL_SIZE_FACTOR 7
+#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768
#define DEFAULT_BUFFER_SECTORS 128
#define DEFAULT_JOURNAL_WATERMARK 50
#define DEFAULT_SYNC_MSEC 10000
@@ -33,6 +35,8 @@
#define METADATA_WORKQUEUE_MAX_ACTIVE 16
#define RECALC_SECTORS 8192
#define RECALC_WRITE_SUPER 16
+#define BITMAP_BLOCK_SIZE 4096 /* don't change it */
+#define BITMAP_FLUSH_INTERVAL (10 * HZ)
/*
* Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -48,6 +52,7 @@
#define SB_MAGIC "integrt"
#define SB_VERSION_1 1
#define SB_VERSION_2 2
+#define SB_VERSION_3 3
#define SB_SECTORS 8
#define MAX_SECTORS_PER_BLOCK 8
@@ -60,12 +65,14 @@
__u64 provided_data_sectors; /* userspace uses this value */
__u32 flags;
__u8 log2_sectors_per_block;
- __u8 pad[3];
+ __u8 log2_blocks_per_bitmap_bit;
+ __u8 pad[2];
__u64 recalc_sector;
};
#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
#define SB_FLAG_RECALCULATING 0x2
+#define SB_FLAG_DIRTY_BITMAP 0x4
#define JOURNAL_ENTRY_ROUNDUP 8
@@ -88,14 +95,10 @@
#if BITS_PER_LONG == 64
#define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0)
-#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
-#elif defined(CONFIG_LBDAF)
-#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
-#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
#else
-#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0)
-#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo)
+#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
#endif
+#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1))
#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2))
@@ -155,9 +158,18 @@
struct workqueue_struct *metadata_wq;
struct superblock *sb;
unsigned journal_pages;
+ unsigned n_bitmap_blocks;
+
struct page_list *journal;
struct page_list *journal_io;
struct page_list *journal_xor;
+ struct page_list *recalc_bitmap;
+ struct page_list *may_write_bitmap;
+ struct bitmap_block_status *bbs;
+ unsigned bitmap_flush_interval;
+ int synchronous_mode;
+ struct bio_list synchronous_bios;
+ struct delayed_work bitmap_flush_work;
struct crypto_skcipher *journal_crypt;
struct scatterlist **journal_scatterlist;
@@ -184,6 +196,7 @@
__s8 log2_metadata_run;
__u8 log2_buffer_sectors;
__u8 sectors_per_block;
+ __u8 log2_blocks_per_bitmap_bit;
unsigned char mode;
int suspending;
@@ -236,17 +249,20 @@
bool journal_uptodate;
bool just_formatted;
+ bool recalculate_flag;
struct alg_spec internal_hash_alg;
struct alg_spec journal_crypt_alg;
struct alg_spec journal_mac_alg;
atomic64_t number_of_mismatches;
+
+ struct notifier_block reboot_notifier;
};
struct dm_integrity_range {
sector_t logical_sector;
- unsigned n_sectors;
+ sector_t n_sectors;
bool waiting;
union {
struct rb_node node;
@@ -292,6 +308,16 @@
struct journal_completion *comp;
};
+struct bitmap_block_status {
+ struct work_struct work;
+ struct dm_integrity_c *ic;
+ unsigned idx;
+ unsigned long *bitmap;
+ struct bio_list bio_queue;
+ spinlock_t bio_queue_lock;
+
+};
+
static struct kmem_cache *journal_io_cache;
#define JOURNAL_IO_MEMPOOL 32
@@ -319,6 +345,14 @@
#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0)
#endif
+static void dm_integrity_prepare(struct request *rq)
+{
+}
+
+static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes)
+{
+}
+
/*
* DM Integrity profile, protection is performed layer above (dm-crypt)
*/
@@ -326,6 +360,8 @@
.name = "DM-DIF-EXT-TAG",
.generate_fn = NULL,
.verify_fn = NULL,
+ .prepare_fn = dm_integrity_prepare,
+ .complete_fn = dm_integrity_complete,
};
static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
@@ -427,7 +463,9 @@
static void sb_set_version(struct dm_integrity_c *ic)
{
- if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+ if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
+ ic->sb->version = SB_VERSION_3;
+ else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
ic->sb->version = SB_VERSION_2;
else
ic->sb->version = SB_VERSION_1;
@@ -448,9 +486,143 @@
io_loc.sector = ic->start;
io_loc.count = SB_SECTORS;
+ if (op == REQ_OP_WRITE)
+ sb_set_version(ic);
+
return dm_io(&io_req, 1, &io_loc, NULL);
}
+#define BITMAP_OP_TEST_ALL_SET 0
+#define BITMAP_OP_TEST_ALL_CLEAR 1
+#define BITMAP_OP_SET 2
+#define BITMAP_OP_CLEAR 3
+
+static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
+ sector_t sector, sector_t n_sectors, int mode)
+{
+ unsigned long bit, end_bit, this_end_bit, page, end_page;
+ unsigned long *data;
+
+ if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
+ DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
+ (unsigned long long)sector,
+ (unsigned long long)n_sectors,
+ ic->sb->log2_sectors_per_block,
+ ic->log2_blocks_per_bitmap_bit,
+ mode);
+ BUG();
+ }
+
+ if (unlikely(!n_sectors))
+ return true;
+
+ bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+ end_bit = (sector + n_sectors - 1) >>
+ (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+
+ page = bit / (PAGE_SIZE * 8);
+ bit %= PAGE_SIZE * 8;
+
+ end_page = end_bit / (PAGE_SIZE * 8);
+ end_bit %= PAGE_SIZE * 8;
+
+repeat:
+ if (page < end_page) {
+ this_end_bit = PAGE_SIZE * 8 - 1;
+ } else {
+ this_end_bit = end_bit;
+ }
+
+ data = lowmem_page_address(bitmap[page].page);
+
+ if (mode == BITMAP_OP_TEST_ALL_SET) {
+ while (bit <= this_end_bit) {
+ if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+ do {
+ if (data[bit / BITS_PER_LONG] != -1)
+ return false;
+ bit += BITS_PER_LONG;
+ } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+ continue;
+ }
+ if (!test_bit(bit, data))
+ return false;
+ bit++;
+ }
+ } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) {
+ while (bit <= this_end_bit) {
+ if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+ do {
+ if (data[bit / BITS_PER_LONG] != 0)
+ return false;
+ bit += BITS_PER_LONG;
+ } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+ continue;
+ }
+ if (test_bit(bit, data))
+ return false;
+ bit++;
+ }
+ } else if (mode == BITMAP_OP_SET) {
+ while (bit <= this_end_bit) {
+ if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+ do {
+ data[bit / BITS_PER_LONG] = -1;
+ bit += BITS_PER_LONG;
+ } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+ continue;
+ }
+ __set_bit(bit, data);
+ bit++;
+ }
+ } else if (mode == BITMAP_OP_CLEAR) {
+ if (!bit && this_end_bit == PAGE_SIZE * 8 - 1)
+ clear_page(data);
+ else while (bit <= this_end_bit) {
+ if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+ do {
+ data[bit / BITS_PER_LONG] = 0;
+ bit += BITS_PER_LONG;
+ } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+ continue;
+ }
+ __clear_bit(bit, data);
+ bit++;
+ }
+ } else {
+ BUG();
+ }
+
+ if (unlikely(page < end_page)) {
+ bit = 0;
+ page++;
+ goto repeat;
+ }
+
+ return true;
+}
+
+static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src)
+{
+ unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
+ unsigned i;
+
+ for (i = 0; i < n_bitmap_pages; i++) {
+ unsigned long *dst_data = lowmem_page_address(dst[i].page);
+ unsigned long *src_data = lowmem_page_address(src[i].page);
+ copy_page(dst_data, src_data);
+ }
+}
+
+static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector)
+{
+ unsigned bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+ unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8);
+
+ BUG_ON(bitmap_block >= ic->n_bitmap_blocks);
+ return &ic->bbs[bitmap_block];
+}
+
static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
bool e, const char *function)
{
@@ -459,8 +631,8 @@
if (unlikely(section >= ic->journal_sections) ||
unlikely(offset >= limit)) {
- printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n",
- function, section, offset, ic->journal_sections, limit);
+ DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)",
+ function, section, offset, ic->journal_sections, limit);
BUG();
}
#endif
@@ -532,7 +704,6 @@
unsigned j, size;
desc->tfm = ic->journal_mac;
- desc->flags = 0;
r = crypto_shash_init(desc);
if (unlikely(r)) {
@@ -559,7 +730,12 @@
}
memset(result + size, 0, JOURNAL_MAC_SIZE - size);
} else {
- __u8 digest[size];
+ __u8 digest[HASH_MAX_DIGESTSIZE];
+
+ if (WARN_ON(size > sizeof(digest))) {
+ dm_integrity_io_error(ic, "digest_size", -EINVAL);
+ goto err;
+ }
r = crypto_shash_final(desc, digest);
if (unlikely(r)) {
dm_integrity_io_error(ic, "crypto_shash_final", r);
@@ -756,12 +932,12 @@
complete_journal_op(comp);
}
-static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
- unsigned n_sections, struct journal_completion *comp)
+static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags,
+ unsigned sector, unsigned n_sectors, struct journal_completion *comp)
{
struct dm_io_request io_req;
struct dm_io_region io_loc;
- unsigned sector, n_sectors, pl_index, pl_offset;
+ unsigned pl_index, pl_offset;
int r;
if (unlikely(dm_integrity_failed(ic))) {
@@ -770,9 +946,6 @@
return;
}
- sector = section * ic->journal_section_sectors;
- n_sectors = n_sections * ic->journal_section_sectors;
-
pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
@@ -805,6 +978,17 @@
}
}
+static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
+ unsigned n_sections, struct journal_completion *comp)
+{
+ unsigned sector, n_sectors;
+
+ sector = section * ic->journal_section_sectors;
+ n_sectors = n_sections * ic->journal_section_sectors;
+
+ rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp);
+}
+
static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
{
struct journal_completion io_comp;
@@ -908,7 +1092,7 @@
static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2)
{
return range1->logical_sector < range2->logical_sector + range2->n_sectors &&
- range2->logical_sector + range2->n_sectors > range2->logical_sector;
+ range1->logical_sector + range1->n_sectors > range2->logical_sector;
}
static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting)
@@ -954,8 +1138,6 @@
struct dm_integrity_range *last_range =
list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry);
struct task_struct *last_range_task;
- if (!ranges_overlap(range, last_range))
- break;
last_range_task = last_range->task;
list_del(&last_range->wait_entry);
if (!add_new_range(ic, last_range, false)) {
@@ -990,6 +1172,12 @@
} while (unlikely(new_range->waiting));
}
+static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+{
+ if (unlikely(!add_new_range(ic, new_range, true)))
+ wait_and_add_new_range(ic, new_range);
+}
+
static void init_journal_node(struct journal_node *node)
{
RB_CLEAR_NODE(&node->node);
@@ -1117,7 +1305,7 @@
return r;
data = dm_bufio_read(ic->bufio, *metadata_block, &b);
- if (unlikely(IS_ERR(data)))
+ if (IS_ERR(data))
return PTR_ERR(data);
to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
@@ -1206,6 +1394,14 @@
int r = dm_integrity_failed(ic);
if (unlikely(r) && !bio->bi_status)
bio->bi_status = errno_to_blk_status(r);
+ if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) {
+ unsigned long flags;
+ spin_lock_irqsave(&ic->endio_wait.lock, flags);
+ bio_list_add(&ic->synchronous_bios, bio);
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
+ spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
+ return;
+ }
bio_endio(bio);
}
@@ -1273,7 +1469,6 @@
unsigned digest_size;
req->tfm = ic->internal_hash;
- req->flags = 0;
r = crypto_shash_init(req);
if (unlikely(r < 0)) {
@@ -1324,7 +1519,7 @@
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
char *checksums;
unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
- char checksums_onstack[ic->tag_size + extra_space];
+ char checksums_onstack[HASH_MAX_DIGESTSIZE];
unsigned sectors_to_process = dio->range.n_sectors;
sector_t sector = dio->range.logical_sector;
@@ -1333,8 +1528,14 @@
checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
- if (!checksums)
+ if (!checksums) {
checksums = checksums_onstack;
+ if (WARN_ON(extra_space &&
+ digest_size > sizeof(checksums_onstack))) {
+ r = -EINVAL;
+ goto error;
+ }
+ }
__bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) {
unsigned pos;
@@ -1357,8 +1558,8 @@
checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
if (unlikely(r)) {
if (r > 0) {
- DMERR("Checksum failed at sector 0x%llx",
- (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
+ DMERR_LIMIT("Checksum failed at sector 0x%llx",
+ (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
r = -EILSEQ;
atomic64_inc(&ic->number_of_mismatches);
}
@@ -1474,7 +1675,8 @@
else
wanted_tag_size *= ic->tag_size;
if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
- DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size);
+ DMERR("Invalid integrity data size %u, expected %u",
+ bip->bip_iter.bi_size, wanted_tag_size);
return DM_MAPIO_KILL;
}
}
@@ -1546,12 +1748,12 @@
} while (++s < ic->sectors_per_block);
#ifdef INTERNAL_VERIFY
if (ic->internal_hash) {
- char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
+ char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
- DMERR("Checksum failed when reading from journal, at sector 0x%llx",
- (unsigned long long)logical_sector);
+ DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
+ (unsigned long long)logical_sector);
}
}
#endif
@@ -1596,7 +1798,7 @@
if (ic->internal_hash) {
unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
if (unlikely(digest_size > ic->tag_size)) {
- char checksums_onstack[digest_size];
+ char checksums_onstack[HASH_MAX_DIGESTSIZE];
integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
} else
@@ -1678,7 +1880,7 @@
unsigned ws, we, range_sectors;
dio->range.n_sectors = min(dio->range.n_sectors,
- ic->free_sectors << ic->sb->log2_sectors_per_block);
+ (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block);
if (unlikely(!dio->range.n_sectors)) {
if (from_map)
goto offload_to_thread;
@@ -1751,7 +1953,22 @@
queue_work(ic->wait_wq, &dio->work);
return;
}
+ if (journal_read_pos != NOT_FOUND)
+ dio->range.n_sectors = ic->sectors_per_block;
wait_and_add_new_range(ic, &dio->range);
+ /*
+ * wait_and_add_new_range drops the spinlock, so the journal
+ * may have been changed arbitrarily. We need to recheck.
+ * To simplify the code, we restrict I/O size to just one block.
+ */
+ if (journal_read_pos != NOT_FOUND) {
+ sector_t next_sector;
+ unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
+ if (unlikely(new_pos != journal_read_pos)) {
+ remove_range_unlocked(ic, &dio->range);
+ goto retry;
+ }
+ }
}
spin_unlock_irq(&ic->endio_wait.lock);
@@ -1761,6 +1978,20 @@
goto journal_read_write;
}
+ if (ic->mode == 'B' && dio->write) {
+ if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
+ struct bitmap_block_status *bbs;
+
+ bbs = sector_to_bitmap_block(ic, dio->range.logical_sector);
+ spin_lock(&bbs->bio_queue_lock);
+ bio_list_add(&bbs->bio_queue, bio);
+ spin_unlock(&bbs->bio_queue_lock);
+ queue_work(ic->writer_wq, &bbs->work);
+ return;
+ }
+ }
+
dio->in_flight = (atomic_t)ATOMIC_INIT(2);
if (need_sync_io) {
@@ -1787,10 +2018,15 @@
if (need_sync_io) {
wait_for_completion_io(&read_comp);
- if (unlikely(ic->recalc_wq != NULL) &&
- ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector))
goto skip_check;
+ if (ic->mode == 'B') {
+ if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR))
+ goto skip_check;
+ }
+
if (likely(!bio->bi_status))
integrity_metadata(&dio->work);
else
@@ -1828,8 +2064,16 @@
wraparound_section(ic, &ic->free_section);
ic->n_uncommitted_sections++;
}
- WARN_ON(ic->journal_sections * ic->journal_section_entries !=
- (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors);
+ if (WARN_ON(ic->journal_sections * ic->journal_section_entries !=
+ (ic->n_uncommitted_sections + ic->n_committed_sections) *
+ ic->journal_section_entries + ic->free_sectors)) {
+ DMCRIT("journal_sections %u, journal_section_entries %u, "
+ "n_uncommitted_sections %u, n_committed_sections %u, "
+ "journal_section_entries %u, free_sectors %u",
+ ic->journal_sections, ic->journal_section_entries,
+ ic->n_uncommitted_sections, ic->n_committed_sections,
+ ic->journal_section_entries, ic->free_sectors);
+ }
}
static void integrity_commit(struct work_struct *w)
@@ -1978,8 +2222,7 @@
io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
spin_lock_irq(&ic->endio_wait.lock);
- if (unlikely(!add_new_range(ic, &io->range, true)))
- wait_and_add_new_range(ic, &io->range);
+ add_new_range_and_wait(ic, &io->range);
if (likely(!from_replay)) {
struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
@@ -2023,7 +2266,7 @@
unlikely(from_replay) &&
#endif
ic->internal_hash) {
- char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
+ char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
(char *)access_journal_data(ic, i, l), test_tag);
@@ -2102,7 +2345,6 @@
if (dm_integrity_failed(ic))
return;
- sb_set_version(ic);
r = sync_rw_sb(ic, REQ_OP_WRITE, 0);
if (unlikely(r))
dm_integrity_io_error(ic, "writing superblock", r);
@@ -2117,11 +2359,14 @@
sector_t area, offset;
sector_t metadata_block;
unsigned metadata_offset;
+ sector_t logical_sector, n_sectors;
__u8 *t;
unsigned i;
int r;
unsigned super_counter = 0;
+ DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
+
spin_lock_irq(&ic->endio_wait.lock);
next_chunk:
@@ -2130,21 +2375,49 @@
goto unlock_ret;
range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
- if (unlikely(range.logical_sector >= ic->provided_data_sectors))
+ if (unlikely(range.logical_sector >= ic->provided_data_sectors)) {
+ if (ic->mode == 'B') {
+ DEBUG_print("queue_delayed_work: bitmap_flush_work\n");
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
+ }
goto unlock_ret;
+ }
get_area_and_offset(ic, range.logical_sector, &area, &offset);
range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
if (!ic->meta_dev)
- range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
+ range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
- if (unlikely(!add_new_range(ic, &range, true)))
- wait_and_add_new_range(ic, &range);
-
+ add_new_range_and_wait(ic, &range);
spin_unlock_irq(&ic->endio_wait.lock);
+ logical_sector = range.logical_sector;
+ n_sectors = range.n_sectors;
+
+ if (ic->mode == 'B') {
+ if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) {
+ goto advance_and_next;
+ }
+ while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector,
+ ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
+ logical_sector += ic->sectors_per_block;
+ n_sectors -= ic->sectors_per_block;
+ cond_resched();
+ }
+ while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block,
+ ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
+ n_sectors -= ic->sectors_per_block;
+ cond_resched();
+ }
+ get_area_and_offset(ic, logical_sector, &area, &offset);
+ }
+
+ DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors);
if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
recalc_write_super(ic);
+ if (ic->mode == 'B') {
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
+ }
super_counter = 0;
}
@@ -2159,7 +2432,7 @@
io_req.client = ic->io;
io_loc.bdev = ic->dev->bdev;
io_loc.sector = get_data_sector(ic, area, offset);
- io_loc.count = range.n_sectors;
+ io_loc.count = n_sectors;
r = dm_io(&io_req, 1, &io_loc, NULL);
if (unlikely(r)) {
@@ -2168,8 +2441,8 @@
}
t = ic->recalc_tags;
- for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
- integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+ for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
+ integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
t += ic->tag_size;
}
@@ -2181,6 +2454,9 @@
goto err;
}
+advance_and_next:
+ cond_resched();
+
spin_lock_irq(&ic->endio_wait.lock);
remove_range_unlocked(ic, &range);
ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
@@ -2196,6 +2472,103 @@
recalc_write_super(ic);
}
+static void bitmap_block_work(struct work_struct *w) /* work handler for one bitmap block: processes the bios queued on bbs->bio_queue */
+{
+ struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work);
+ struct dm_integrity_c *ic = bbs->ic;
+ struct bio *bio;
+ struct bio_list bio_queue; /* private snapshot of the bios that were pending on entry */
+ struct bio_list waiting; /* bios that must wait for the bitmap block write below */
+
+ bio_list_init(&waiting);
+
+ spin_lock(&bbs->bio_queue_lock); /* detach the whole pending list under the lock; it is processed after unlocking */
+ bio_queue = bbs->bio_queue;
+ bio_list_init(&bbs->bio_queue);
+ spin_unlock(&bbs->bio_queue_lock);
+
+ while ((bio = bio_list_pop(&bio_queue))) {
+ struct dm_integrity_io *dio;
+
+ dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+
+ if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { /* every bit of the range is already set in may_write_bitmap: no bitmap write is needed for this bio */
+ remove_range(ic, &dio->range);
+ INIT_WORK(&dio->work, integrity_bio_wait); /* hand the bio to integrity_bio_wait on wait_wq */
+ queue_work(ic->wait_wq, &dio->work);
+ } else {
+ block_bitmap_op(ic, ic->journal, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_SET); /* set the range's bits in the ic->journal bitmap */
+ bio_list_add(&waiting, bio); /* keep the bio back until the write below has been issued and returned */
+ }
+ }
+
+ if (bio_list_empty(&waiting)) /* no bits were newly set, so there is nothing to write */
+ return;
+
+ rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC,
+ bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT),
+ BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); /* write the sectors of this bitmap block (selected by bbs->idx) with REQ_FUA | REQ_SYNC */
+
+ while ((bio = bio_list_pop(&waiting))) { /* release the held-back bios now that the write call has returned */
+ struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+
+ block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_SET); /* later bios for this range will take the TEST_ALL_SET path above */
+
+ remove_range(ic, &dio->range);
+ INIT_WORK(&dio->work, integrity_bio_wait);
+ queue_work(ic->wait_wq, &dio->work);
+ }
+
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); /* schedule bitmap_flush_work after bitmap_flush_interval */
+}
+
+static void bitmap_flush_work(struct work_struct *work) /* delayed-work handler: flushes buffers, clears bitmap bits up to a limit and writes all bitmap blocks */
+{
+ struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work);
+ struct dm_integrity_range range;
+ unsigned long limit; /* bits for sectors [0, limit) are cleared below */
+ struct bio *bio;
+
+ dm_integrity_flush_buffers(ic);
+
+ range.logical_sector = 0; /* the range spans the whole provided data area */
+ range.n_sectors = ic->provided_data_sectors;
+
+ spin_lock_irq(&ic->endio_wait.lock);
+ add_new_range_and_wait(ic, &range); /* NOTE(review): presumably blocks until no conflicting range is in progress — confirm in add_new_range_and_wait */
+ spin_unlock_irq(&ic->endio_wait.lock);
+
+ dm_integrity_flush_buffers(ic); /* flushed a second time, now with the whole-device range held */
+ if (ic->meta_dev) /* with a separate metadata device, the data device is flushed explicitly as well */
+ blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL);
+
+ limit = ic->provided_data_sectors;
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { /* while recalculating, only clear bits below the recalculation position */
+ limit = le64_to_cpu(ic->sb->recalc_sector)
+ >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)
+ << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); /* shift right then left: rounds recalc_sector down to a multiple of the sectors covered by one bitmap bit */
+ }
+ /*DEBUG_print("zeroing journal\n");*/
+ block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR);
+ block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR);
+
+ rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); /* write all n_bitmap_blocks bitmap blocks, starting at sector 0 */
+
+ spin_lock_irq(&ic->endio_wait.lock);
+ remove_range_unlocked(ic, &range); /* release the whole-device range taken above */
+ while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) { /* complete every bio parked on ic->synchronous_bios */
+ bio_endio(bio);
+ spin_unlock_irq(&ic->endio_wait.lock); /* the lock is dropped and re-taken after each completion */
+ spin_lock_irq(&ic->endio_wait.lock);
+ }
+ spin_unlock_irq(&ic->endio_wait.lock);
+}
+
+
static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
unsigned n_sections, unsigned char commit_seq)
{
@@ -2392,9 +2765,37 @@
init_journal_node(&ic->journal_tree[i]);
}
+static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic) /* switches a bitmap-mode ('B') target to synchronous mode; does nothing in other modes */
+{
+ DEBUG_print("dm_integrity_enter_synchronous_mode\n");
+
+ if (ic->mode == 'B') {
+ ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1; /* shorten the flush interval to 10 ms plus one jiffy */
+ ic->synchronous_mode = 1;
+
+ cancel_delayed_work_sync(&ic->bitmap_flush_work); /* drop any flush that is still pending with the old delay */
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); /* queue a flush with zero delay */
+ flush_workqueue(ic->commit_wq); /* wait for the work queued on commit_wq to finish */
+ }
+}
+
+static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x) /* notifier callback for ic->reboot_notifier; code and x are unused */
+{
+ struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier);
+
+ DEBUG_print("dm_integrity_reboot\n");
+
+ dm_integrity_enter_synchronous_mode(ic); /* only has an effect for bitmap-mode targets */
+
+ return NOTIFY_DONE;
+}
+
static void dm_integrity_postsuspend(struct dm_target *ti)
{
struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+ int r;
+
+ WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier));
del_timer_sync(&ic->autocommit_timer);
@@ -2403,6 +2804,9 @@
if (ic->recalc_wq)
drain_workqueue(ic->recalc_wq);
+ if (ic->mode == 'B')
+ cancel_delayed_work_sync(&ic->bitmap_flush_work);
+
queue_work(ic->commit_wq, &ic->commit_work);
drain_workqueue(ic->commit_wq);
@@ -2413,6 +2817,18 @@
dm_integrity_flush_buffers(ic);
}
+ if (ic->mode == 'B') {
+ dm_integrity_flush_buffers(ic);
+#if 1
+ /* set to 0 to test bitmap replay code */
+ init_journal(ic, 0, ic->journal_sections, 0);
+ ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+ r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+ if (unlikely(r))
+ dm_integrity_io_error(ic, "writing superblock", r);
+#endif
+ }
+
WRITE_ONCE(ic->suspending, 0);
BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
@@ -2423,11 +2839,70 @@
static void dm_integrity_resume(struct dm_target *ti)
{
struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+ int r;
+ DEBUG_print("resume\n");
- replay_journal(ic);
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
+ DEBUG_print("resume dirty_bitmap\n");
+ rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+ if (ic->mode == 'B') {
+ if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
+ block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal);
+ block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal);
+ if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors,
+ BITMAP_OP_TEST_ALL_CLEAR)) {
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+ ic->sb->recalc_sector = cpu_to_le64(0);
+ }
+ } else {
+ DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n",
+ ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit);
+ ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
+ block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
+ block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
+ block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET);
+ rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+ ic->sb->recalc_sector = cpu_to_le64(0);
+ }
+ } else {
+ if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
+ block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) {
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+ ic->sb->recalc_sector = cpu_to_le64(0);
+ }
+ init_journal(ic, 0, ic->journal_sections, 0);
+ replay_journal(ic);
+ ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+ }
+ r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+ if (unlikely(r))
+ dm_integrity_io_error(ic, "writing superblock", r);
+ } else {
+ replay_journal(ic);
+ if (ic->mode == 'B') {
+ int mode;
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+ ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
+ r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+ if (unlikely(r))
+ dm_integrity_io_error(ic, "writing superblock", r);
- if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
+ mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR;
+ block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode);
+ block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode);
+ block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode);
+ rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+ }
+ }
+
+ DEBUG_print("testing recalc: %x\n", ic->sb->flags);
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
__u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
+ DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors);
if (recalc_pos < ic->provided_data_sectors) {
queue_work(ic->recalc_wq, &ic->recalc_work);
} else if (recalc_pos > ic->provided_data_sectors) {
@@ -2435,6 +2910,16 @@
recalc_write_super(ic);
}
}
+
+ ic->reboot_notifier.notifier_call = dm_integrity_reboot;
+ ic->reboot_notifier.next = NULL;
+ ic->reboot_notifier.priority = INT_MAX - 1; /* be notified after md and before hardware drivers */
+ WARN_ON(register_reboot_notifier(&ic->reboot_notifier));
+
+#if 0
+ /* set to 1 to stress test synchronous mode */
+ dm_integrity_enter_synchronous_mode(ic);
+#endif
}
static void dm_integrity_status(struct dm_target *ti, status_type_t type,
@@ -2459,10 +2944,14 @@
__u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
watermark_percentage += ic->journal_entries / 2;
do_div(watermark_percentage, ic->journal_entries);
- arg_count = 5;
+ arg_count = 3;
arg_count += !!ic->meta_dev;
arg_count += ic->sectors_per_block != 1;
arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
+ arg_count += ic->mode == 'J';
+ arg_count += ic->mode == 'J';
+ arg_count += ic->mode == 'B';
+ arg_count += ic->mode == 'B';
arg_count += !!ic->internal_hash_alg.alg_string;
arg_count += !!ic->journal_crypt_alg.alg_string;
arg_count += !!ic->journal_mac_alg.alg_string;
@@ -2472,13 +2961,19 @@
DMEMIT(" meta_device:%s", ic->meta_dev->name);
if (ic->sectors_per_block != 1)
DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
- if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+ if (ic->recalculate_flag)
DMEMIT(" recalculate");
DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
- DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
- DMEMIT(" commit_time:%u", ic->autocommit_msec);
+ if (ic->mode == 'J') {
+ DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
+ DMEMIT(" commit_time:%u", ic->autocommit_msec);
+ }
+ if (ic->mode == 'B') {
+ DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
+ DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
+ }
#define EMIT_ALG(a, n) \
do { \
@@ -2559,7 +3054,7 @@
if (last_sector < ic->start || last_sector >= ic->meta_device_sectors)
return -EINVAL;
} else {
- __u64 meta_size = ic->provided_data_sectors * ic->tag_size;
+ __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1))
>> (ic->log2_buffer_sectors + SECTOR_SHIFT);
meta_size <<= ic->log2_buffer_sectors;
@@ -2656,37 +3151,37 @@
blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
}
-static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl)
+static void dm_integrity_free_page_list(struct page_list *pl)
{
unsigned i;
if (!pl)
return;
- for (i = 0; i < ic->journal_pages; i++)
- if (pl[i].page)
- __free_page(pl[i].page);
+ for (i = 0; pl[i].page; i++)
+ __free_page(pl[i].page);
kvfree(pl);
}
-static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic)
+static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages)
{
- size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list);
struct page_list *pl;
unsigned i;
- pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO);
+ pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO);
if (!pl)
return NULL;
- for (i = 0; i < ic->journal_pages; i++) {
+ for (i = 0; i < n_pages; i++) {
pl[i].page = alloc_page(GFP_KERNEL);
if (!pl[i].page) {
- dm_integrity_free_page_list(ic, pl);
+ dm_integrity_free_page_list(pl);
return NULL;
}
if (i)
pl[i - 1].next = &pl[i];
}
+ pl[i].page = NULL;
+ pl[i].next = NULL;
return pl;
}
@@ -2699,7 +3194,8 @@
kvfree(sl);
}
-static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl)
+static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic,
+ struct page_list *pl)
{
struct scatterlist **sl;
unsigned i;
@@ -2718,7 +3214,8 @@
unsigned idx;
page_list_location(ic, i, 0, &start_index, &start_offset);
- page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset);
+ page_list_location(ic, i, ic->journal_section_sectors - 1,
+ &end_index, &end_offset);
n_pages = (end_index - start_index + 1);
@@ -2793,7 +3290,7 @@
int r;
if (a->alg_string) {
- *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC);
+ *hash = crypto_alloc_shash(a->alg_string, 0, 0);
if (IS_ERR(*hash)) {
*error = error_alg;
r = PTR_ERR(*hash);
@@ -2832,14 +3329,14 @@
journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors,
PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT);
journal_desc_size = journal_pages * sizeof(struct page_list);
- if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) {
+ if (journal_pages >= totalram_pages() - totalhigh_pages() || journal_desc_size > ULONG_MAX) {
*error = "Journal doesn't fit into memory";
r = -ENOMEM;
goto bad;
}
ic->journal_pages = journal_pages;
- ic->journal = dm_integrity_alloc_page_list(ic);
+ ic->journal = dm_integrity_alloc_page_list(ic->journal_pages);
if (!ic->journal) {
*error = "Could not allocate memory for journal";
r = -ENOMEM;
@@ -2871,7 +3368,7 @@
DEBUG_print("cipher %s, block size %u iv size %u\n",
ic->journal_crypt_alg.alg_string, blocksize, ivsize);
- ic->journal_io = dm_integrity_alloc_page_list(ic);
+ ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages);
if (!ic->journal_io) {
*error = "Could not allocate memory for journal io";
r = -ENOMEM;
@@ -2888,14 +3385,14 @@
goto bad;
}
- crypt_iv = kmalloc(ivsize, GFP_KERNEL);
+ crypt_iv = kzalloc(ivsize, GFP_KERNEL);
if (!crypt_iv) {
*error = "Could not allocate iv";
r = -ENOMEM;
goto bad;
}
- ic->journal_xor = dm_integrity_alloc_page_list(ic);
+ ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages);
if (!ic->journal_xor) {
*error = "Could not allocate memory for journal xor";
r = -ENOMEM;
@@ -2917,9 +3414,9 @@
sg_set_buf(&sg[i], va, PAGE_SIZE);
}
sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
- memset(crypt_iv, 0x00, ivsize);
- skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
+ skcipher_request_set_crypt(req, sg, sg,
+ PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
init_completion(&comp.comp);
comp.in_flight = (atomic_t)ATOMIC_INIT(1);
if (do_crypt(true, req, &comp))
@@ -3060,7 +3557,7 @@
* device
* offset from the start of the device
* tag size
- * D - direct writes, J - journal writes, R - recovery mode
+ * D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode
* number of optional arguments
* optional arguments:
* journal_sectors
@@ -3068,10 +3565,14 @@
* buffer_sectors
* journal_watermark
* commit_time
+ * meta_device
+ * block_size
+ * sectors_per_bit
+ * bitmap_flush_interval
* internal_hash
* journal_crypt
* journal_mac
- * block_size
+ * recalculate
*/
static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
@@ -3084,10 +3585,13 @@
{0, 9, "Invalid number of feature args"},
};
unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
- bool recalculate;
bool should_write_sb;
__u64 threshold;
unsigned long long start;
+ __s8 log2_sectors_per_bitmap_bit = -1;
+ __s8 log2_blocks_per_bitmap_bit;
+ __u64 bits_in_journal;
+ __u64 n_bitmap_bits;
#define DIRECT_ARGUMENTS 4
@@ -3111,6 +3615,7 @@
init_waitqueue_head(&ic->copy_to_journal_wait);
init_completion(&ic->crypto_backoff);
atomic64_set(&ic->number_of_mismatches, 0);
+ ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL;
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
if (r) {
@@ -3133,10 +3638,11 @@
}
}
- if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R"))
+ if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") ||
+ !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) {
ic->mode = argv[3][0];
- else {
- ti->error = "Invalid mode (expecting J, D, R)";
+ } else {
+ ti->error = "Invalid mode (expecting J, B, D, R)";
r = -EINVAL;
goto bad;
}
@@ -3146,7 +3652,6 @@
buffer_sectors = DEFAULT_BUFFER_SECTORS;
journal_watermark = DEFAULT_JOURNAL_WATERMARK;
sync_msec = DEFAULT_SYNC_MSEC;
- recalculate = false;
ic->sectors_per_block = 1;
as.argc = argc - DIRECT_ARGUMENTS;
@@ -3158,6 +3663,7 @@
while (extra_args--) {
const char *opt_string;
unsigned val;
+ unsigned long long llval;
opt_string = dm_shift_arg(&as);
if (!opt_string) {
r = -EINVAL;
@@ -3174,12 +3680,13 @@
journal_watermark = val;
else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1)
sync_msec = val;
- else if (!memcmp(opt_string, "meta_device:", strlen("meta_device:"))) {
+ else if (!strncmp(opt_string, "meta_device:", strlen("meta_device:"))) {
if (ic->meta_dev) {
dm_put_device(ti, ic->meta_dev);
ic->meta_dev = NULL;
}
- r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev);
+ r = dm_get_device(ti, strchr(opt_string, ':') + 1,
+ dm_table_get_mode(ti->table), &ic->meta_dev);
if (r) {
ti->error = "Device lookup failed";
goto bad;
@@ -3193,23 +3700,31 @@
goto bad;
}
ic->sectors_per_block = val >> SECTOR_SHIFT;
- } else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
+ } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
+ log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
+ } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
+ if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
+ r = -EINVAL;
+ ti->error = "Invalid bitmap_flush_interval argument";
+ }
+ ic->bitmap_flush_interval = msecs_to_jiffies(val);
+ } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
"Invalid internal_hash argument");
if (r)
goto bad;
- } else if (!memcmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
+ } else if (!strncmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error,
"Invalid journal_crypt argument");
if (r)
goto bad;
- } else if (!memcmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
+ } else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
"Invalid journal_mac argument");
if (r)
goto bad;
} else if (!strcmp(opt_string, "recalculate")) {
- recalculate = true;
+ ic->recalculate_flag = true;
} else {
r = -EINVAL;
ti->error = "Invalid argument";
@@ -3225,7 +3740,7 @@
if (!journal_sectors) {
journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
- ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
+ ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
}
if (!buffer_sectors)
@@ -3260,6 +3775,12 @@
else
ic->log2_tag_size = -1;
+ if (ic->mode == 'B' && !ic->internal_hash) {
+ r = -EINVAL;
+ ti->error = "Bitmap mode can be only used with internal hash";
+ goto bad;
+ }
+
ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
ic->autocommit_msec = sync_msec;
timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
@@ -3305,7 +3826,7 @@
}
INIT_WORK(&ic->commit_work, integrity_commit);
- if (ic->mode == 'J') {
+ if (ic->mode == 'J' || ic->mode == 'B') {
ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
if (!ic->writer_wq) {
ti->error = "Cannot allocate workqueue";
@@ -3346,7 +3867,7 @@
should_write_sb = true;
}
- if (!ic->sb->version || ic->sb->version > SB_VERSION_2) {
+ if (!ic->sb->version || ic->sb->version > SB_VERSION_3) {
r = -EINVAL;
ti->error = "Unknown version";
goto bad;
@@ -3406,6 +3927,27 @@
ti->error = "The device is too small";
goto bad;
}
+
+ if (log2_sectors_per_bitmap_bit < 0)
+ log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT);
+ if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block)
+ log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block;
+
+ bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3);
+ if (bits_in_journal > UINT_MAX)
+ bits_in_journal = UINT_MAX;
+ while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
+ log2_sectors_per_bitmap_bit++;
+
+ log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block;
+ ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
+ if (should_write_sb) {
+ ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
+ }
+ n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block)
+ + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit;
+ ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8);
+
if (!ic->meta_dev)
ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run));
@@ -3430,26 +3972,22 @@
DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
DEBUG_print(" journal_entries %u\n", ic->journal_entries);
DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
- DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors);
+ DEBUG_print(" data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT);
DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors);
DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run);
DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run);
DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
(unsigned long long)ic->provided_data_sectors);
DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
+ DEBUG_print(" bits_in_journal %llu\n", (unsigned long long)bits_in_journal);
- if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
+ if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
ic->sb->recalc_sector = cpu_to_le64(0);
}
- if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
- if (!ic->internal_hash) {
- r = -EINVAL;
- ti->error = "Recalculate is only valid with internal hash";
- goto bad;
- }
- ic->recalc_wq = alloc_workqueue("dm-intergrity-recalc", WQ_MEM_RECLAIM, 1);
+ if (ic->internal_hash) {
+ ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
if (!ic->recalc_wq ) {
ti->error = "Cannot allocate workqueue";
r = -ENOMEM;
@@ -3485,6 +4023,45 @@
r = create_journal(ic, &ti->error);
if (r)
goto bad;
+
+ }
+
+ if (ic->mode == 'B') {
+ unsigned i;
+ unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
+
+ ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
+ if (!ic->recalc_bitmap) {
+ r = -ENOMEM;
+ goto bad;
+ }
+ ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
+ if (!ic->may_write_bitmap) {
+ r = -ENOMEM;
+ goto bad;
+ }
+ ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
+ if (!ic->bbs) {
+ r = -ENOMEM;
+ goto bad;
+ }
+ INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work);
+ for (i = 0; i < ic->n_bitmap_blocks; i++) {
+ struct bitmap_block_status *bbs = &ic->bbs[i];
+ unsigned sector, pl_index, pl_offset;
+
+ INIT_WORK(&bbs->work, bitmap_block_work);
+ bbs->ic = ic;
+ bbs->idx = i;
+ bio_list_init(&bbs->bio_queue);
+ spin_lock_init(&bbs->bio_queue_lock);
+
+ sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT);
+ pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+ pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+ bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset;
+ }
}
if (should_write_sb) {
@@ -3509,6 +4086,17 @@
if (r)
goto bad;
}
+ if (ic->mode == 'B') {
+ unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8);
+ if (!max_io_len)
+ max_io_len = 1U << 31;
+ DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len);
+ if (!ti->max_io_len || ti->max_io_len > max_io_len) {
+ r = dm_set_target_max_io_len(ti, max_io_len);
+ if (r)
+ goto bad;
+ }
+ }
if (!ic->internal_hash)
dm_integrity_set(ti, ic);
@@ -3517,6 +4105,7 @@
ti->flush_supported = true;
return 0;
+
bad:
dm_integrity_dtr(ti);
return r;
@@ -3539,10 +4128,9 @@
destroy_workqueue(ic->writer_wq);
if (ic->recalc_wq)
destroy_workqueue(ic->recalc_wq);
- if (ic->recalc_buffer)
- vfree(ic->recalc_buffer);
- if (ic->recalc_tags)
- kvfree(ic->recalc_tags);
+ vfree(ic->recalc_buffer);
+ kvfree(ic->recalc_tags);
+ kvfree(ic->bbs);
if (ic->bufio)
dm_bufio_client_destroy(ic->bufio);
mempool_exit(&ic->journal_io_mempool);
@@ -3552,9 +4140,11 @@
dm_put_device(ti, ic->dev);
if (ic->meta_dev)
dm_put_device(ti, ic->meta_dev);
- dm_integrity_free_page_list(ic, ic->journal);
- dm_integrity_free_page_list(ic, ic->journal_io);
- dm_integrity_free_page_list(ic, ic->journal_xor);
+ dm_integrity_free_page_list(ic->journal);
+ dm_integrity_free_page_list(ic->journal_io);
+ dm_integrity_free_page_list(ic->journal_xor);
+ dm_integrity_free_page_list(ic->recalc_bitmap);
+ dm_integrity_free_page_list(ic->may_write_bitmap);
if (ic->journal_scatterlist)
dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
if (ic->journal_io_scatterlist)
@@ -3592,7 +4182,7 @@
static struct target_type integrity_target = {
.name = "integrity",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
@@ -3605,7 +4195,7 @@
.io_hints = dm_integrity_io_hints,
};
-int __init dm_integrity_init(void)
+static int __init dm_integrity_init(void)
{
int r;
@@ -3624,7 +4214,7 @@
return r;
}
-void dm_integrity_exit(void)
+static void __exit dm_integrity_exit(void)
{
dm_unregister_target(&integrity_target);
kmem_cache_destroy(journal_io_cache);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index f666778..ac83f50 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -601,17 +601,27 @@
info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1);
}
-static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size)
+static int __list_versions(struct dm_ioctl *param, size_t param_size, const char *name)
{
size_t len, needed = 0;
struct dm_target_versions *vers;
struct vers_iter iter_info;
+ struct target_type *tt = NULL;
+
+ if (name) {
+ tt = dm_get_target_type(name);
+ if (!tt)
+ return -EINVAL;
+ }
/*
* Loop through all the devices working out how much
* space we need.
*/
- dm_target_iterate(list_version_get_needed, &needed);
+ if (!tt)
+ dm_target_iterate(list_version_get_needed, &needed);
+ else
+ list_version_get_needed(tt, &needed);
/*
* Grab our output buffer.
@@ -632,13 +642,28 @@
/*
* Now loop through filling out the names & versions.
*/
- dm_target_iterate(list_version_get_info, &iter_info);
+ if (!tt)
+ dm_target_iterate(list_version_get_info, &iter_info);
+ else
+ list_version_get_info(tt, &iter_info);
param->flags |= iter_info.flags;
out:
+ if (tt)
+ dm_put_target_type(tt);
return 0;
}
+static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size) /* ioctl handler: reports versions of all target types; filp is unused */
+{
+ return __list_versions(param, param_size, NULL); /* NULL name makes __list_versions iterate over every target type */
+}
+
+static int get_target_version(struct file *filp, struct dm_ioctl *param, size_t param_size) /* ioctl handler: reports the version of the single target type named in param->name; filp is unused */
+{
+ return __list_versions(param, param_size, param->name); /* __list_versions returns -EINVAL when no target type has this name */
+}
+
static int check_name(const char *name)
{
if (strchr(name, '/')) {
@@ -1592,7 +1617,7 @@
}
ti = dm_table_find_target(table, tmsg->sector);
- if (!dm_target_is_valid(ti)) {
+ if (!ti) {
DMWARN("Target message sector outside device.");
r = -EINVAL;
} else if (ti->type->message)
@@ -1664,6 +1689,7 @@
{DM_TARGET_MSG_CMD, 0, target_message},
{DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry},
{DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll},
+ {DM_GET_TARGET_VERSION, 0, get_target_version},
};
if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
@@ -2018,3 +2044,110 @@
return r;
}
+
+
+/**
+ * dm_early_create - create a mapped device in early boot.
+ *
+ * @dmi: Contains main information of the device mapping to be created.
+ * @spec_array: array of pointers to struct dm_target_spec. Describes the
+ * mapping table of the device.
+ * @target_params_array: array of strings with the parameters to a specific
+ * target.
+ *
+ * Instead of having the struct dm_target_spec and the parameters for every
+ * target embedded at the end of struct dm_ioctl (as performed in a normal
+ * ioctl), pass them as arguments, so the caller doesn't need to serialize them.
+ * The size of the spec_array and target_params_array is given by
+ * @dmi->target_count.
+ * This function is supposed to be called in early boot, so locking mechanisms
+ * to protect against concurrent loads are not required. Returns 0 on success, otherwise -EINVAL or the error code of the step that failed.
+ */
+int __init dm_early_create(struct dm_ioctl *dmi,
+ struct dm_target_spec **spec_array,
+ char **target_params_array)
+{
+ int r, m = DM_ANY_MINOR;
+ struct dm_table *t, *old_map;
+ struct mapped_device *md;
+ unsigned int i;
+
+ if (!dmi->target_count) /* a table without targets is rejected */
+ return -EINVAL;
+
+ r = check_name(dmi->name);
+ if (r)
+ return r;
+
+ if (dmi->flags & DM_PERSISTENT_DEV_FLAG) /* use the minor encoded in dmi->dev instead of DM_ANY_MINOR */
+ m = MINOR(huge_decode_dev(dmi->dev));
+
+ /* alloc dm device */
+ r = dm_create(m, &md);
+ if (r)
+ return r;
+
+ /* hash insert */
+ r = dm_hash_insert(dmi->name, *dmi->uuid ? dmi->uuid : NULL, md); /* an empty uuid string is passed on as NULL */
+ if (r)
+ goto err_destroy_dm;
+
+ /* alloc table */
+ r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md);
+ if (r)
+ goto err_hash_remove;
+
+ /* add targets */
+ for (i = 0; i < dmi->target_count; i++) { /* spec_array[i] and target_params_array[i] together describe target i */
+ r = dm_table_add_target(t, spec_array[i]->target_type,
+ (sector_t) spec_array[i]->sector_start,
+ (sector_t) spec_array[i]->length,
+ target_params_array[i]);
+ if (r) {
+ DMWARN("error adding target to table");
+ goto err_destroy_table;
+ }
+ }
+
+ /* finish table */
+ r = dm_table_complete(t);
+ if (r)
+ goto err_destroy_table;
+
+ md->type = dm_table_get_type(t);
+ /* setup md->queue to reflect md's type (may block) */
+ r = dm_setup_md_queue(md, t);
+ if (r) {
+ DMWARN("unable to set up device queue for new table.");
+ goto err_destroy_table;
+ }
+
+ /* Set new map */
+ dm_suspend(md, 0); /* NOTE(review): the return value of dm_suspend() is not checked here */
+ old_map = dm_swap_table(md, t); /* old_map is only inspected for an error pointer; it is not used otherwise */
+ if (IS_ERR(old_map)) {
+ r = PTR_ERR(old_map);
+ goto err_destroy_table;
+ }
+ set_disk_ro(dm_disk(md), !!(dmi->flags & DM_READONLY_FLAG)); /* disk is read-only exactly when DM_READONLY_FLAG is set */
+
+ /* resume device */
+ r = dm_resume(md);
+ if (r)
+ goto err_destroy_table; /* NOTE(review): reached after dm_swap_table() succeeded, so t has presumably been installed already — confirm destroying it on this path is intended */
+
+ DMINFO("%s (%s) is ready", md->disk->disk_name, dmi->name);
+ dm_put(md);
+ return 0;
+
+err_destroy_table: /* the labels unwind in reverse order of setup; each falls through to the next */
+ dm_table_destroy(t);
+err_hash_remove:
+ (void) __hash_remove(__get_name_cell(dmi->name));
+ /* release reference from __get_name_cell */
+ dm_put(md);
+err_destroy_dm:
+ dm_put(md);
+ dm_destroy(md);
+ return r;
+}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 2fc4213..1bbe4a3 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -28,10 +28,27 @@
#include "dm-core.h"
-#define SUB_JOB_SIZE 128
#define SPLIT_COUNT 8
#define MIN_JOBS 8
-#define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE))
+
+#define DEFAULT_SUB_JOB_SIZE_KB 512
+#define MAX_SUB_JOB_SIZE_KB 1024
+
+static unsigned kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB;
+
+module_param(kcopyd_subjob_size_kb, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients");
+
+static unsigned dm_get_kcopyd_subjob_size(void) /* returns twice the value obtained for the kcopyd_subjob_size_kb module parameter */
+{
+ unsigned sub_job_size_kb;
+
+ sub_job_size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb, /* NOTE(review): presumably applies the default and maximum passed below — confirm in __dm_get_module_param */
+ DEFAULT_SUB_JOB_SIZE_KB,
+ MAX_SUB_JOB_SIZE_KB);
+
+ return sub_job_size_kb << 1; /* NOTE(review): looks like a KB to 512-byte-sector conversion (x2) — confirm */
+}
/*-----------------------------------------------------------------
* Each kcopyd client has its own little pool of preallocated
@@ -41,6 +58,7 @@
struct page_list *pages;
unsigned nr_reserved_pages;
unsigned nr_free_pages;
+ unsigned sub_job_size;
struct dm_io_client *io_client;
@@ -56,15 +74,17 @@
atomic_t nr_jobs;
/*
- * We maintain three lists of jobs:
+ * We maintain four lists of jobs:
*
* i) jobs waiting for pages
* ii) jobs that have pages, and are waiting for the io to be issued.
- * iii) jobs that have completed.
+ * iii) jobs that don't need to do any IO and just run a callback
+ * iv) jobs that have completed.
*
- * All three of these are protected by job_lock.
+ * All four of these are protected by job_lock.
*/
spinlock_t job_lock;
+ struct list_head callback_jobs;
struct list_head complete_jobs;
struct list_head io_jobs;
struct list_head pages_jobs;
@@ -546,8 +566,10 @@
* no point in continuing.
*/
if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
- job->master_job->write_err)
+ job->master_job->write_err) {
+ job->write_err = job->master_job->write_err;
return -EIO;
+ }
io_job_start(job->kc->throttle);
@@ -599,6 +621,7 @@
else
job->read_err = 1;
push(&kc->complete_jobs, job);
+ wake(kc);
break;
}
@@ -625,6 +648,7 @@
struct dm_kcopyd_client *kc = container_of(work,
struct dm_kcopyd_client, kcopyd_work);
struct blk_plug plug;
+ unsigned long flags;
/*
* The order that these are called is *very* important.
@@ -633,6 +657,10 @@
* list. io jobs call wake when they complete and it all
* starts again.
*/
+ spin_lock_irqsave(&kc->job_lock, flags);
+ list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
+ spin_unlock_irqrestore(&kc->job_lock, flags);
+
blk_start_plug(&plug);
process_jobs(&kc->complete_jobs, kc, run_complete_job);
process_jobs(&kc->pages_jobs, kc, run_pages_job);
@@ -650,7 +678,7 @@
struct dm_kcopyd_client *kc = job->kc;
atomic_inc(&kc->nr_jobs);
if (unlikely(!job->source.count))
- push(&kc->complete_jobs, job);
+ push(&kc->callback_jobs, job);
else if (job->pages == &zero_page_list)
push(&kc->io_jobs, job);
else
@@ -686,8 +714,8 @@
progress = job->progress;
count = job->source.count - progress;
if (count) {
- if (count > SUB_JOB_SIZE)
- count = SUB_JOB_SIZE;
+ if (count > kc->sub_job_size)
+ count = kc->sub_job_size;
job->progress += count;
}
@@ -814,7 +842,7 @@
job->master_job = job;
job->write_offset = 0;
- if (job->source.count <= SUB_JOB_SIZE)
+ if (job->source.count <= kc->sub_job_size)
dispatch_job(job);
else {
job->progress = 0;
@@ -858,7 +886,7 @@
job->read_err = read_err;
job->write_err = write_err;
- push(&kc->complete_jobs, job);
+ push(&kc->callback_jobs, job);
wake(kc);
}
EXPORT_SYMBOL(dm_kcopyd_do_callback);
@@ -881,6 +909,7 @@
struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
{
int r;
+ unsigned reserve_pages;
struct dm_kcopyd_client *kc;
kc = kzalloc(sizeof(*kc), GFP_KERNEL);
@@ -888,6 +917,7 @@
return ERR_PTR(-ENOMEM);
spin_lock_init(&kc->job_lock);
+ INIT_LIST_HEAD(&kc->callback_jobs);
INIT_LIST_HEAD(&kc->complete_jobs);
INIT_LIST_HEAD(&kc->io_jobs);
INIT_LIST_HEAD(&kc->pages_jobs);
@@ -904,9 +934,12 @@
goto bad_workqueue;
}
+ kc->sub_job_size = dm_get_kcopyd_subjob_size();
+ reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE);
+
kc->pages = NULL;
kc->nr_reserved_pages = kc->nr_free_pages = 0;
- r = client_reserve_pages(kc, RESERVE_PAGES);
+ r = client_reserve_pages(kc, reserve_pages);
if (r)
goto bad_client_pages;
@@ -939,6 +972,7 @@
/* Wait for completion of all jobs submitted by this client. */
wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
+ BUG_ON(!list_empty(&kc->callback_jobs));
BUG_ON(!list_empty(&kc->complete_jobs));
BUG_ON(!list_empty(&kc->io_jobs));
BUG_ON(!list_empty(&kc->pages_jobs));
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 2f7c44a..ecefe67 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -45,7 +45,7 @@
}
ret = -EINVAL;
- if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) {
ti->error = "Invalid device sector";
goto bad;
}
@@ -102,19 +102,6 @@
return DM_MAPIO_REMAPPED;
}
-#ifdef CONFIG_BLK_DEV_ZONED
-static int linear_end_io(struct dm_target *ti, struct bio *bio,
- blk_status_t *error)
-{
- struct linear_c *lc = ti->private;
-
- if (!*error && bio_op(bio) == REQ_OP_ZONE_REPORT)
- dm_remap_zone_report(ti, bio, lc->start);
-
- return DM_ENDIO_DONE;
-}
-#endif
-
static void linear_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
@@ -148,6 +135,25 @@
return 0;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static int linear_report_zones(struct dm_target *ti, sector_t sector,
+ struct blk_zone *zones, unsigned int *nr_zones)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+ int ret;
+
+ /* Do report and remap it */
+ ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector),
+ zones, nr_zones);
+ if (ret != 0)
+ return ret;
+
+ if (*nr_zones)
+ dm_remap_zone_report(ti, lc->start, zones, nr_zones);
+ return 0;
+}
+#endif
+
static int linear_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
@@ -211,8 +217,8 @@
.name = "linear",
.version = {1, 4, 0},
#ifdef CONFIG_BLK_DEV_ZONED
- .end_io = linear_end_io,
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
+ .report_zones = linear_report_zones,
#else
.features = DM_TARGET_PASSES_INTEGRITY,
#endif
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 9ea2b02..99721c7 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -40,7 +40,7 @@
*
* Would result in the log looking like this:
*
- * c,a,flush,fuad,b,<other writes>,<next flush>
+ * c,a,b,flush,fuad,<other writes>,<next flush>
*
* This is meant to help expose problems where file systems do not properly wait
* on data being written before invoking a FLUSH. FUA bypasses cache so once it
@@ -60,6 +60,7 @@
#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
+#define WRITE_LOG_SUPER_SECTOR 0
/*
* The disk format for this is braindead simple.
@@ -115,6 +116,7 @@
struct list_head logging_blocks;
wait_queue_head_t wait;
struct task_struct *log_kthread;
+ struct completion super_done;
};
struct pending_block {
@@ -180,6 +182,14 @@
bio_put(bio);
}
+static void log_end_super(struct bio *bio)
+{
+ struct log_writes_c *lc = bio->bi_private;
+
+ complete(&lc->super_done);
+ log_end_io(bio);
+}
+
/*
* Meant to be called if there is an error, it will free all the pages
* associated with the block.
@@ -215,7 +225,8 @@
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, lc->logdev->bdev);
- bio->bi_end_io = log_end_io;
+ bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
+ log_end_super : log_end_io;
bio->bi_private = lc;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -418,11 +429,18 @@
super.nr_entries = cpu_to_le64(lc->logged_entries);
super.sectorsize = cpu_to_le32(lc->sectorsize);
- if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
+ if (write_metadata(lc, &super, sizeof(super), NULL, 0,
+ WRITE_LOG_SUPER_SECTOR)) {
DMERR("Couldn't write super");
return -1;
}
+ /*
+ * Super sector should be written in-order, otherwise the
+ * nr_entries could be rewritten incorrectly by an old bio.
+ */
+ wait_for_completion_io(&lc->super_done);
+
return 0;
}
@@ -531,6 +549,7 @@
INIT_LIST_HEAD(&lc->unflushed_blocks);
INIT_LIST_HEAD(&lc->logging_blocks);
init_waitqueue_head(&lc->wait);
+ init_completion(&lc->super_done);
atomic_set(&lc->io_blocks, 0);
atomic_set(&lc->pending_blocks, 0);
@@ -680,7 +699,7 @@
if (discard_bio)
alloc_size = sizeof(struct pending_block);
else
- alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);
+ alloc_size = struct_size(block, vecs, bio_segments(bio));
block = kzalloc(alloc_size, GFP_NOIO);
if (!block) {
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 419362c..dbcc1e4 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -203,14 +203,7 @@
static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
{
if (m->queue_mode == DM_TYPE_NONE) {
- /*
- * Default to request-based.
- */
- if (dm_use_blk_mq(dm_table_get_md(ti->table)))
- m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
- else
- m->queue_mode = DM_TYPE_REQUEST_BASED;
-
+ m->queue_mode = DM_TYPE_REQUEST_BASED;
} else if (m->queue_mode == DM_TYPE_BIO_BASED) {
INIT_WORK(&m->process_queued_bios, process_queued_bios);
/*
@@ -537,10 +530,7 @@
* get the queue busy feedback (via BLK_STS_RESOURCE),
* otherwise I/O merging can suffer.
*/
- if (q->mq_ops)
- return DM_MAPIO_REQUEUE;
- else
- return DM_MAPIO_DELAY_REQUEUE;
+ return DM_MAPIO_REQUEUE;
}
clone->bio = clone->biotail = NULL;
clone->rq_disk = bdev->bd_disk;
@@ -554,8 +544,23 @@
return DM_MAPIO_REMAPPED;
}
-static void multipath_release_clone(struct request *clone)
+static void multipath_release_clone(struct request *clone,
+ union map_info *map_context)
{
+ if (unlikely(map_context)) {
+ /*
+ * non-NULL map_context means caller is still in the map
+ * method; must undo multipath_clone_and_map()
+ */
+ struct dm_mpath_io *mpio = get_mpio(map_context);
+ struct pgpath *pgpath = mpio->pgpath;
+
+ if (pgpath && pgpath->pg->ps.type->end_io)
+ pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
+ &pgpath->path,
+ mpio->nr_bytes);
+ }
+
blk_put_request(clone);
}
@@ -668,7 +673,7 @@
static void process_queued_io_list(struct multipath *m)
{
- if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
+ if (m->queue_mode == DM_TYPE_REQUEST_BASED)
dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
else if (m->queue_mode == DM_TYPE_BIO_BASED)
queue_work(kmultipathd, &m->process_queued_bios);
@@ -892,6 +897,7 @@
if (attached_handler_name || m->hw_handler_name) {
INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
+ kfree(attached_handler_name);
if (r) {
dm_put_device(ti, p->path.dev);
goto bad;
@@ -906,7 +912,6 @@
return p;
bad:
- kfree(attached_handler_name);
free_pgpath(p);
return ERR_PTR(r);
}
@@ -1089,10 +1094,9 @@
if (!strcasecmp(queue_mode_name, "bio"))
m->queue_mode = DM_TYPE_BIO_BASED;
- else if (!strcasecmp(queue_mode_name, "rq"))
+ else if (!strcasecmp(queue_mode_name, "rq") ||
+ !strcasecmp(queue_mode_name, "mq"))
m->queue_mode = DM_TYPE_REQUEST_BASED;
- else if (!strcasecmp(queue_mode_name, "mq"))
- m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
else {
ti->error = "Unknown 'queue_mode' requested";
r = -EINVAL;
@@ -1222,14 +1226,16 @@
set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
smp_mb__after_atomic();
- flush_workqueue(kmpath_handlerd);
+ if (atomic_read(&m->pg_init_in_progress))
+ flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
smp_mb__after_atomic();
}
- flush_workqueue(kmultipathd);
+ if (m->queue_mode == DM_TYPE_BIO_BASED)
+ flush_work(&m->process_queued_bios);
flush_work(&m->trigger_event);
}
@@ -1726,9 +1732,6 @@
case DM_TYPE_BIO_BASED:
DMEMIT("queue_mode bio ");
break;
- case DM_TYPE_MQ_REQUEST_BASED:
- DMEMIT("queue_mode mq ");
- break;
default:
WARN_ON_ONCE(true);
break;
@@ -1972,7 +1975,7 @@
/* no paths available, for blk-mq: rely on IO mapping to delay requeue */
if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
- return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED);
+ return (m->queue_mode != DM_TYPE_REQUEST_BASED);
/* Guess which priority_group will be used at next mapping time */
pg = READ_ONCE(m->current_pg);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c44925e..b0aa595 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2475,7 +2475,7 @@
}
/* Enable bitmap creation for RAID levels != 0 */
- mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
+ mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096);
mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
@@ -2986,11 +2986,6 @@
}
}
- /*
- * RAID1 and RAID10 personalities require bio splitting,
- * RAID0/4/5/6 don't and process large discard bios properly.
- */
- ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs));
ti->num_discard_bios = 1;
}
@@ -3199,7 +3194,7 @@
*/
r = rs_prepare_reshape(rs);
if (r)
- return r;
+ goto bad;
/* Reshaping ain't recovery, so disable recovery */
rs_setup_recovery(rs, MaxSector);
@@ -3563,7 +3558,7 @@
* v1.5.0+:
*
* Sync action:
- * See Documentation/device-mapper/dm-raid.txt for
+ * See Documentation/admin-guide/device-mapper/dm-raid.rst for
* information on each of these states.
*/
DMEMIT(" %s", sync_action);
@@ -3690,8 +3685,7 @@
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
}
- } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
return -EBUSY;
else if (!strcasecmp(argv[0], "resync"))
; /* MD_RECOVERY_NEEDED set below */
@@ -3744,10 +3738,19 @@
static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct raid_set *rs = ti->private;
- unsigned int chunk_size = to_bytes(rs->md.chunk_sectors);
+ unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors);
- blk_limits_io_min(limits, chunk_size);
- blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
+ blk_limits_io_min(limits, chunk_size_bytes);
+ blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
+
+ /*
+ * RAID1 and RAID10 personalities require bio splitting,
+ * RAID0/4/5/6 don't and process large discard bios properly.
+ */
+ if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
+ limits->discard_granularity = chunk_size_bytes;
+ limits->max_discard_sectors = rs->md.chunk_sectors;
+ }
}
static void raid_postsuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 79eab10..089aed5 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -878,12 +878,9 @@
struct dm_target *ti,
struct dm_dirty_log *dl)
{
- size_t len;
- struct mirror_set *ms = NULL;
+ struct mirror_set *ms =
+ kzalloc(struct_size(ms, mirror, nr_mirrors), GFP_KERNEL);
- len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
-
- ms = kzalloc(len, GFP_KERNEL);
if (!ms) {
ti->error = "Cannot allocate mirror context";
return NULL;
@@ -943,7 +940,8 @@
char dummy;
int ret;
- if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) {
+ if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1 ||
+ offset != (sector_t)offset) {
ti->error = "Invalid offset";
return -EINVAL;
}
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 6e547b8..3f8577e 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -12,6 +12,22 @@
#define DM_MSG_PREFIX "core-rq"
+/*
+ * One of these is allocated per request.
+ */
+struct dm_rq_target_io {
+ struct mapped_device *md;
+ struct dm_target *ti;
+ struct request *orig, *clone;
+ struct kthread_work work;
+ blk_status_t error;
+ union map_info info;
+ struct dm_stats_aux stats_aux;
+ unsigned long duration_jiffies;
+ unsigned n_sectors;
+ unsigned completed;
+};
+
#define DM_MQ_NR_HW_QUEUES 1
#define DM_MQ_QUEUE_DEPTH 2048
static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
@@ -23,19 +39,6 @@
#define RESERVED_REQUEST_BASED_IOS 256
static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
-static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT);
-
-bool dm_use_blk_mq_default(void)
-{
- return use_blk_mq;
-}
-
-bool dm_use_blk_mq(struct mapped_device *md)
-{
- return md->use_blk_mq;
-}
-EXPORT_SYMBOL_GPL(dm_use_blk_mq);
-
unsigned dm_get_reserved_rq_based_ios(void)
{
return __dm_get_module_param(&reserved_rq_based_ios,
@@ -56,44 +59,16 @@
int dm_request_based(struct mapped_device *md)
{
- return queue_is_rq_based(md->queue);
+ return queue_is_mq(md->queue);
}
-static void dm_old_start_queue(struct request_queue *q)
-{
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
- if (blk_queue_stopped(q))
- blk_start_queue(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_mq_start_queue(struct request_queue *q)
+void dm_start_queue(struct request_queue *q)
{
blk_mq_unquiesce_queue(q);
blk_mq_kick_requeue_list(q);
}
-void dm_start_queue(struct request_queue *q)
-{
- if (!q->mq_ops)
- dm_old_start_queue(q);
- else
- dm_mq_start_queue(q);
-}
-
-static void dm_old_stop_queue(struct request_queue *q)
-{
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
- if (!blk_queue_stopped(q))
- blk_stop_queue(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_mq_stop_queue(struct request_queue *q)
+void dm_stop_queue(struct request_queue *q)
{
if (blk_mq_queue_stopped(q))
return;
@@ -101,14 +76,6 @@
blk_mq_quiesce_queue(q);
}
-void dm_stop_queue(struct request_queue *q)
-{
- if (!q->mq_ops)
- dm_old_stop_queue(q);
- else
- dm_mq_stop_queue(q);
-}
-
/*
* Partial completion handling for request-based dm
*/
@@ -148,7 +115,7 @@
/*
* Update the original request.
- * Do not use blk_end_request() here, because it may complete
+ * Do not use blk_mq_end_request() here, because it may complete
* the original request before the clone, and break the ordering.
*/
if (is_last)
@@ -177,30 +144,13 @@
* the md may be freed in dm_put() at the end of this function.
* Or do dm_get() before calling this function and dm_put() later.
*/
-static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
+static void rq_completed(struct mapped_device *md)
{
- struct request_queue *q = md->queue;
- unsigned long flags;
-
- atomic_dec(&md->pending[rw]);
-
/* nudge anyone waiting on suspend queue */
- if (!md_in_flight(md))
+ if (unlikely(wq_has_sleeper(&md->wait)))
wake_up(&md->wait);
/*
- * Run this off this callpath, as drivers could invoke end_io while
- * inside their request_fn (and holding the queue lock). Calling
- * back into ->request_fn() could deadlock attempting to grab the
- * queue lock again.
- */
- if (!q->mq_ops && run_queue) {
- spin_lock_irqsave(q->queue_lock, flags);
- blk_run_queue_async(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
- }
-
- /*
* dm_put() must be at the end of this function. See the comment above
*/
dm_put(md);
@@ -213,34 +163,16 @@
*/
static void dm_end_request(struct request *clone, blk_status_t error)
{
- int rw = rq_data_dir(clone);
struct dm_rq_target_io *tio = clone->end_io_data;
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
blk_rq_unprep_clone(clone);
- tio->ti->type->release_clone_rq(clone);
+ tio->ti->type->release_clone_rq(clone, NULL);
rq_end_stats(md, rq);
- if (!rq->q->mq_ops)
- blk_end_request_all(rq, error);
- else
- blk_mq_end_request(rq, error);
- rq_completed(md, rw, true);
-}
-
-/*
- * Requeue the original request of a clone.
- */
-static void dm_old_requeue_request(struct request *rq, unsigned long delay_ms)
-{
- struct request_queue *q = rq->q;
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
- blk_requeue_request(q, rq);
- blk_delay_queue(q, delay_ms);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ blk_mq_end_request(rq, error);
+ rq_completed(md);
}
static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs)
@@ -264,21 +196,16 @@
{
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
- int rw = rq_data_dir(rq);
unsigned long delay_ms = delay_requeue ? 100 : 0;
rq_end_stats(md, rq);
if (tio->clone) {
blk_rq_unprep_clone(tio->clone);
- tio->ti->type->release_clone_rq(tio->clone);
+ tio->ti->type->release_clone_rq(tio->clone, NULL);
}
- if (!rq->q->mq_ops)
- dm_old_requeue_request(rq, delay_ms);
- else
- dm_mq_delay_requeue_request(rq, delay_ms);
-
- rq_completed(md, rw, false);
+ dm_mq_delay_requeue_request(rq, delay_ms);
+ rq_completed(md);
}
static void dm_done(struct request *clone, blk_status_t error, bool mapped)
@@ -295,11 +222,14 @@
}
if (unlikely(error == BLK_STS_TARGET)) {
- if (req_op(clone) == REQ_OP_WRITE_SAME &&
- !clone->q->limits.max_write_same_sectors)
+ if (req_op(clone) == REQ_OP_DISCARD &&
+ !clone->q->limits.max_discard_sectors)
+ disable_discard(tio->md);
+ else if (req_op(clone) == REQ_OP_WRITE_SAME &&
+ !clone->q->limits.max_write_same_sectors)
disable_write_same(tio->md);
- if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
- !clone->q->limits.max_write_zeroes_sectors)
+ else if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
+ !clone->q->limits.max_write_zeroes_sectors)
disable_write_zeroes(tio->md);
}
@@ -333,18 +263,13 @@
bool mapped = true;
struct dm_rq_target_io *tio = tio_from_request(rq);
struct request *clone = tio->clone;
- int rw;
if (!clone) {
struct mapped_device *md = tio->md;
rq_end_stats(md, rq);
- rw = rq_data_dir(rq);
- if (!rq->q->mq_ops)
- blk_end_request_all(rq, tio->error);
- else
- blk_mq_end_request(rq, tio->error);
- rq_completed(md, rw, false);
+ blk_mq_end_request(rq, tio->error);
+ rq_completed(md);
return;
}
@@ -363,17 +288,14 @@
struct dm_rq_target_io *tio = tio_from_request(rq);
tio->error = error;
- if (!rq->q->mq_ops)
- blk_complete_request(rq);
- else
- blk_mq_complete_request(rq);
+ blk_mq_complete_request(rq);
}
/*
* Complete the not-mapped clone and the original request with the error status
* through softirq context.
* Target's rq_end_io() function isn't called.
- * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
+ * This may be used when the target's clone_and_map_rq() function fails.
*/
static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
{
@@ -381,21 +303,10 @@
dm_complete_request(rq, error);
}
-/*
- * Called with the clone's queue lock held (in the case of .request_fn)
- */
static void end_clone_request(struct request *clone, blk_status_t error)
{
struct dm_rq_target_io *tio = clone->end_io_data;
- /*
- * Actual request completion is done in a softirq context which doesn't
- * hold the clone's queue lock. Otherwise, deadlock could occur because:
- * - another request may be submitted by the upper level driver
- * of the stacking during the completion
- * - the submission which requires queue lock may be done
- * against this clone's queue
- */
dm_complete_request(tio->orig, error);
}
@@ -446,8 +357,6 @@
return 0;
}
-static void map_tio_request(struct kthread_work *work);
-
static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
struct mapped_device *md)
{
@@ -464,8 +373,6 @@
*/
if (!md->init_tio_pdu)
memset(&tio->info, 0, sizeof(tio->info));
- if (md->kworker_task)
- kthread_init_work(&tio->work, map_tio_request);
}
/*
@@ -484,7 +391,6 @@
blk_status_t ret;
r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
-check_again:
switch (r) {
case DM_MAPIO_SUBMITTED:
/* The target has taken the I/O to submit by itself later */
@@ -492,7 +398,7 @@
case DM_MAPIO_REMAPPED:
if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
/* -ENOMEM */
- ti->type->release_clone_rq(clone);
+ ti->type->release_clone_rq(clone, &tio->info);
return DM_MAPIO_REQUEUE;
}
@@ -502,13 +408,10 @@
ret = dm_dispatch_clone_request(clone, rq);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
blk_rq_unprep_clone(clone);
- tio->ti->type->release_clone_rq(clone);
+ blk_mq_cleanup_rq(clone);
+ tio->ti->type->release_clone_rq(clone, &tio->info);
tio->clone = NULL;
- if (!rq->q->mq_ops)
- r = DM_MAPIO_DELAY_REQUEUE;
- else
- r = DM_MAPIO_REQUEUE;
- goto check_again;
+ return DM_MAPIO_REQUEUE;
}
break;
case DM_MAPIO_REQUEUE:
@@ -530,19 +433,21 @@
return r;
}
+/* DEPRECATED: previously used for request-based merge heuristic in dm_request_fn() */
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+ return sprintf(buf, "%u\n", 0);
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+ const char *buf, size_t count)
+{
+ return count;
+}
+
static void dm_start_request(struct mapped_device *md, struct request *orig)
{
- if (!orig->q->mq_ops)
- blk_start_request(orig);
- else
- blk_mq_start_request(orig);
- atomic_inc(&md->pending[rq_data_dir(orig)]);
-
- if (md->seq_rq_merge_deadline_usecs) {
- md->last_rq_pos = rq_end_sector(orig);
- md->last_rq_rw = rq_data_dir(orig);
- md->last_rq_start_time = ktime_get();
- }
+ blk_mq_start_request(orig);
if (unlikely(dm_stats_used(&md->stats))) {
struct dm_rq_target_io *tio = tio_from_request(orig);
@@ -563,8 +468,10 @@
dm_get(md);
}
-static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq)
+static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+ unsigned int hctx_idx, unsigned int numa_node)
{
+ struct mapped_device *md = set->driver_data;
struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
/*
@@ -581,163 +488,6 @@
return 0;
}
-static int dm_rq_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp)
-{
- return __dm_rq_init_rq(q->rq_alloc_data, rq);
-}
-
-static void map_tio_request(struct kthread_work *work)
-{
- struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
-
- if (map_request(tio) == DM_MAPIO_REQUEUE)
- dm_requeue_original_request(tio, false);
-}
-
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
-{
- return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
-}
-
-#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
-
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
- const char *buf, size_t count)
-{
- unsigned deadline;
-
- if (dm_get_md_type(md) != DM_TYPE_REQUEST_BASED)
- return count;
-
- if (kstrtouint(buf, 10, &deadline))
- return -EINVAL;
-
- if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
- deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
-
- md->seq_rq_merge_deadline_usecs = deadline;
-
- return count;
-}
-
-static bool dm_old_request_peeked_before_merge_deadline(struct mapped_device *md)
-{
- ktime_t kt_deadline;
-
- if (!md->seq_rq_merge_deadline_usecs)
- return false;
-
- kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
- kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
-
- return !ktime_after(ktime_get(), kt_deadline);
-}
-
-/*
- * q->request_fn for old request-based dm.
- * Called with the queue lock held.
- */
-static void dm_old_request_fn(struct request_queue *q)
-{
- struct mapped_device *md = q->queuedata;
- struct dm_target *ti = md->immutable_target;
- struct request *rq;
- struct dm_rq_target_io *tio;
- sector_t pos = 0;
-
- if (unlikely(!ti)) {
- int srcu_idx;
- struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-
- if (unlikely(!map)) {
- dm_put_live_table(md, srcu_idx);
- return;
- }
- ti = dm_table_find_target(map, pos);
- dm_put_live_table(md, srcu_idx);
- }
-
- /*
- * For suspend, check blk_queue_stopped() and increment
- * ->pending within a single queue_lock not to increment the
- * number of in-flight I/Os after the queue is stopped in
- * dm_suspend().
- */
- while (!blk_queue_stopped(q)) {
- rq = blk_peek_request(q);
- if (!rq)
- return;
-
- /* always use block 0 to find the target for flushes for now */
- pos = 0;
- if (req_op(rq) != REQ_OP_FLUSH)
- pos = blk_rq_pos(rq);
-
- if ((dm_old_request_peeked_before_merge_deadline(md) &&
- md_in_flight(md) && rq->bio && !bio_multiple_segments(rq->bio) &&
- md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
- (ti->type->busy && ti->type->busy(ti))) {
- blk_delay_queue(q, 10);
- return;
- }
-
- dm_start_request(md, rq);
-
- tio = tio_from_request(rq);
- init_tio(tio, rq, md);
- /* Establish tio->ti before queuing work (map_tio_request) */
- tio->ti = ti;
- kthread_queue_work(&md->kworker, &tio->work);
- BUG_ON(!irqs_disabled());
- }
-}
-
-/*
- * Fully initialize a .request_fn request-based queue.
- */
-int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
-{
- struct dm_target *immutable_tgt;
-
- /* Fully initialize the queue */
- md->queue->cmd_size = sizeof(struct dm_rq_target_io);
- md->queue->rq_alloc_data = md;
- md->queue->request_fn = dm_old_request_fn;
- md->queue->init_rq_fn = dm_rq_init_rq;
-
- immutable_tgt = dm_table_get_immutable_target(t);
- if (immutable_tgt && immutable_tgt->per_io_data_size) {
- /* any target-specific per-io data is immediately after the tio */
- md->queue->cmd_size += immutable_tgt->per_io_data_size;
- md->init_tio_pdu = true;
- }
- if (blk_init_allocated_queue(md->queue) < 0)
- return -EINVAL;
-
- /* disable dm_old_request_fn's merge heuristic by default */
- md->seq_rq_merge_deadline_usecs = 0;
-
- blk_queue_softirq_done(md->queue, dm_softirq_done);
-
- /* Initialize the request-based DM worker thread */
- kthread_init_worker(&md->kworker);
- md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
- "kdmwork-%s", dm_device_name(md));
- if (IS_ERR(md->kworker_task)) {
- int error = PTR_ERR(md->kworker_task);
- md->kworker_task = NULL;
- return error;
- }
-
- return 0;
-}
-
-static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
- unsigned int hctx_idx, unsigned int numa_node)
-{
- return __dm_rq_init_rq(set->driver_data, rq);
-}
-
static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -771,7 +521,7 @@
if (map_request(tio) == DM_MAPIO_REQUEUE) {
/* Undo dm_start_request() before requeuing */
rq_end_stats(md, rq);
- rq_completed(md, rq_data_dir(rq), false);
+ rq_completed(md);
return BLK_STS_RESOURCE;
}
@@ -790,11 +540,6 @@
struct dm_target *immutable_tgt;
int err;
- if (!dm_table_all_blk_mq_devices(t)) {
- DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
- return -EINVAL;
- }
-
md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
if (!md->tag_set)
return -ENOMEM;
@@ -802,7 +547,7 @@
md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id;
- md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md;
@@ -818,7 +563,7 @@
if (err)
goto out_kfree_tag_set;
- q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
+ q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true);
if (IS_ERR(q)) {
err = PTR_ERR(q);
goto out_tag_set;
@@ -845,6 +590,8 @@
module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+/* Unused, but preserved for userspace compatibility */
+static bool use_blk_mq = true;
module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index f43c454..1eea0da 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -17,22 +17,6 @@
struct mapped_device;
/*
- * One of these is allocated per request.
- */
-struct dm_rq_target_io {
- struct mapped_device *md;
- struct dm_target *ti;
- struct request *orig, *clone;
- struct kthread_work work;
- blk_status_t error;
- union map_info info;
- struct dm_stats_aux stats_aux;
- unsigned long duration_jiffies;
- unsigned n_sectors;
- unsigned completed;
-};
-
-/*
* For request-based dm - the bio clones we allocate are embedded in these
* structs.
*
@@ -46,10 +30,6 @@
struct bio clone;
};
-bool dm_use_blk_mq_default(void);
-bool dm_use_blk_mq(struct mapped_device *md);
-
-int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t);
int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t);
void dm_mq_cleanup_mapped_device(struct mapped_device *md);
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index ae4b33d..4fb1a40 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1,6 +1,4 @@
/*
- * dm-snapshot.c
- *
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
*
* This file is released under the GPL.
@@ -13,6 +11,7 @@
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/list.h>
+#include <linux/list_bl.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -43,11 +42,11 @@
struct dm_exception_table {
uint32_t hash_mask;
unsigned hash_shift;
- struct list_head *table;
+ struct hlist_bl_head *table;
};
struct dm_snapshot {
- struct mutex lock;
+ struct rw_semaphore lock;
struct dm_dev *origin;
struct dm_dev *cow;
@@ -75,7 +74,9 @@
atomic_t pending_exceptions_count;
- /* Protected by "lock" */
+ spinlock_t pe_allocation_lock;
+
+ /* Protected by "pe_allocation_lock" */
sector_t exception_start_sequence;
/* Protected by kcopyd single-threaded callback */
@@ -105,6 +106,9 @@
/* The on disk metadata handler */
struct dm_exception_store *store;
+ unsigned in_progress;
+ struct wait_queue_head in_progress_wait;
+
struct dm_kcopyd_client *kcopyd_client;
/* Wait for events based on state_bits */
@@ -127,7 +131,10 @@
* - I/O error while merging
* => stop merging; set merge_failed; process I/O normally.
*/
- int merge_failed;
+ bool merge_failed:1;
+
+ bool discard_zeroes_cow:1;
+ bool discard_passdown_origin:1;
/*
* Incoming bios that overlap with chunks being merged must wait
@@ -145,6 +152,19 @@
#define RUNNING_MERGE 0
#define SHUTDOWN_MERGE 1
+/*
+ * Maximum number of chunks being copied on write.
+ *
+ * The value was decided experimentally as a trade-off between memory
+ * consumption, stalling the kernel's workqueues and maintaining a high enough
+ * throughput.
+ */
+#define DEFAULT_COW_THRESHOLD 2048
+
+static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
+module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
+MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
+
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
"A percentage of time allocated for copy on write");
@@ -440,9 +460,9 @@
if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
continue;
- mutex_lock(&s->lock);
+ down_read(&s->lock);
active = s->active;
- mutex_unlock(&s->lock);
+ up_read(&s->lock);
if (active) {
if (snap_src)
@@ -601,6 +621,36 @@
* The lowest hash_shift bits of the chunk number are ignored, allowing
* some consecutive chunks to be grouped together.
*/
+static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
+
+/* Lock to protect access to the completed and pending exception hash tables. */
+struct dm_exception_table_lock {
+ struct hlist_bl_head *complete_slot;
+ struct hlist_bl_head *pending_slot;
+};
+
+static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
+ struct dm_exception_table_lock *lock)
+{
+ struct dm_exception_table *complete = &s->complete;
+ struct dm_exception_table *pending = &s->pending;
+
+ lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
+ lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
+}
+
+static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
+{
+ hlist_bl_lock(lock->complete_slot);
+ hlist_bl_lock(lock->pending_slot);
+}
+
+static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
+{
+ hlist_bl_unlock(lock->pending_slot);
+ hlist_bl_unlock(lock->complete_slot);
+}
+
static int dm_exception_table_init(struct dm_exception_table *et,
uint32_t size, unsigned hash_shift)
{
@@ -608,12 +658,12 @@
et->hash_shift = hash_shift;
et->hash_mask = size - 1;
- et->table = dm_vcalloc(size, sizeof(struct list_head));
+ et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
if (!et->table)
return -ENOMEM;
for (i = 0; i < size; i++)
- INIT_LIST_HEAD(et->table + i);
+ INIT_HLIST_BL_HEAD(et->table + i);
return 0;
}
@@ -621,15 +671,16 @@
static void dm_exception_table_exit(struct dm_exception_table *et,
struct kmem_cache *mem)
{
- struct list_head *slot;
- struct dm_exception *ex, *next;
+ struct hlist_bl_head *slot;
+ struct dm_exception *ex;
+ struct hlist_bl_node *pos, *n;
int i, size;
size = et->hash_mask + 1;
for (i = 0; i < size; i++) {
slot = et->table + i;
- list_for_each_entry_safe (ex, next, slot, hash_list)
+ hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
kmem_cache_free(mem, ex);
}
@@ -643,7 +694,7 @@
static void dm_remove_exception(struct dm_exception *e)
{
- list_del(&e->hash_list);
+ hlist_bl_del(&e->hash_list);
}
/*
@@ -653,11 +704,12 @@
static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
chunk_t chunk)
{
- struct list_head *slot;
+ struct hlist_bl_head *slot;
+ struct hlist_bl_node *pos;
struct dm_exception *e;
slot = &et->table[exception_hash(et, chunk)];
- list_for_each_entry (e, slot, hash_list)
+ hlist_bl_for_each_entry(e, pos, slot, hash_list)
if (chunk >= e->old_chunk &&
chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
return e;
@@ -704,7 +756,8 @@
static void dm_insert_exception(struct dm_exception_table *eh,
struct dm_exception *new_e)
{
- struct list_head *l;
+ struct hlist_bl_head *l;
+ struct hlist_bl_node *pos;
struct dm_exception *e = NULL;
l = &eh->table[exception_hash(eh, new_e->old_chunk)];
@@ -714,7 +767,7 @@
goto out;
/* List is ordered by old_chunk */
- list_for_each_entry_reverse(e, l, hash_list) {
+ hlist_bl_for_each_entry(e, pos, l, hash_list) {
/* Insert after an existing chunk? */
if (new_e->old_chunk == (e->old_chunk +
dm_consecutive_chunk_count(e) + 1) &&
@@ -735,12 +788,24 @@
return;
}
- if (new_e->old_chunk > e->old_chunk)
+ if (new_e->old_chunk < e->old_chunk)
break;
}
out:
- list_add(&new_e->hash_list, e ? &e->hash_list : l);
+ if (!e) {
+ /*
+ * Either the table doesn't support consecutive chunks or slot
+ * l is empty.
+ */
+ hlist_bl_add_head(&new_e->hash_list, l);
+ } else if (new_e->old_chunk < e->old_chunk) {
+ /* Add before an existing exception */
+ hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
+ } else {
+ /* Add to l's tail: e is the last exception in this slot */
+ hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
+ }
}
/*
@@ -749,6 +814,7 @@
*/
static int dm_add_exception(void *context, chunk_t old, chunk_t new)
{
+ struct dm_exception_table_lock lock;
struct dm_snapshot *s = context;
struct dm_exception *e;
@@ -761,7 +827,17 @@
/* Consecutive_count is implicitly initialised to zero */
e->new_chunk = new;
+ /*
+ * Although there is no need to lock access to the exception tables
+ * here, if we don't then hlist_bl_add_head(), called by
+ * dm_insert_exception(), will complain about accessing the
+ * corresponding list without locking it first.
+ */
+ dm_exception_table_lock_init(s, old, &lock);
+
+ dm_exception_table_lock(&lock);
dm_insert_exception(&s->complete, e);
+ dm_exception_table_unlock(&lock);
return 0;
}
@@ -790,7 +866,7 @@
{
/* use a fixed size of 2MB */
unsigned long mem = 2 * 1024 * 1024;
- mem /= sizeof(struct list_head);
+ mem /= sizeof(struct hlist_bl_head);
return mem;
}
@@ -910,7 +986,7 @@
int r;
chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
- mutex_lock(&s->lock);
+ down_write(&s->lock);
/*
* Process chunks (and associated exceptions) in reverse order
@@ -925,7 +1001,7 @@
b = __release_queued_bios_after_merge(s);
out:
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
if (b)
flush_bios(b);
@@ -984,9 +1060,9 @@
if (linear_chunks < 0) {
DMERR("Read error in exception store: "
"shutting down merge");
- mutex_lock(&s->lock);
+ down_write(&s->lock);
s->merge_failed = 1;
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
}
goto shut;
}
@@ -1027,10 +1103,10 @@
previous_count = read_pending_exceptions_done_count();
}
- mutex_lock(&s->lock);
+ down_write(&s->lock);
s->first_merging_chunk = old_chunk;
s->num_merging_chunks = linear_chunks;
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
/* Wait until writes to all 'linear_chunks' drain */
for (i = 0; i < linear_chunks; i++)
@@ -1072,10 +1148,10 @@
return;
shut:
- mutex_lock(&s->lock);
+ down_write(&s->lock);
s->merge_failed = 1;
b = __release_queued_bios_after_merge(s);
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
error_bios(b);
merge_shutdown(s);
@@ -1097,12 +1173,64 @@
clear_bit(SHUTDOWN_MERGE, &s->state_bits);
}
+static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s,
+ struct dm_target *ti)
+{
+ int r;
+ unsigned argc;
+ const char *arg_name;
+
+ static const struct dm_arg _args[] = {
+ {0, 2, "Invalid number of feature arguments"},
+ };
+
+ /*
+ * No feature arguments supplied.
+ */
+ if (!as->argc)
+ return 0;
+
+ r = dm_read_arg_group(_args, as, &argc, &ti->error);
+ if (r)
+ return -EINVAL;
+
+ while (argc && !r) {
+ arg_name = dm_shift_arg(as);
+ argc--;
+
+ if (!strcasecmp(arg_name, "discard_zeroes_cow"))
+ s->discard_zeroes_cow = true;
+
+ else if (!strcasecmp(arg_name, "discard_passdown_origin"))
+ s->discard_passdown_origin = true;
+
+ else {
+ ti->error = "Unrecognised feature requested";
+ r = -EINVAL;
+ break;
+ }
+ }
+
+ if (!s->discard_zeroes_cow && s->discard_passdown_origin) {
+ /*
+ * TODO: really these are disjoint.. but ti->num_discard_bios
+ * and dm_bio_get_target_bio_nr() require rigid constraints.
+ */
+ ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow";
+ r = -EINVAL;
+ }
+
+ return r;
+}
+
/*
- * Construct a snapshot mapping: <origin_dev> <COW-dev> <p|po|n> <chunk-size>
+ * Construct a snapshot mapping:
+ * <origin_dev> <COW-dev> <p|po|n> <chunk-size> [<# feature args> [<arg>]*]
*/
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct dm_snapshot *s;
+ struct dm_arg_set as;
int i;
int r = -EINVAL;
char *origin_path, *cow_path;
@@ -1110,8 +1238,8 @@
unsigned args_used, num_flush_bios = 1;
fmode_t origin_mode = FMODE_READ;
- if (argc != 4) {
- ti->error = "requires exactly 4 arguments";
+ if (argc < 4) {
+ ti->error = "requires 4 or more arguments";
r = -EINVAL;
goto bad;
}
@@ -1128,6 +1256,13 @@
goto bad;
}
+ as.argc = argc;
+ as.argv = argv;
+ dm_consume_args(&as, 4);
+ r = parse_snapshot_features(&as, s, ti);
+ if (r)
+ goto bad_features;
+
origin_path = argv[0];
argv++;
argc--;
@@ -1171,10 +1306,11 @@
s->snapshot_overflowed = 0;
s->active = 0;
atomic_set(&s->pending_exceptions_count, 0);
+ spin_lock_init(&s->pe_allocation_lock);
s->exception_start_sequence = 0;
s->exception_complete_sequence = 0;
s->out_of_order_tree = RB_ROOT;
- mutex_init(&s->lock);
+ init_rwsem(&s->lock);
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
s->state_bits = 0;
@@ -1190,6 +1326,8 @@
goto bad_hash_tables;
}
+ init_waitqueue_head(&s->in_progress_wait);
+
s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(s->kcopyd_client)) {
r = PTR_ERR(s->kcopyd_client);
@@ -1210,6 +1348,8 @@
ti->private = s;
ti->num_flush_bios = num_flush_bios;
+ if (s->discard_zeroes_cow)
+ ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1);
ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
/* Add snapshot to the list of snapshots for this origin */
@@ -1257,29 +1397,22 @@
bad_read_metadata:
unregister_snapshot(s);
-
bad_load_and_register:
mempool_exit(&s->pending_pool);
-
bad_pending_pool:
dm_kcopyd_client_destroy(s->kcopyd_client);
-
bad_kcopyd:
dm_exception_table_exit(&s->pending, pending_cache);
dm_exception_table_exit(&s->complete, exception_cache);
-
bad_hash_tables:
dm_exception_store_destroy(s->store);
-
bad_store:
dm_put_device(ti, s->cow);
-
bad_cow:
dm_put_device(ti, s->origin);
-
bad_origin:
+bad_features:
kfree(s);
-
bad:
return r;
}
@@ -1338,9 +1471,9 @@
/* Check whether exception handover must be cancelled */
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest && (s == snap_src)) {
- mutex_lock(&snap_dest->lock);
+ down_write(&snap_dest->lock);
snap_dest->valid = 0;
- mutex_unlock(&snap_dest->lock);
+ up_write(&snap_dest->lock);
DMERR("Cancelling snapshot handover.");
}
up_read(&_origins_lock);
@@ -1371,15 +1504,60 @@
dm_exception_store_destroy(s->store);
- mutex_destroy(&s->lock);
-
dm_put_device(ti, s->cow);
dm_put_device(ti, s->origin);
+ WARN_ON(s->in_progress);
+
kfree(s);
}
+static void account_start_copy(struct dm_snapshot *s)
+{
+ spin_lock(&s->in_progress_wait.lock);
+ s->in_progress++;
+ spin_unlock(&s->in_progress_wait.lock);
+}
+
+static void account_end_copy(struct dm_snapshot *s)
+{
+ spin_lock(&s->in_progress_wait.lock);
+ BUG_ON(!s->in_progress);
+ s->in_progress--;
+ if (likely(s->in_progress <= cow_threshold) &&
+ unlikely(waitqueue_active(&s->in_progress_wait)))
+ wake_up_locked(&s->in_progress_wait);
+ spin_unlock(&s->in_progress_wait.lock);
+}
+
+static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
+{
+ if (unlikely(s->in_progress > cow_threshold)) {
+ spin_lock(&s->in_progress_wait.lock);
+ if (likely(s->in_progress > cow_threshold)) {
+ /*
+ * NOTE: this throttle doesn't account for whether
+ * the caller is servicing an IO that will trigger a COW
+ * so excess throttling may result for chunks not required
+ * to be COW'd. But if cow_threshold was reached, extra
+ * throttling is unlikely to negatively impact performance.
+ */
+ DECLARE_WAITQUEUE(wait, current);
+ __add_wait_queue(&s->in_progress_wait, &wait);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&s->in_progress_wait.lock);
+ if (unlock_origins)
+ up_read(&_origins_lock);
+ io_schedule();
+ remove_wait_queue(&s->in_progress_wait, &wait);
+ return false;
+ }
+ spin_unlock(&s->in_progress_wait.lock);
+ }
+ return true;
+}
+
/*
* Flush a list of buffers.
*/
@@ -1395,7 +1573,7 @@
}
}
-static int do_origin(struct dm_dev *origin, struct bio *bio);
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
/*
* Flush a list of buffers.
@@ -1408,7 +1586,7 @@
while (bio) {
n = bio->bi_next;
bio->bi_next = NULL;
- r = do_origin(s->origin, bio);
+ r = do_origin(s->origin, bio, false);
if (r == DM_MAPIO_REMAPPED)
generic_make_request(bio);
bio = n;
@@ -1448,6 +1626,13 @@
dm_table_event(s->ti->table);
}
+static void invalidate_snapshot(struct dm_snapshot *s, int err)
+{
+ down_write(&s->lock);
+ __invalidate_snapshot(s, err);
+ up_write(&s->lock);
+}
+
static void pending_complete(void *context, int success)
{
struct dm_snap_pending_exception *pe = context;
@@ -1456,43 +1641,63 @@
struct bio *origin_bios = NULL;
struct bio *snapshot_bios = NULL;
struct bio *full_bio = NULL;
+ struct dm_exception_table_lock lock;
int error = 0;
+ dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
+
if (!success) {
/* Read/write error - snapshot is unusable */
- mutex_lock(&s->lock);
- __invalidate_snapshot(s, -EIO);
+ invalidate_snapshot(s, -EIO);
error = 1;
+
+ dm_exception_table_lock(&lock);
goto out;
}
e = alloc_completed_exception(GFP_NOIO);
if (!e) {
- mutex_lock(&s->lock);
- __invalidate_snapshot(s, -ENOMEM);
+ invalidate_snapshot(s, -ENOMEM);
error = 1;
+
+ dm_exception_table_lock(&lock);
goto out;
}
*e = pe->e;
- mutex_lock(&s->lock);
+ down_read(&s->lock);
+ dm_exception_table_lock(&lock);
if (!s->valid) {
+ up_read(&s->lock);
free_completed_exception(e);
error = 1;
+
goto out;
}
- /* Check for conflicting reads */
- __check_for_conflicting_io(s, pe->e.old_chunk);
-
/*
- * Add a proper exception, and remove the
- * in-flight exception from the list.
+ * Add a proper exception. After inserting the completed exception all
+ * subsequent snapshot reads to this chunk will be redirected to the
+ * COW device. This ensures that we do not starve. Moreover, as long
+ * as the pending exception exists, neither origin writes nor snapshot
+ * merging can overwrite the chunk in origin.
*/
dm_insert_exception(&s->complete, e);
+ up_read(&s->lock);
+
+ /* Wait for conflicting reads to drain */
+ if (__chunk_is_tracked(s, pe->e.old_chunk)) {
+ dm_exception_table_unlock(&lock);
+ __check_for_conflicting_io(s, pe->e.old_chunk);
+ dm_exception_table_lock(&lock);
+ }
out:
+ /* Remove the in-flight exception from the list */
dm_remove_exception(&pe->e);
+
+ dm_exception_table_unlock(&lock);
+
snapshot_bios = bio_list_get(&pe->snapshot_bios);
origin_bios = bio_list_get(&pe->origin_bios);
full_bio = pe->full_bio;
@@ -1500,8 +1705,6 @@
full_bio->bi_end_io = pe->full_bio_end_io;
increment_pending_exceptions_done_count();
- mutex_unlock(&s->lock);
-
/* Submit any pending write bios */
if (error) {
if (full_bio)
@@ -1575,6 +1778,7 @@
rb_link_node(&pe->out_of_order_node, parent, p);
rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
}
+ account_end_copy(s);
}
/*
@@ -1598,6 +1802,7 @@
dest.count = src.count;
/* Hand over to kcopyd */
+ account_start_copy(s);
dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
}
@@ -1617,6 +1822,7 @@
pe->full_bio = bio;
pe->full_bio_end_io = bio->bi_end_io;
+ account_start_copy(s);
callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
copy_callback, pe);
@@ -1638,12 +1844,43 @@
}
/*
+ * Inserts a pending exception into the pending table.
+ *
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
+ */
+static struct dm_snap_pending_exception *
+__insert_pending_exception(struct dm_snapshot *s,
+ struct dm_snap_pending_exception *pe, chunk_t chunk)
+{
+ pe->e.old_chunk = chunk;
+ bio_list_init(&pe->origin_bios);
+ bio_list_init(&pe->snapshot_bios);
+ pe->started = 0;
+ pe->full_bio = NULL;
+
+ spin_lock(&s->pe_allocation_lock);
+ if (s->store->type->prepare_exception(s->store, &pe->e)) {
+ spin_unlock(&s->pe_allocation_lock);
+ free_pending_exception(pe);
+ return NULL;
+ }
+
+ pe->exception_sequence = s->exception_start_sequence++;
+ spin_unlock(&s->pe_allocation_lock);
+
+ dm_insert_exception(&s->pending, &pe->e);
+
+ return pe;
+}
+
+/*
* Looks to see if this snapshot already has a pending exception
* for this chunk, otherwise it allocates a new one and inserts
* it into the pending table.
*
- * NOTE: a write lock must be held on snap->lock before calling
- * this.
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
*/
static struct dm_snap_pending_exception *
__find_pending_exception(struct dm_snapshot *s,
@@ -1657,22 +1894,7 @@
return pe2;
}
- pe->e.old_chunk = chunk;
- bio_list_init(&pe->origin_bios);
- bio_list_init(&pe->snapshot_bios);
- pe->started = 0;
- pe->full_bio = NULL;
-
- if (s->store->type->prepare_exception(s->store, &pe->e)) {
- free_pending_exception(pe);
- return NULL;
- }
-
- pe->exception_sequence = s->exception_start_sequence++;
-
- dm_insert_exception(&s->pending, &pe->e);
-
- return pe;
+ return __insert_pending_exception(s, pe, chunk);
}
static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
@@ -1685,6 +1907,37 @@
(bio->bi_iter.bi_sector & s->store->chunk_mask);
}
+static void zero_callback(int read_err, unsigned long write_err, void *context)
+{
+ struct bio *bio = context;
+ struct dm_snapshot *s = bio->bi_private;
+
+ account_end_copy(s);
+ bio->bi_status = write_err ? BLK_STS_IOERR : 0;
+ bio_endio(bio);
+}
+
+static void zero_exception(struct dm_snapshot *s, struct dm_exception *e,
+ struct bio *bio, chunk_t chunk)
+{
+ struct dm_io_region dest;
+
+ dest.bdev = s->cow->bdev;
+ dest.sector = bio->bi_iter.bi_sector;
+ dest.count = s->store->chunk_size;
+
+ account_start_copy(s);
+ WARN_ON_ONCE(bio->bi_private);
+ bio->bi_private = s;
+ dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
+}
+
+static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio)
+{
+ return bio->bi_iter.bi_size ==
+ (s->store->chunk_size << SECTOR_SHIFT);
+}
+
static int snapshot_map(struct dm_target *ti, struct bio *bio)
{
struct dm_exception *e;
@@ -1692,6 +1945,7 @@
int r = DM_MAPIO_REMAPPED;
chunk_t chunk;
struct dm_snap_pending_exception *pe = NULL;
+ struct dm_exception_table_lock lock;
init_tracked_chunk(bio);
@@ -1701,13 +1955,20 @@
}
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
+ dm_exception_table_lock_init(s, chunk, &lock);
/* Full snapshots are not usable */
/* To get here the table must be live so s->active is always set. */
if (!s->valid)
return DM_MAPIO_KILL;
- mutex_lock(&s->lock);
+ if (bio_data_dir(bio) == WRITE) {
+ while (unlikely(!wait_for_in_progress(s, false)))
+ ; /* wait_for_in_progress() has slept */
+ }
+
+ down_read(&s->lock);
+ dm_exception_table_lock(&lock);
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
bio_data_dir(bio) == WRITE)) {
@@ -1715,10 +1976,43 @@
goto out_unlock;
}
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) {
+ /*
+ * passdown discard to origin (without triggering
+ * snapshot exceptions via do_origin; doing so would
+ * defeat the goal of freeing space in origin that is
+ * implied by the "discard_passdown_origin" feature)
+ */
+ bio_set_dev(bio, s->origin->bdev);
+ track_chunk(s, bio, chunk);
+ goto out_unlock;
+ }
+ /* discard to snapshot (target_bio_nr == 0) zeroes exceptions */
+ }
+
/* If the block is already remapped - use that, else remap it */
e = dm_lookup_exception(&s->complete, chunk);
if (e) {
remap_exception(s, e, bio, chunk);
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
+ io_overlaps_chunk(s, bio)) {
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
+ zero_exception(s, e, bio, chunk);
+ r = DM_MAPIO_SUBMITTED; /* discard is not issued */
+ goto out;
+ }
+ goto out_unlock;
+ }
+
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ /*
+ * If no exception exists, complete discard immediately
+ * otherwise it'll trigger copy-out.
+ */
+ bio_endio(bio);
+ r = DM_MAPIO_SUBMITTED;
goto out_unlock;
}
@@ -1730,15 +2024,9 @@
if (bio_data_dir(bio) == WRITE) {
pe = __lookup_pending_exception(s, chunk);
if (!pe) {
- mutex_unlock(&s->lock);
+ dm_exception_table_unlock(&lock);
pe = alloc_pending_exception(s);
- mutex_lock(&s->lock);
-
- if (!s->valid || s->snapshot_overflowed) {
- free_pending_exception(pe);
- r = DM_MAPIO_KILL;
- goto out_unlock;
- }
+ dm_exception_table_lock(&lock);
e = dm_lookup_exception(&s->complete, chunk);
if (e) {
@@ -1749,13 +2037,22 @@
pe = __find_pending_exception(s, pe, chunk);
if (!pe) {
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
+
+ down_write(&s->lock);
+
if (s->store->userspace_supports_overflow) {
- s->snapshot_overflowed = 1;
- DMERR("Snapshot overflowed: Unable to allocate exception.");
+ if (s->valid && !s->snapshot_overflowed) {
+ s->snapshot_overflowed = 1;
+ DMERR("Snapshot overflowed: Unable to allocate exception.");
+ }
} else
__invalidate_snapshot(s, -ENOMEM);
+ up_write(&s->lock);
+
r = DM_MAPIO_KILL;
- goto out_unlock;
+ goto out;
}
}
@@ -1763,11 +2060,12 @@
r = DM_MAPIO_SUBMITTED;
- if (!pe->started &&
- bio->bi_iter.bi_size ==
- (s->store->chunk_size << SECTOR_SHIFT)) {
+ if (!pe->started && io_overlaps_chunk(s, bio)) {
pe->started = 1;
- mutex_unlock(&s->lock);
+
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
+
start_full_bio(pe, bio);
goto out;
}
@@ -1775,9 +2073,12 @@
bio_list_add(&pe->snapshot_bios, bio);
if (!pe->started) {
- /* this is protected by snap->lock */
+ /* this is protected by the exception table lock */
pe->started = 1;
- mutex_unlock(&s->lock);
+
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
+
start_copy(pe);
goto out;
}
@@ -1787,7 +2088,8 @@
}
out_unlock:
- mutex_unlock(&s->lock);
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
out:
return r;
}
@@ -1821,9 +2123,15 @@
return DM_MAPIO_REMAPPED;
}
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ /* Once merging, discards no longer effect change */
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
- mutex_lock(&s->lock);
+ down_write(&s->lock);
/* Full merging snapshots are redirected to the origin */
if (!s->valid)
@@ -1854,12 +2162,12 @@
bio_set_dev(bio, s->origin->bdev);
if (bio_data_dir(bio) == WRITE) {
- mutex_unlock(&s->lock);
- return do_origin(s->origin, bio);
+ up_write(&s->lock);
+ return do_origin(s->origin, bio, false);
}
out_unlock:
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
return r;
}
@@ -1891,7 +2199,7 @@
down_read(&_origins_lock);
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
- mutex_lock(&snap_src->lock);
+ down_read(&snap_src->lock);
if (s == snap_src) {
DMERR("Unable to resume snapshot source until "
"handover completes.");
@@ -1901,7 +2209,7 @@
"source is suspended.");
r = -EINVAL;
}
- mutex_unlock(&snap_src->lock);
+ up_read(&snap_src->lock);
}
up_read(&_origins_lock);
@@ -1947,11 +2255,11 @@
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
- mutex_lock(&snap_src->lock);
- mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
+ down_write(&snap_src->lock);
+ down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
__handover_exceptions(snap_src, snap_dest);
- mutex_unlock(&snap_dest->lock);
- mutex_unlock(&snap_src->lock);
+ up_write(&snap_dest->lock);
+ up_write(&snap_src->lock);
}
up_read(&_origins_lock);
@@ -1966,9 +2274,9 @@
/* Now we have correct chunk size, reregister */
reregister_snapshot(s);
- mutex_lock(&s->lock);
+ down_write(&s->lock);
s->active = 1;
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
}
static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
@@ -2004,11 +2312,12 @@
{
unsigned sz = 0;
struct dm_snapshot *snap = ti->private;
+ unsigned num_features;
switch (type) {
case STATUSTYPE_INFO:
- mutex_lock(&snap->lock);
+ down_write(&snap->lock);
if (!snap->valid)
DMEMIT("Invalid");
@@ -2033,7 +2342,7 @@
DMEMIT("Unknown");
}
- mutex_unlock(&snap->lock);
+ up_write(&snap->lock);
break;
@@ -2044,8 +2353,16 @@
* make sense.
*/
DMEMIT("%s %s", snap->origin->name, snap->cow->name);
- snap->store->type->status(snap->store, type, result + sz,
- maxlen - sz);
+ sz += snap->store->type->status(snap->store, type, result + sz,
+ maxlen - sz);
+ num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin;
+ if (num_features) {
+ DMEMIT(" %u", num_features);
+ if (snap->discard_zeroes_cow)
+ DMEMIT(" discard_zeroes_cow");
+ if (snap->discard_passdown_origin)
+ DMEMIT(" discard_passdown_origin");
+ }
break;
}
}
@@ -2064,6 +2381,26 @@
return r;
}
+static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct dm_snapshot *snap = ti->private;
+
+ if (snap->discard_zeroes_cow) {
+ struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+
+ down_read(&_origins_lock);
+
+ (void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
+ if (snap_src && snap_dest)
+ snap = snap_src;
+
+ /* All discards are split on chunk_size boundary */
+ limits->discard_granularity = snap->store->chunk_size;
+ limits->max_discard_sectors = snap->store->chunk_size;
+
+ up_read(&_origins_lock);
+ }
+}
/*-----------------------------------------------------------------
* Origin methods
@@ -2085,9 +2422,10 @@
int r = DM_MAPIO_REMAPPED;
struct dm_snapshot *snap;
struct dm_exception *e;
- struct dm_snap_pending_exception *pe;
+ struct dm_snap_pending_exception *pe, *pe2;
struct dm_snap_pending_exception *pe_to_start_now = NULL;
struct dm_snap_pending_exception *pe_to_start_last = NULL;
+ struct dm_exception_table_lock lock;
chunk_t chunk;
/* Do all the snapshots on this origin */
@@ -2099,52 +2437,59 @@
if (dm_target_is_snapshot_merge(snap->ti))
continue;
- mutex_lock(&snap->lock);
-
- /* Only deal with valid and active snapshots */
- if (!snap->valid || !snap->active)
- goto next_snapshot;
-
/* Nothing to do if writing beyond end of snapshot */
if (sector >= dm_table_get_size(snap->ti->table))
- goto next_snapshot;
+ continue;
/*
* Remember, different snapshots can have
* different chunk sizes.
*/
chunk = sector_to_chunk(snap->store, sector);
+ dm_exception_table_lock_init(snap, chunk, &lock);
- /*
- * Check exception table to see if block
- * is already remapped in this snapshot
- * and trigger an exception if not.
- */
- e = dm_lookup_exception(&snap->complete, chunk);
- if (e)
+ down_read(&snap->lock);
+ dm_exception_table_lock(&lock);
+
+ /* Only deal with valid and active snapshots */
+ if (!snap->valid || !snap->active)
goto next_snapshot;
pe = __lookup_pending_exception(snap, chunk);
if (!pe) {
- mutex_unlock(&snap->lock);
- pe = alloc_pending_exception(snap);
- mutex_lock(&snap->lock);
-
- if (!snap->valid) {
- free_pending_exception(pe);
- goto next_snapshot;
- }
-
+ /*
+ * Check exception table to see if block is already
+ * remapped in this snapshot and trigger an exception
+ * if not.
+ */
e = dm_lookup_exception(&snap->complete, chunk);
- if (e) {
- free_pending_exception(pe);
+ if (e)
goto next_snapshot;
- }
- pe = __find_pending_exception(snap, pe, chunk);
- if (!pe) {
- __invalidate_snapshot(snap, -ENOMEM);
- goto next_snapshot;
+ dm_exception_table_unlock(&lock);
+ pe = alloc_pending_exception(snap);
+ dm_exception_table_lock(&lock);
+
+ pe2 = __lookup_pending_exception(snap, chunk);
+
+ if (!pe2) {
+ e = dm_lookup_exception(&snap->complete, chunk);
+ if (e) {
+ free_pending_exception(pe);
+ goto next_snapshot;
+ }
+
+ pe = __insert_pending_exception(snap, pe, chunk);
+ if (!pe) {
+ dm_exception_table_unlock(&lock);
+ up_read(&snap->lock);
+
+ invalidate_snapshot(snap, -ENOMEM);
+ continue;
+ }
+ } else {
+ free_pending_exception(pe);
+ pe = pe2;
}
}
@@ -2171,7 +2516,8 @@
}
next_snapshot:
- mutex_unlock(&snap->lock);
+ dm_exception_table_unlock(&lock);
+ up_read(&snap->lock);
if (pe_to_start_now) {
start_copy(pe_to_start_now);
@@ -2192,15 +2538,24 @@
/*
* Called on a write from the origin driver.
*/
-static int do_origin(struct dm_dev *origin, struct bio *bio)
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
{
struct origin *o;
int r = DM_MAPIO_REMAPPED;
+again:
down_read(&_origins_lock);
o = __lookup_origin(origin->bdev);
- if (o)
+ if (o) {
+ if (limit) {
+ struct dm_snapshot *s;
+ list_for_each_entry(s, &o->snapshots, list)
+ if (unlikely(!wait_for_in_progress(s, true)))
+ goto again;
+ }
+
r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
+ }
up_read(&_origins_lock);
return r;
@@ -2313,14 +2668,7 @@
dm_accept_partial_bio(bio, available_sectors);
/* Only tell snapshots if this is a write */
- return do_origin(o->dev, bio);
-}
-
-static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
- long nr_pages, void **kaddr, pfn_t *pfn)
-{
- DMWARN("device does not support dax.");
- return -EIO;
+ return do_origin(o->dev, bio, true);
}
/*
@@ -2382,12 +2730,11 @@
.postsuspend = origin_postsuspend,
.status = origin_status,
.iterate_devices = origin_iterate_devices,
- .direct_access = origin_dax_direct_access,
};
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 15, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2397,11 +2744,12 @@
.resume = snapshot_resume,
.status = snapshot_status,
.iterate_devices = snapshot_iterate_devices,
+ .io_hints = snapshot_io_hints,
};
static struct target_type merge_target = {
.name = dm_snapshot_merge_target_name,
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2412,6 +2760,7 @@
.resume = snapshot_merge_resume,
.status = snapshot_status,
.iterate_devices = snapshot_iterate_devices,
+ .io_hints = snapshot_io_hints,
};
static int __init dm_snapshot_init(void)
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 21de30b..7141704 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -85,7 +85,7 @@
a = shared_memory_amount + alloc_size;
if (a < shared_memory_amount)
return false;
- if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
+ if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR)
return false;
#ifdef CONFIG_MMU
if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
@@ -262,7 +262,7 @@
if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
return -EOVERFLOW;
- shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
+ shared_alloc_size = struct_size(s, stat_shared, n_entries);
if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
return -EOVERFLOW;
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index fae35ca..8a0f057 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -61,8 +61,7 @@
{
struct switch_ctx *sctx;
- sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
- GFP_KERNEL);
+ sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL);
if (!sctx)
return NULL;
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index c209b8a..a05fcd5 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -92,7 +92,8 @@
static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf)
{
- sprintf(buf, "%d\n", dm_use_blk_mq(md));
+ /* Purely for userspace compatibility */
+ sprintf(buf, "%d\n", true);
return strlen(buf);
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3d0e2c1..52e0495 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -47,7 +47,6 @@
bool integrity_supported:1;
bool singleton:1;
- bool all_blk_mq:1;
unsigned integrity_added:1;
/*
@@ -164,10 +163,8 @@
/*
* Allocate both the target array and offset array at once.
- * Append an empty entry to catch sectors beyond the end of
- * the device.
*/
- n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) +
+ n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) +
sizeof(sector_t));
if (!n_highs)
return -ENOMEM;
@@ -562,7 +559,7 @@
gfp = GFP_NOIO;
}
argv = kmalloc_array(new_size, sizeof(*argv), gfp);
- if (argv) {
+ if (argv && old_argv) {
memcpy(argv, old_argv, *size * sizeof(*argv));
*size = new_size;
}
@@ -872,8 +869,7 @@
static bool __table_type_request_based(enum dm_queue_mode table_type)
{
- return (table_type == DM_TYPE_REQUEST_BASED ||
- table_type == DM_TYPE_MQ_REQUEST_BASED);
+ return table_type == DM_TYPE_REQUEST_BASED;
}
void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
@@ -882,13 +878,25 @@
}
EXPORT_SYMBOL_GPL(dm_table_set_type);
-static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+/* validate the dax capability of the target device span */
+int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
- return bdev_dax_supported(dev->bdev, PAGE_SIZE);
+ int blocksize = *(int *) data;
+
+ return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize,
+ start, len);
}
-static bool dm_table_supports_dax(struct dm_table *t)
+/* Check devices support synchronous DAX */
+static int device_dax_synchronous(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ return dev->dax_dev && dax_synchronous(dev->dax_dev);
+}
+
+bool dm_table_supports_dax(struct dm_table *t,
+ iterate_devices_callout_fn iterate_fn, int *blocksize)
{
struct dm_target *ti;
unsigned i;
@@ -901,7 +909,7 @@
return false;
if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, device_supports_dax, NULL))
+ !ti->type->iterate_devices(ti, iterate_fn, blocksize))
return false;
}
@@ -921,12 +929,12 @@
struct request_queue *q = bdev_get_queue(dev->bdev);
struct verify_rq_based_data *v = data;
- if (q->mq_ops)
+ if (queue_is_mq(q))
v->mq_count++;
else
v->sq_count++;
- return queue_is_rq_based(q);
+ return queue_is_mq(q);
}
static int dm_table_determine_type(struct dm_table *t)
@@ -937,6 +945,7 @@
struct dm_target *tgt;
struct list_head *devices = dm_table_get_devices(t);
enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
+ int page_size = PAGE_SIZE;
if (t->type != DM_TYPE_NONE) {
/* target already set the table's type */
@@ -981,7 +990,7 @@
verify_bio_based:
/* We must use this table as bio-based */
t->type = DM_TYPE_BIO_BASED;
- if (dm_table_supports_dax(t) ||
+ if (dm_table_supports_dax(t, device_supports_dax, &page_size) ||
(list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
t->type = DM_TYPE_DAX_BIO_BASED;
} else {
@@ -999,10 +1008,6 @@
BUG_ON(!request_based); /* No targets in this table */
- /*
- * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by
- * having a compatible target use dm_table_set_type.
- */
t->type = DM_TYPE_REQUEST_BASED;
verify_rq_based:
@@ -1022,11 +1027,9 @@
int srcu_idx;
struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx);
- /* inherit live table's type and all_blk_mq */
- if (live_table) {
+ /* inherit live table's type */
+ if (live_table)
t->type = live_table->type;
- t->all_blk_mq = live_table->all_blk_mq;
- }
dm_put_live_table(t->md, srcu_idx);
return 0;
}
@@ -1046,17 +1049,10 @@
DMERR("table load rejected: including non-request-stackable devices");
return -EINVAL;
}
- if (v.sq_count && v.mq_count) {
+ if (v.sq_count > 0) {
DMERR("table load rejected: not all devices are blk-mq request-stackable");
return -EINVAL;
}
- t->all_blk_mq = v.mq_count > 0;
-
- if (!t->all_blk_mq &&
- (t->type == DM_TYPE_MQ_REQUEST_BASED || t->type == DM_TYPE_NVME_BIO_BASED)) {
- DMERR("table load rejected: all devices are not blk-mq request-stackable");
- return -EINVAL;
- }
return 0;
}
@@ -1105,11 +1101,6 @@
return __table_type_request_based(dm_table_get_type(t));
}
-bool dm_table_all_blk_mq_devices(struct dm_table *t)
-{
- return t->all_blk_mq;
-}
-
static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
enum dm_queue_mode type = dm_table_get_type(t);
@@ -1349,7 +1340,7 @@
}
EXPORT_SYMBOL(dm_table_event);
-sector_t dm_table_get_size(struct dm_table *t)
+inline sector_t dm_table_get_size(struct dm_table *t)
{
return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
}
@@ -1366,7 +1357,7 @@
/*
* Search the btree for the correct target.
*
- * Caller should check returned pointer with dm_target_is_valid()
+ * Caller should check returned pointer for NULL
* to trap I/O beyond end of device.
*/
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
@@ -1374,6 +1365,9 @@
unsigned int l, n = 0, k = 0;
sector_t *node;
+ if (unlikely(sector >= dm_table_get_size(t)))
+ return NULL;
+
for (l = 0; l < t->depth; l++) {
n = get_child(n, k);
node = get_node(t, l, n);
@@ -1718,14 +1712,6 @@
return q && !blk_queue_add_random(q);
}
-static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- struct request_queue *q = bdev_get_queue(dev->bdev);
-
- return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
-}
-
static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
@@ -1872,10 +1858,41 @@
return true;
}
+static int device_requires_stable_pages(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && bdi_cap_stable_pages_required(q->backing_dev_info);
+}
+
+/*
+ * If any underlying device requires stable pages, a table must require
+ * them as well. Only targets that support iterate_devices are considered:
+ * don't want error, zero, etc to require stable pages.
+ */
+static bool dm_table_requires_stable_pages(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i;
+
+ for (i = 0; i < dm_table_get_num_targets(t); i++) {
+ ti = dm_table_get_target(t, i);
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, device_requires_stable_pages, NULL))
+ return true;
+ }
+
+ return false;
+}
+
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
bool wc = false, fua = false;
+ int page_size = PAGE_SIZE;
/*
* Copy table's limits to the DM device's request_queue
@@ -1903,8 +1920,11 @@
}
blk_queue_write_cache(q, wc, fua);
- if (dm_table_supports_dax(t))
+ if (dm_table_supports_dax(t, device_supports_dax, &page_size)) {
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+ if (dm_table_supports_dax(t, device_dax_synchronous, NULL))
+ set_dax_synchronous(t->md->dax_dev);
+ }
else
blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
@@ -1922,14 +1942,18 @@
if (!dm_table_supports_write_zeroes(t))
q->limits.max_write_zeroes_sectors = 0;
- if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
- blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q);
- else
- blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
dm_table_verify_integrity(t);
/*
+ * Some devices don't use blk_integrity but still want stable pages
+ * because they do their own checksumming.
+ */
+ if (dm_table_requires_stable_pages(t))
+ q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+ else
+ q->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
+
+ /*
* Determine whether or not this queue's I/O timings contribute
* to the entropy pool, Only request-based targets use this.
* Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
@@ -1937,6 +1961,19 @@
*/
if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
+
+ /*
+ * For a zoned target, the number of zones should be updated for the
+ * correct value to be exposed in sysfs queue/nr_zones. For a BIO based
+ * target, this is all that is needed. For a request based target, the
+ * queue zone bitmaps must also be updated.
+ * Use blk_revalidate_disk_zones() to handle this.
+ */
+ if (blk_queue_is_zoned(q))
+ blk_revalidate_disk_zones(t->md->disk);
+
+ /* Allow reads to exceed readahead limits */
+ q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9);
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -2079,26 +2116,24 @@
}
EXPORT_SYMBOL(dm_table_get_md);
+const char *dm_table_device_name(struct dm_table *t)
+{
+ return dm_device_name(t->md);
+}
+EXPORT_SYMBOL_GPL(dm_table_device_name);
+
void dm_table_run_md_queue_async(struct dm_table *t)
{
struct mapped_device *md;
struct request_queue *queue;
- unsigned long flags;
if (!dm_table_request_based(t))
return;
md = dm_table_get_md(t);
queue = dm_get_md_queue(md);
- if (queue) {
- if (queue->mq_ops)
- blk_mq_run_hw_queues(queue, true);
- else {
- spin_lock_irqsave(queue->queue_lock, flags);
- blk_run_queue_async(queue);
- spin_unlock_irqrestore(queue->queue_lock, flags);
- }
- }
+ if (queue)
+ blk_mq_run_hw_queues(queue, true);
}
EXPORT_SYMBOL(dm_table_run_md_queue_async);
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 314d17c..64dd0b3 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -136,7 +136,8 @@
return DM_MAPIO_KILL;
}
-static void io_err_release_clone_rq(struct request *clone)
+static void io_err_release_clone_rq(struct request *clone,
+ union map_info *map_context)
{
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 20b0776..4c68a7b 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -202,6 +202,13 @@
bool fail_io:1;
/*
+ * Set once a thin-pool has been accessed through one of the interfaces
+ * that imply the pool is in-service (e.g. thin devices created/deleted,
+ * thin-pool message, metadata snapshots, etc).
+ */
+ bool in_service:1;
+
+ /*
* Reading the space map roots can fail, so we read it into these
* buffers before the superblock is locked and updated.
*/
@@ -367,6 +374,32 @@
/*----------------------------------------------------------------*/
+/*
+ * Variant that is used for in-core only changes or code that
+ * shouldn't put the pool in service on its own (e.g. commit).
+ */
+static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
+ __acquires(pmd->root_lock)
+{
+ down_write(&pmd->root_lock);
+}
+#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
+
+static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
+{
+ __pmd_write_lock(pmd);
+ if (unlikely(!pmd->in_service))
+ pmd->in_service = true;
+}
+
+static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
+ __releases(pmd->root_lock)
+{
+ up_write(&pmd->root_lock);
+}
+
+/*----------------------------------------------------------------*/
+
static int superblock_lock_zero(struct dm_pool_metadata *pmd,
struct dm_block **sblock)
{
@@ -790,6 +823,9 @@
*/
BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
+ if (unlikely(!pmd->in_service))
+ return 0;
+
r = __write_changed_details(pmd);
if (r < 0)
return r;
@@ -853,6 +889,7 @@
pmd->time = 0;
INIT_LIST_HEAD(&pmd->thin_devices);
pmd->fail_io = false;
+ pmd->in_service = false;
pmd->bdev = bdev;
pmd->data_block_size = data_block_size;
@@ -903,7 +940,6 @@
DMWARN("%s: __commit_transaction() failed, error = %d",
__func__, r);
}
-
if (!pmd->fail_io)
__destroy_persistent_data_objects(pmd);
@@ -1032,10 +1068,10 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __create_thin(pmd, dev);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1123,10 +1159,10 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __create_snap(pmd, dev, origin);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1166,10 +1202,10 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __delete_device(pmd, dev);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1180,7 +1216,7 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (pmd->fail_io)
goto out;
@@ -1194,7 +1230,7 @@
r = 0;
out:
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1225,7 +1261,12 @@
* We commit to ensure the btree roots which we increment in a
* moment are up to date.
*/
- __commit_transaction(pmd);
+ r = __commit_transaction(pmd);
+ if (r < 0) {
+ DMWARN("%s: __commit_transaction() failed, error = %d",
+ __func__, r);
+ return r;
+ }
/*
* Copy the superblock.
@@ -1283,10 +1324,10 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __reserve_metadata_snap(pmd);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1331,10 +1372,10 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __release_metadata_snap(pmd);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1377,19 +1418,19 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock_in_core(pmd);
if (!pmd->fail_io)
r = __open_device(pmd, dev, 0, td);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
int dm_pool_close_thin_device(struct dm_thin_device *td)
{
- down_write(&td->pmd->root_lock);
+ pmd_write_lock_in_core(td->pmd);
__close_device(td);
- up_write(&td->pmd->root_lock);
+ pmd_write_unlock(td->pmd);
return 0;
}
@@ -1570,10 +1611,10 @@
{
int r = -EINVAL;
- down_write(&td->pmd->root_lock);
+ pmd_write_lock(td->pmd);
if (!td->pmd->fail_io)
r = __insert(td, block, data_block);
- up_write(&td->pmd->root_lock);
+ pmd_write_unlock(td->pmd);
return r;
}
@@ -1657,10 +1698,10 @@
{
int r = -EINVAL;
- down_write(&td->pmd->root_lock);
+ pmd_write_lock(td->pmd);
if (!td->pmd->fail_io)
r = __remove(td, block);
- up_write(&td->pmd->root_lock);
+ pmd_write_unlock(td->pmd);
return r;
}
@@ -1670,15 +1711,15 @@
{
int r = -EINVAL;
- down_write(&td->pmd->root_lock);
+ pmd_write_lock(td->pmd);
if (!td->pmd->fail_io)
r = __remove_range(td, begin, end);
- up_write(&td->pmd->root_lock);
+ pmd_write_unlock(td->pmd);
return r;
}
-int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
+int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
int r;
uint32_t ref_count;
@@ -1686,7 +1727,7 @@
down_read(&pmd->root_lock);
r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
if (!r)
- *result = (ref_count != 0);
+ *result = (ref_count > 1);
up_read(&pmd->root_lock);
return r;
@@ -1696,13 +1737,13 @@
{
int r = 0;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
for (; b != e; b++) {
r = dm_sm_inc_block(pmd->data_sm, b);
if (r)
break;
}
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1711,13 +1752,13 @@
{
int r = 0;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
for (; b != e; b++) {
r = dm_sm_dec_block(pmd->data_sm, b);
if (r)
break;
}
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1765,10 +1806,10 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = dm_sm_new_block(pmd->data_sm, result);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1777,12 +1818,16 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ /*
+ * Care is taken to not have commit be what
+ * triggers putting the thin-pool in-service.
+ */
+ __pmd_write_lock(pmd);
if (pmd->fail_io)
goto out;
r = __commit_transaction(pmd);
- if (r <= 0)
+ if (r < 0)
goto out;
/*
@@ -1790,7 +1835,7 @@
*/
r = __begin_transaction(pmd);
out:
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1806,7 +1851,7 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (pmd->fail_io)
goto out;
@@ -1817,7 +1862,7 @@
pmd->fail_io = true;
out:
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1948,10 +1993,10 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __resize_space_map(pmd->data_sm, new_count);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1960,29 +2005,29 @@
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io) {
r = __resize_space_map(pmd->metadata_sm, new_count);
if (!r)
__set_metadata_reserve(pmd);
}
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
- down_write(&pmd->root_lock);
+ pmd_write_lock_in_core(pmd);
dm_bm_set_read_only(pmd->bm);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
}
void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
- down_write(&pmd->root_lock);
+ pmd_write_lock_in_core(pmd);
dm_bm_set_read_write(pmd->bm);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
}
int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
@@ -1992,25 +2037,28 @@
{
int r;
- down_write(&pmd->root_lock);
+ pmd_write_lock_in_core(pmd);
r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
- int r;
+ int r = -EINVAL;
struct dm_block *sblock;
struct thin_disk_superblock *disk_super;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
+ if (pmd->fail_io)
+ goto out;
+
pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
r = superblock_lock(pmd, &sblock);
if (r) {
- DMERR("couldn't read superblock");
+ DMERR("couldn't lock superblock");
goto out;
}
@@ -2019,7 +2067,7 @@
dm_bm_unlock(sblock);
out:
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 35e954e..f6be0d7 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -195,7 +195,7 @@
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
-int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
+int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 1f225a1..fcd8877 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -257,6 +257,7 @@
spinlock_t lock;
struct bio_list deferred_flush_bios;
+ struct bio_list deferred_flush_completions;
struct list_head prepared_mappings;
struct list_head prepared_discards;
struct list_head prepared_discards_pt2;
@@ -354,7 +355,7 @@
* Ensures the thin is not destroyed until the worker has finished
* iterating the active_thins list.
*/
- atomic_t refcount;
+ refcount_t refcount;
struct completion can_destroy;
};
@@ -956,6 +957,39 @@
mempool_free(m, &m->tc->pool->mapping_pool);
}
+static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+
+ /*
+ * If the bio has the REQ_FUA flag set we must commit the metadata
+ * before signaling its completion.
+ */
+ if (!bio_triggers_commit(tc, bio)) {
+ bio_endio(bio);
+ return;
+ }
+
+ /*
+ * Complete bio with an error if earlier I/O caused changes to the
+ * metadata that can't be committed, e.g., due to I/O errors on the
+ * metadata device.
+ */
+ if (dm_thin_aborted_changes(tc->td)) {
+ bio_io_error(bio);
+ return;
+ }
+
+ /*
+ * Batch together any bios that trigger commits and then issue a
+ * single commit for them in process_deferred_bios().
+ */
+ spin_lock_irqsave(&pool->lock, flags);
+ bio_list_add(&pool->deferred_flush_completions, bio);
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
@@ -988,7 +1022,7 @@
*/
if (bio) {
inc_remap_and_issue_cell(tc, m->cell, m->data_block);
- bio_endio(bio);
+ complete_overwrite_bio(tc, bio);
} else {
inc_all_io_entry(tc->pool, m->cell->holder);
remap_and_issue(tc, m->cell->holder, m->data_block);
@@ -1048,7 +1082,7 @@
* passdown we have to check that these blocks are now unused.
*/
int r = 0;
- bool used = true;
+ bool shared = true;
struct thin_c *tc = m->tc;
struct pool *pool = tc->pool;
dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
@@ -1058,11 +1092,11 @@
while (b != end) {
/* find start of unmapped run */
for (; b < end; b++) {
- r = dm_pool_block_is_used(pool->pmd, b, &used);
+ r = dm_pool_block_is_shared(pool->pmd, b, &shared);
if (r)
goto out;
- if (!used)
+ if (!shared)
break;
}
@@ -1071,11 +1105,11 @@
/* find end of run */
for (e = b + 1; e != end; e++) {
- r = dm_pool_block_is_used(pool->pmd, e, &used);
+ r = dm_pool_block_is_shared(pool->pmd, e, &shared);
if (r)
goto out;
- if (used)
+ if (shared)
break;
}
@@ -2317,7 +2351,7 @@
{
unsigned long flags;
struct bio *bio;
- struct bio_list bios;
+ struct bio_list bios, bio_completions;
struct thin_c *tc;
tc = get_first_thin(pool);
@@ -2328,26 +2362,36 @@
}
/*
- * If there are any deferred flush bios, we must commit
- * the metadata before issuing them.
+ * If there are any deferred flush bios, we must commit the metadata
+ * before issuing them or signaling their completion.
*/
bio_list_init(&bios);
+ bio_list_init(&bio_completions);
+
spin_lock_irqsave(&pool->lock, flags);
bio_list_merge(&bios, &pool->deferred_flush_bios);
bio_list_init(&pool->deferred_flush_bios);
+
+ bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
+ bio_list_init(&pool->deferred_flush_completions);
spin_unlock_irqrestore(&pool->lock, flags);
- if (bio_list_empty(&bios) &&
+ if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
return;
if (commit(pool)) {
+ bio_list_merge(&bios, &bio_completions);
+
while ((bio = bio_list_pop(&bios)))
bio_io_error(bio);
return;
}
pool->last_commit_jiffies = jiffies;
+ while ((bio = bio_list_pop(&bio_completions)))
+ bio_endio(bio);
+
while ((bio = bio_list_pop(&bios)))
generic_make_request(bio);
}
@@ -2954,6 +2998,7 @@
INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
spin_lock_init(&pool->lock);
bio_list_init(&pool->deferred_flush_bios);
+ bio_list_init(&pool->deferred_flush_completions);
INIT_LIST_HEAD(&pool->prepared_mappings);
INIT_LIST_HEAD(&pool->prepared_discards);
INIT_LIST_HEAD(&pool->prepared_discards_pt2);
@@ -3238,6 +3283,13 @@
as.argc = argc;
as.argv = argv;
+ /* make sure metadata and data are different devices */
+ if (!strcmp(argv[0], argv[1])) {
+ ti->error = "Error setting metadata or data device";
+ r = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* Set default pool features.
*/
@@ -4025,7 +4077,7 @@
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 20, 0},
+ .version = {1, 21, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4046,12 +4098,12 @@
*--------------------------------------------------------------*/
static void thin_get(struct thin_c *tc)
{
- atomic_inc(&tc->refcount);
+ refcount_inc(&tc->refcount);
}
static void thin_put(struct thin_c *tc)
{
- if (atomic_dec_and_test(&tc->refcount))
+ if (refcount_dec_and_test(&tc->refcount))
complete(&tc->can_destroy);
}
@@ -4122,6 +4174,12 @@
tc->sort_bio_list = RB_ROOT;
if (argc == 3) {
+ if (!strcmp(argv[0], argv[2])) {
+ ti->error = "Error setting origin device";
+ r = -EINVAL;
+ goto bad_origin_dev;
+ }
+
r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
if (r) {
ti->error = "Error opening origin device";
@@ -4182,7 +4240,6 @@
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
- ti->split_discard_bios = false;
}
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -4195,7 +4252,7 @@
r = -EINVAL;
goto bad;
}
- atomic_set(&tc->refcount, 1);
+ refcount_set(&tc->refcount, 1);
init_completion(&tc->can_destroy);
list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
spin_unlock_irqrestore(&tc->pool->lock, flags);
@@ -4399,7 +4456,7 @@
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 20, 0},
+ .version = {1, 21, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 8efe033..8671267 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Device Mapper Uevent Support (dm-uevent)
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright IBM Corporation, 2007
* Author: Mike Anderson <andmike@linux.vnet.ibm.com>
*/
diff --git a/drivers/md/dm-uevent.h b/drivers/md/dm-uevent.h
index 2eccc8b..d30d226 100644
--- a/drivers/md/dm-uevent.h
+++ b/drivers/md/dm-uevent.h
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Device Mapper Uevent Support
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright IBM Corporation, 2007
* Author: Mike Anderson <andmike@linux.vnet.ibm.com>
*/
diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c
index 954b7ab..e673dac 100644
--- a/drivers/md/dm-unstripe.c
+++ b/drivers/md/dm-unstripe.c
@@ -78,7 +78,7 @@
goto err;
}
- if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1) {
+ if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) {
ti->error = "Invalid striped device offset";
goto err;
}
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 684af08..3ceeb6b 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2015 Google, Inc.
*
* Author: Sami Tolvanen <samitolvanen@google.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
*/
#include "dm-verity-fec.h"
@@ -73,7 +69,7 @@
*offset = (unsigned)(position - (block << v->data_dev_block_bits));
res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf);
- if (unlikely(IS_ERR(res))) {
+ if (IS_ERR(res)) {
DMERR("%s: FEC %llu: parity read failed (block %llu): %ld",
v->data_dev->name, (unsigned long long)rsb,
(unsigned long long)(v->fec->start + block),
@@ -163,7 +159,7 @@
dm_bufio_release(buf);
par = fec_read_parity(v, rsb, block_offset, &offset, &buf);
- if (unlikely(IS_ERR(par)))
+ if (IS_ERR(par))
return PTR_ERR(par);
}
}
@@ -212,12 +208,15 @@
struct dm_verity_fec_io *fio = fec_io(io);
u64 block, ileaved;
u8 *bbuf, *rs_block;
- u8 want_digest[v->digest_size];
+ u8 want_digest[HASH_MAX_DIGESTSIZE];
unsigned n, k;
if (neras)
*neras = 0;
+ if (WARN_ON(v->digest_size > sizeof(want_digest)))
+ return -EINVAL;
+
/*
* read each of the rsn data blocks that are part of the RS block, and
* interleave contents to available bufs
@@ -250,7 +249,7 @@
}
bbuf = dm_bufio_read(bufio, block, &buf);
- if (unlikely(IS_ERR(bbuf))) {
+ if (IS_ERR(bbuf)) {
DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld",
v->data_dev->name,
(unsigned long long)rsb,
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index 6ad803b..42fbd3a 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright (C) 2015 Google, Inc.
*
* Author: Sami Tolvanen <samitolvanen@google.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
*/
#ifndef DM_VERITY_FEC_H
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index fc65f0d..4fb33e7 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2012 Red Hat, Inc.
*
@@ -5,8 +6,6 @@
*
* Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
*
- * This file is released under the GPLv2.
- *
* In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
* default prefetch value. Data are read in "prefetch_cluster" chunks from the
* hash device. Setting this greatly improves performance when data and hash
@@ -16,7 +15,7 @@
#include "dm-verity.h"
#include "dm-verity-fec.h"
-
+#include "dm-verity-verify-sig.h"
#include <linux/module.h>
#include <linux/reboot.h>
@@ -34,7 +33,8 @@
#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
-#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC)
+#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC + \
+ DM_VERITY_ROOT_HASH_VERIFICATION_OPTS)
static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
@@ -236,8 +236,8 @@
BUG();
}
- DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
- block);
+ DMERR_LIMIT("%s: %s block %llu is corrupted", v->data_dev->name,
+ type_str, block);
if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
DMERR("%s: reached maximum errors", v->data_dev->name);
@@ -714,6 +714,8 @@
args++;
if (v->validated_blocks)
args++;
+ if (v->signature_key_desc)
+ args += DM_VERITY_ROOT_HASH_VERIFICATION_OPTS;
if (!args)
return;
DMEMIT(" %u", args);
@@ -735,6 +737,9 @@
if (v->validated_blocks)
DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE);
sz = verity_fec_status_table(v, sz, result, maxlen);
+ if (v->signature_key_desc)
+ DMEMIT(" " DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY
+ " %s", v->signature_key_desc);
break;
}
}
@@ -800,6 +805,8 @@
verity_fec_dtr(v);
+ kfree(v->signature_key_desc);
+
kfree(v);
}
@@ -855,7 +862,8 @@
return r;
}
-static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
+static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
+ struct dm_verity_sig_opts *verify_args)
{
int r;
unsigned argc;
@@ -904,6 +912,14 @@
if (r)
return r;
continue;
+ } else if (verity_verify_is_sig_opt_arg(arg_name)) {
+ r = verity_verify_sig_parse_opt_args(as, v,
+ verify_args,
+ &argc, arg_name);
+ if (r)
+ return r;
+ continue;
+
}
ti->error = "Unrecognized verity feature request";
@@ -930,6 +946,7 @@
static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
struct dm_verity *v;
+ struct dm_verity_sig_opts verify_args = {0};
struct dm_arg_set as;
unsigned int num;
unsigned long long num_ll;
@@ -937,6 +954,7 @@
int i;
sector_t hash_position;
char dummy;
+ char *root_hash_digest_to_validate;
v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
if (!v) {
@@ -1040,6 +1058,15 @@
v->tfm = NULL;
goto bad;
}
+
+ /*
+ * dm-verity performance can vary greatly depending on which hash
+ * algorithm implementation is used. Help people debug performance
+ * problems by logging the ->cra_driver_name.
+ */
+ DMINFO("%s using implementation \"%s\"", v->alg_name,
+ crypto_hash_alg_common(v->tfm)->base.cra_driver_name);
+
v->digest_size = crypto_ahash_digestsize(v->tfm);
if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
ti->error = "Digest size too big";
@@ -1061,6 +1088,7 @@
r = -EINVAL;
goto bad;
}
+ root_hash_digest_to_validate = argv[8];
if (strcmp(argv[9], "-")) {
v->salt_size = strlen(argv[9]) / 2;
@@ -1086,11 +1114,20 @@
as.argc = argc;
as.argv = argv;
- r = verity_parse_opt_args(&as, v);
+ r = verity_parse_opt_args(&as, v, &verify_args);
if (r < 0)
goto bad;
}
+ /* Root hash signature is an optional parameter */
+ r = verity_verify_root_hash(root_hash_digest_to_validate,
+ strlen(root_hash_digest_to_validate),
+ verify_args.sig,
+ verify_args.sig_size);
+ if (r < 0) {
+ ti->error = "Root hash verification failed";
+ goto bad;
+ }
v->hash_per_block_bits =
__fls((1 << v->hash_dev_block_bits) / v->digest_size);
@@ -1156,9 +1193,13 @@
ti->per_io_data_size = roundup(ti->per_io_data_size,
__alignof__(struct dm_verity_io));
+ verity_verify_sig_opts_cleanup(&verify_args);
+
return 0;
bad:
+
+ verity_verify_sig_opts_cleanup(&verify_args);
verity_dtr(ti);
return r;
@@ -1166,7 +1207,7 @@
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c
new file mode 100644
index 0000000..614e43d
--- /dev/null
+++ b/drivers/md/dm-verity-verify-sig.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Microsoft Corporation.
+ *
+ * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com>
+ *
+ */
+#include <linux/device-mapper.h>
+#include <linux/verification.h>
+#include <keys/user-type.h>
+#include <linux/module.h>
+#include "dm-verity.h"
+#include "dm-verity-verify-sig.h"
+
+#define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s
+
+/*
+ * When set, verity_verify_root_hash() refuses a table that carries no root
+ * hash signature. The third module_param() argument is the sysfs permission
+ * mask, not an initial value: 0444 exposes the setting read-only so that
+ * user space can inspect it but cannot switch the requirement off.
+ */
+static bool require_signatures;
+module_param(require_signatures, bool, 0444);
+MODULE_PARM_DESC(require_signatures,
+		"Verify the roothash of dm-verity hash tree");
+
+#define DM_VERITY_IS_SIG_FORCE_ENABLED() \
+	(require_signatures != false)
+
+/* Return true when @arg_name is the root hash signature key option. */
+bool verity_verify_is_sig_opt_arg(const char *arg_name)
+{
+	return (!strcasecmp(arg_name,
+			    DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY));
+}
+
+/*
+ * Look up the "user" type key named @key_desc and copy its payload into
+ * @sig_opts. The copy is made while key->sem is held for reading and is
+ * released later with verity_verify_sig_opts_cleanup().
+ *
+ * Returns 0 on success, the request_key() error if the lookup fails,
+ * -EKEYREVOKED if the key has no payload, or -ENOMEM.
+ */
+static int verity_verify_get_sig_from_key(const char *key_desc,
+					struct dm_verity_sig_opts *sig_opts)
+{
+	struct key *key;
+	const struct user_key_payload *ukp;
+	int ret = 0;
+
+	key = request_key(&key_type_user, key_desc, NULL);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	down_read(&key->sem);
+
+	ukp = user_key_payload_locked(key);
+	if (!ukp) {
+		ret = -EKEYREVOKED;
+		goto end;
+	}
+
+	sig_opts->sig = kmemdup(ukp->data, ukp->datalen, GFP_KERNEL);
+	if (!sig_opts->sig) {
+		ret = -ENOMEM;
+		goto end;
+	}
+	sig_opts->sig_size = ukp->datalen;
+
+end:
+	up_read(&key->sem);
+	key_put(key);
+
+	return ret;
+}
+
+/*
+ * Parse the value of the root hash signature key option.
+ *
+ * Takes the next argument from @as as a key description, loads the signature
+ * stored in that key into @sig_opts and records a copy of the description in
+ * v->signature_key_desc. @argc is decremented for the consumed argument;
+ * @arg_name is not used.
+ *
+ * If the option is given more than once, the signature and description loaded
+ * by the earlier occurrence are freed first instead of being leaked.
+ *
+ * Returns 0 on success, -EINVAL if no argument is left, -ENOMEM, or the error
+ * from the key lookup (ti->error is set for the -EINVAL and lookup cases).
+ */
+int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
+				     struct dm_verity *v,
+				     struct dm_verity_sig_opts *sig_opts,
+				     unsigned int *argc,
+				     const char *arg_name)
+{
+	struct dm_target *ti = v->ti;
+	const char *sig_key;
+	int ret;
+
+	if (!*argc) {
+		ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified");
+		return -EINVAL;
+	}
+
+	sig_key = dm_shift_arg(as);
+	(*argc)--;
+
+	/* Drop whatever an earlier occurrence of the option loaded. */
+	verity_verify_sig_opts_cleanup(sig_opts);
+	kfree(v->signature_key_desc);
+	v->signature_key_desc = NULL;
+
+	ret = verity_verify_get_sig_from_key(sig_key, sig_opts);
+	if (ret < 0) {
+		ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified");
+		return ret;
+	}
+
+	v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL);
+	if (!v->signature_key_desc)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * verity_verify_root_hash - Verify the root hash of the verity hash device
+ *			     using builtin trusted keys.
+ *
+ * @root_hash: For verity, the roothash/data to be verified.
+ * @root_hash_len: Size of the roothash/data to be verified.
+ * @sig_data: The trusted signature that verifies the roothash/data.
+ * @sig_len: Size of the signature.
+ *
+ * Returns -EINVAL for a missing root hash. Without a signature the result is
+ * 0, or -ENOKEY when the require_signatures module parameter is set.
+ * Otherwise the result of verify_pkcs7_signature() is returned.
+ */
+int verity_verify_root_hash(const void *root_hash, size_t root_hash_len,
+			    const void *sig_data, size_t sig_len)
+{
+	if (!root_hash || root_hash_len == 0)
+		return -EINVAL;
+
+	if (!sig_data || sig_len == 0) {
+		if (DM_VERITY_IS_SIG_FORCE_ENABLED())
+			return -ENOKEY;
+		else
+			return 0;
+	}
+
+	return verify_pkcs7_signature(root_hash, root_hash_len, sig_data,
+				      sig_len, NULL,
+				      VERIFYING_UNSPECIFIED_SIGNATURE,
+				      NULL, NULL);
+}
+
+/* Free the signature copy held in @sig_opts and reset it to empty. */
+void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
+{
+	kfree(sig_opts->sig);
+	sig_opts->sig = NULL;
+	sig_opts->sig_size = 0;
+}
diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h
new file mode 100644
index 0000000..19b1547
--- /dev/null
+++ b/drivers/md/dm-verity-verify-sig.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Microsoft Corporation.
+ *
+ * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com>
+ *
+ */
+#ifndef DM_VERITY_SIG_VERIFICATION_H
+#define DM_VERITY_SIG_VERIFICATION_H
+
+#define DM_VERITY_ROOT_HASH_VERIFICATION "DM Verity Sig Verification"
+#define DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY "root_hash_sig_key_desc"
+
+/* Signature data passed from table parsing to verity_verify_root_hash(). */
+struct dm_verity_sig_opts {
+	unsigned int sig_size;	/* length of sig in bytes */
+	u8 *sig;		/* signature data, NULL if none was loaded */
+};
+
+#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG
+
+/* Table arguments taken by the signature option: its name and its value. */
+#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 2
+
+int verity_verify_root_hash(const void *data, size_t data_len,
+			    const void *sig_data, size_t sig_len);
+bool verity_verify_is_sig_opt_arg(const char *arg_name);
+
+int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
+				     struct dm_verity_sig_opts *sig_opts,
+				     unsigned int *argc, const char *arg_name);
+
+void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts);
+
+#else
+
+/*
+ * Signature verification is compiled out. The stubs below are static inline
+ * so that each file including this header gets a private copy; plain
+ * definitions would be emitted as external symbols by every includer.
+ */
+#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0
+
+static inline int verity_verify_root_hash(const void *data, size_t data_len,
+					  const void *sig_data, size_t sig_len)
+{
+	return 0;
+}
+
+static inline bool verity_verify_is_sig_opt_arg(const char *arg_name)
+{
+	return false;
+}
+
+static inline int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
+						   struct dm_verity *v,
+						   struct dm_verity_sig_opts *sig_opts,
+						   unsigned int *argc,
+						   const char *arg_name)
+{
+	return -EINVAL;
+}
+
+static inline void
+verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
+{
+}
+
+#endif /* CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */
+#endif /* DM_VERITY_SIG_VERIFICATION_H */
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 3441c10..641b9e3 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2012 Red Hat, Inc.
* Copyright (C) 2015 Google, Inc.
@@ -5,8 +6,6 @@
* Author: Mikulas Patocka <mpatocka@redhat.com>
*
* Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
- *
- * This file is released under the GPLv2.
*/
#ifndef DM_VERITY_H
@@ -64,6 +63,8 @@
struct dm_verity_fec *fec; /* forward error correction */
unsigned long *validated_blocks; /* bitset blocks validated */
+
+ char *signature_key_desc; /* signature keyring reference */
};
struct dm_verity_io {
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 5f1f80d..d06b8aa 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -190,8 +190,6 @@
struct dm_writecache *wc;
struct wc_entry **wc_list;
unsigned wc_list_n;
- unsigned page_offset;
- struct page *page;
struct wc_entry *wc_list_inline[WB_LIST_INLINE];
struct bio bio;
};
@@ -350,10 +348,7 @@
static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
- if (is_power_of_2(sizeof(struct wc_entry)) && 0)
- return &sb(wc)->entries[e - wc->entries];
- else
- return &sb(wc)->entries[e->index];
+ return &sb(wc)->entries[e->index];
}
static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
@@ -549,21 +544,20 @@
e = container_of(node, struct wc_entry, rb_node);
if (read_original_sector(wc, e) == block)
break;
+
node = (read_original_sector(wc, e) >= block ?
e->rb_node.rb_left : e->rb_node.rb_right);
if (unlikely(!node)) {
- if (!(flags & WFE_RETURN_FOLLOWING)) {
+ if (!(flags & WFE_RETURN_FOLLOWING))
return NULL;
- }
if (read_original_sector(wc, e) >= block) {
- break;
+ return e;
} else {
node = rb_next(&e->rb_node);
- if (unlikely(!node)) {
+ if (unlikely(!node))
return NULL;
- }
e = container_of(node, struct wc_entry, rb_node);
- break;
+ return e;
}
}
}
@@ -574,7 +568,7 @@
node = rb_prev(&e->rb_node);
else
node = rb_next(&e->rb_node);
- if (!node)
+ if (unlikely(!node))
return e;
e2 = container_of(node, struct wc_entry, rb_node);
if (read_original_sector(wc, e2) != block)
@@ -732,7 +726,8 @@
}
writecache_commit_flushed(wc);
- writecache_wait_for_ios(wc, WRITE);
+ if (!WC_MODE_PMEM(wc))
+ writecache_wait_for_ios(wc, WRITE);
wc->seq_count++;
pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
@@ -807,7 +802,7 @@
writecache_free_entry(wc, e);
}
- if (!node)
+ if (unlikely(!node))
break;
e = container_of(node, struct wc_entry, rb_node);
@@ -1481,10 +1476,9 @@
bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
wb = container_of(bio, struct writeback_struct, bio);
wb->wc = wc;
- wb->bio.bi_end_io = writecache_writeback_endio;
- bio_set_dev(&wb->bio, wc->dev->bdev);
- wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
- wb->page_offset = PAGE_SIZE;
+ bio->bi_end_io = writecache_writeback_endio;
+ bio_set_dev(bio, wc->dev->bdev);
+ bio->bi_iter.bi_sector = read_original_sector(wc, e);
if (max_pages <= WB_LIST_INLINE ||
unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
GFP_NOIO | __GFP_NORETRY |
@@ -1510,12 +1504,12 @@
wb->wc_list[wb->wc_list_n++] = f;
e = f;
}
- bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
if (writecache_has_error(wc)) {
bio->bi_status = BLK_STS_IOERR;
- bio_endio(&wb->bio);
+ bio_endio(bio);
} else {
- submit_bio(&wb->bio);
+ submit_bio(bio);
}
__writeback_throttle(wc, wbl);
@@ -1567,7 +1561,7 @@
{
struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
struct blk_plug plug;
- struct wc_entry *e, *f, *g;
+ struct wc_entry *f, *g, *e = NULL;
struct rb_node *node, *next_node;
struct list_head skipped;
struct writeback_list wbl;
@@ -1604,7 +1598,14 @@
break;
}
- e = container_of(wc->lru.prev, struct wc_entry, lru);
+ if (unlikely(wc->writeback_all)) {
+ if (unlikely(!e)) {
+ writecache_flush(wc);
+ e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
+ } else
+ e = g;
+ } else
+ e = container_of(wc->lru.prev, struct wc_entry, lru);
BUG_ON(e->write_in_progress);
if (unlikely(!writecache_entry_is_committed(wc, e))) {
writecache_flush(wc);
@@ -1635,8 +1636,8 @@
if (unlikely(!next_node))
break;
g = container_of(next_node, struct wc_entry, rb_node);
- if (read_original_sector(wc, g) ==
- read_original_sector(wc, f)) {
+ if (unlikely(read_original_sector(wc, g) ==
+ read_original_sector(wc, f))) {
f = g;
continue;
}
@@ -1665,8 +1666,14 @@
g->wc_list_contiguous = BIO_MAX_PAGES;
f = g;
e->wc_list_contiguous++;
- if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
+ if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
+ if (unlikely(wc->writeback_all)) {
+ next_node = rb_next(&f->rb_node);
+ if (likely(next_node))
+ g = container_of(next_node, struct wc_entry, rb_node);
+ }
break;
+ }
}
cond_resched();
}
@@ -1862,7 +1869,7 @@
goto bad;
}
- wc->writeback_wq = alloc_workqueue("writecache-writeabck", WQ_MEM_RECLAIM, 1);
+ wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
if (!wc->writeback_wq) {
r = -ENOMEM;
ti->error = "Could not allocate writeback workqueue";
@@ -2064,7 +2071,7 @@
if (IS_ERR(wc->flush_thread)) {
r = PTR_ERR(wc->flush_thread);
wc->flush_thread = NULL;
- ti->error = "Couldn't spawn endio thread";
+ ti->error = "Couldn't spawn flush thread";
goto bad;
}
wake_up_process(wc->flush_thread);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index fa68336..595a731 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -8,6 +9,7 @@
#include <linux/module.h>
#include <linux/crc32.h>
+#include <linux/sched/mm.h>
#define DM_MSG_PREFIX "zoned metadata"
@@ -33,7 +35,7 @@
* (1) Super block (1 block)
* (2) Chunk mapping table (nr_map_blocks)
* (3) Bitmap blocks (nr_bitmap_blocks)
- * All metadata blocks are stored in conventional zones, starting from the
+ * All metadata blocks are stored in conventional zones, starting from
* the first conventional zone found on disk.
*/
struct dmz_super {
@@ -232,7 +234,7 @@
* Lock/unlock metadata access. This is a "read" lock on a semaphore
* that prevents metadata flush from running while metadata are being
* modified. The actual metadata write mutual exclusion is achieved with
- * the map lock and zone styate management (active and reclaim state are
+ * the map lock and zone state management (active and reclaim state are
* mutually exclusive).
*/
void dmz_lock_metadata(struct dmz_metadata *zmd)
@@ -401,15 +403,18 @@
sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return ERR_PTR(-EIO);
+
/* Get a new block and a BIO to read it */
mblk = dmz_alloc_mblock(zmd, mblk_no);
if (!mblk)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
dmz_free_mblock(zmd, mblk);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
spin_lock(&zmd->mblk_lock);
@@ -540,8 +545,8 @@
if (!mblk) {
/* Cache miss: read the block from disk */
mblk = dmz_get_mblock_slow(zmd, mblk_no);
- if (!mblk)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(mblk))
+ return mblk;
}
/* Wait for on-going read I/O and check for error */
@@ -569,16 +574,19 @@
/*
* Issue a metadata block write BIO.
*/
-static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
- unsigned int set)
+static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
+ unsigned int set)
{
sector_t block = zmd->sb[set].block + mblk->no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
set_bit(DMZ_META_ERROR, &mblk->state);
- return;
+ return -ENOMEM;
}
set_bit(DMZ_META_WRITING, &mblk->state);
@@ -590,6 +598,8 @@
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
submit_bio(bio);
+
+ return 0;
}
/*
@@ -601,6 +611,9 @@
struct bio *bio;
int ret;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio)
return -ENOMEM;
@@ -658,22 +671,29 @@
{
struct dmz_mblock *mblk;
struct blk_plug plug;
- int ret = 0;
+ int ret = 0, nr_mblks_submitted = 0;
/* Issue writes */
blk_start_plug(&plug);
- list_for_each_entry(mblk, write_list, link)
- dmz_write_mblock(zmd, mblk, set);
+ list_for_each_entry(mblk, write_list, link) {
+ ret = dmz_write_mblock(zmd, mblk, set);
+ if (ret)
+ break;
+ nr_mblks_submitted++;
+ }
blk_finish_plug(&plug);
/* Wait for completion */
list_for_each_entry(mblk, write_list, link) {
+ if (!nr_mblks_submitted)
+ break;
wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
clear_bit(DMZ_META_ERROR, &mblk->state);
ret = -EIO;
}
+ nr_mblks_submitted--;
}
/* Flush drive cache (this will also sync data) */
@@ -735,6 +755,11 @@
*/
dmz_lock_flush(zmd);
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ ret = -EIO;
+ goto out;
+ }
+
/* Get dirty blocks */
spin_lock(&zmd->mblk_lock);
list_splice_init(&zmd->mblk_dirty_list, &write_list);
@@ -1162,13 +1187,15 @@
while (sector < dev->capacity) {
/* Get zone information */
nr_blkz = DMZ_REPORT_NR_ZONES;
- ret = blkdev_report_zones(dev->bdev, sector, blkz,
- &nr_blkz, GFP_KERNEL);
+ ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz);
if (ret) {
dmz_dev_err(dev, "Report zones failed %d", ret);
goto out;
}
+ if (!nr_blkz)
+ break;
+
/* Process report */
for (i = 0; i < nr_blkz; i++) {
ret = dmz_init_zone(zmd, zone, &blkz[i]);
@@ -1198,12 +1225,22 @@
static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
unsigned int nr_blkz = 1;
+ unsigned int noio_flag;
struct blk_zone blkz;
int ret;
- /* Get zone information from disk */
+ /*
+ * Get zone information from disk. Since blkdev_report_zones() uses
+ * GFP_KERNEL by default for memory allocations, set the per-task
+ * PF_MEMALLOC_NOIO flag so that all allocations are done as if
+ * GFP_NOIO was specified.
+ */
+ noio_flag = memalloc_noio_save();
ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
- &blkz, &nr_blkz, GFP_NOIO);
+ &blkz, &nr_blkz);
+ memalloc_noio_restore(noio_flag);
+ if (!nr_blkz)
+ ret = -EIO;
if (ret) {
dmz_dev_err(zmd->dev, "Get zone %u report failed",
dmz_id(zmd, zone));
@@ -1529,7 +1566,7 @@
struct dm_zone *zone;
if (list_empty(&zmd->map_rnd_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_rnd_list, link) {
if (dmz_is_buf(zone))
@@ -1540,7 +1577,7 @@
return dzone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1551,7 +1588,7 @@
struct dm_zone *zone;
if (list_empty(&zmd->map_seq_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_seq_list, link) {
if (!zone->bzone)
@@ -1560,7 +1597,7 @@
return zone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1589,30 +1626,6 @@
}
/*
- * Activate a zone (increment its reference count).
- */
-void dmz_activate_zone(struct dm_zone *zone)
-{
- set_bit(DMZ_ACTIVE, &zone->flags);
- atomic_inc(&zone->refcount);
-}
-
-/*
- * Deactivate a zone. This decrement the zone reference counter
- * and clears the active state of the zone once the count reaches 0,
- * indicating that all BIOs to the zone have completed. Returns
- * true if the zone was deactivated.
- */
-void dmz_deactivate_zone(struct dm_zone *zone)
-{
- if (atomic_dec_and_test(&zone->refcount)) {
- WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags));
- clear_bit_unlock(DMZ_ACTIVE, &zone->flags);
- smp_mb__after_atomic();
- }
-}
-
-/*
* Get the zone mapping a chunk, if the chunk is mapped already.
* If no mapping exist and the operation is WRITE, a zone is
* allocated and used to map the chunk.
@@ -1639,9 +1652,13 @@
if (op != REQ_OP_WRITE)
goto out;
- /* Alloate a random zone */
+ /* Allocate a random zone */
dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!dzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ dzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
@@ -1736,9 +1753,13 @@
if (bzone)
goto out;
- /* Alloate a random zone */
+ /* Allocate a random zone */
bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!bzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ bzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index edf4b95..d240d7c 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -37,7 +38,7 @@
/*
* Number of seconds of target BIO inactivity to consider the target idle.
*/
-#define DMZ_IDLE_PERIOD (10UL * HZ)
+#define DMZ_IDLE_PERIOD (10UL * HZ)
/*
* Percentage of unmapped (free) random zones below which reclaim starts
@@ -134,6 +135,9 @@
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
while (block < end_block) {
+ if (dev->flags & DMZ_BDEV_DYING)
+ return -EIO;
+
/* Get a valid region from the source zone */
ret = dmz_first_valid_block(zmd, src_zone, &block);
if (ret <= 0)
@@ -215,7 +219,7 @@
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -259,7 +263,7 @@
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -312,7 +316,7 @@
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -334,7 +338,7 @@
/*
* Find a candidate zone for reclaim and process it.
*/
-static void dmz_reclaim(struct dmz_reclaim *zrc)
+static int dmz_do_reclaim(struct dmz_reclaim *zrc)
{
struct dmz_metadata *zmd = zrc->metadata;
struct dm_zone *dzone;
@@ -344,8 +348,8 @@
/* Get a data zone */
dzone = dmz_get_zone_for_reclaim(zmd);
- if (!dzone)
- return;
+ if (IS_ERR(dzone))
+ return PTR_ERR(dzone);
start = jiffies;
@@ -391,13 +395,20 @@
out:
if (ret) {
dmz_unlock_zone_reclaim(dzone);
- return;
+ return ret;
}
- (void) dmz_flush_metadata(zrc->metadata);
+ ret = dmz_flush_metadata(zrc->metadata);
+ if (ret) {
+ dmz_dev_debug(zrc->dev,
+ "Metadata flush for zone %u failed, err %d\n",
+ dmz_id(zmd, rzone), ret);
+ return ret;
+ }
dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
+ return 0;
}
/*
@@ -427,7 +438,7 @@
return false;
/*
- * If the percentage of unmappped random zones is low,
+ * If the percentage of unmapped random zones is low,
* reclaim even if the target is busy.
*/
return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND;
@@ -442,6 +453,10 @@
struct dmz_metadata *zmd = zrc->metadata;
unsigned int nr_rnd, nr_unmap_rnd;
unsigned int p_unmap_rnd;
+ int ret;
+
+ if (dmz_bdev_is_dying(zrc->dev))
+ return;
if (!dmz_should_reclaim(zrc)) {
mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
@@ -471,7 +486,17 @@
(dmz_target_idle(zrc) ? "Idle" : "Busy"),
p_unmap_rnd, nr_unmap_rnd, nr_rnd);
- dmz_reclaim(zrc);
+ ret = dmz_do_reclaim(zrc);
+ if (ret) {
+ dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
+ if (ret == -EIO)
+ /*
+ * LLD might be performing some error handling sequence
+ * at the underlying device. To not interfere, do not
+ * attempt to schedule the next reclaim run immediately.
+ */
+ return;
+ }
dmz_schedule_reclaim(zrc);
}
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 85fb2ba..d3bcc41 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -19,7 +20,7 @@
struct dmz_target *target;
struct dm_zone *zone;
struct bio *bio;
- atomic_t ref;
+ refcount_t ref;
};
/*
@@ -27,7 +28,7 @@
*/
struct dm_chunk_work {
struct work_struct work;
- atomic_t refcount;
+ refcount_t refcount;
struct dmz_target *target;
unsigned int chunk;
struct bio_list bio_list;
@@ -80,7 +81,7 @@
if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
bio->bi_status = status;
- if (atomic_dec_and_test(&bioctx->ref)) {
+ if (refcount_dec_and_test(&bioctx->ref)) {
struct dm_zone *zone = bioctx->zone;
if (zone) {
@@ -131,7 +132,7 @@
bio_advance(bio, clone->bi_iter.bi_size);
- atomic_inc(&bioctx->ref);
+ refcount_inc(&bioctx->ref);
generic_make_request(clone);
if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
@@ -277,8 +278,8 @@
/* Get the buffer zone. One will be allocated if needed */
bzone = dmz_get_chunk_buffer(zmd, zone);
- if (!bzone)
- return -ENOSPC;
+ if (IS_ERR(bzone))
+ return PTR_ERR(bzone);
if (dmz_is_readonly(bzone))
return -EROFS;
@@ -389,6 +390,11 @@
dmz_lock_metadata(zmd);
+ if (dmz->dev->flags & DMZ_BDEV_DYING) {
+ ret = -EIO;
+ goto out;
+ }
+
/*
* Get the data zone mapping the chunk. There may be no
* mapping for read and discard. If a mapping is obtained,
@@ -441,7 +447,7 @@
*/
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
- atomic_inc(&cw->refcount);
+ refcount_inc(&cw->refcount);
}
/*
@@ -450,7 +456,7 @@
*/
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
- if (atomic_dec_and_test(&cw->refcount)) {
+ if (refcount_dec_and_test(&cw->refcount)) {
WARN_ON(!bio_list_empty(&cw->bio_list));
radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
kfree(cw);
@@ -493,6 +499,8 @@
/* Flush dirty metadata blocks */
ret = dmz_flush_metadata(dmz->metadata);
+ if (ret)
+ dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
/* Process queued flush requests */
while (1) {
@@ -513,25 +521,27 @@
* Get a chunk work and start it to process a new BIO.
* If the BIO chunk has no work yet, create one.
*/
-static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
+static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
struct dm_chunk_work *cw;
+ int ret = 0;
mutex_lock(&dmz->chunk_lock);
/* Get the BIO chunk work. If one is not active yet, create one */
cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
if (!cw) {
- int ret;
/* Create a new chunk work */
cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
- if (!cw)
+ if (unlikely(!cw)) {
+ ret = -ENOMEM;
goto out;
+ }
INIT_WORK(&cw->work, dmz_chunk_work);
- atomic_set(&cw->refcount, 0);
+ refcount_set(&cw->refcount, 0);
cw->target = dmz;
cw->chunk = chunk;
bio_list_init(&cw->bio_list);
@@ -539,7 +549,6 @@
ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
if (unlikely(ret)) {
kfree(cw);
- cw = NULL;
goto out;
}
}
@@ -547,10 +556,38 @@
bio_list_add(&cw->bio_list, bio);
dmz_get_chunk_work(cw);
+ dmz_reclaim_bio_acc(dmz->reclaim);
if (queue_work(dmz->chunk_wq, &cw->work))
dmz_get_chunk_work(cw);
out:
mutex_unlock(&dmz->chunk_lock);
+ return ret;
+}
+
+/*
+ * Check the backing device availability. If it's on the way out,
+ * start failing I/O. Reclaim and metadata components also call this
+ * function to cleanly abort operation in the event of such failure.
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
+{
+ struct gendisk *disk;
+
+ if (!(dmz_dev->flags & DMZ_BDEV_DYING)) {
+ disk = dmz_dev->bdev->bd_disk;
+ if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
+ dmz_dev_warn(dmz_dev, "Backing device queue dying");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ } else if (disk->fops->check_events) {
+ if (disk->fops->check_events(disk, 0) &
+ DISK_EVENT_MEDIA_CHANGE) {
+ dmz_dev_warn(dmz_dev, "Backing device offline");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ }
+ }
+ }
+
+ return dmz_dev->flags & DMZ_BDEV_DYING;
}
/*
@@ -564,6 +601,10 @@
sector_t sector = bio->bi_iter.bi_sector;
unsigned int nr_sectors = bio_sectors(bio);
sector_t chunk_sector;
+ int ret;
+
+ if (dmz_bdev_is_dying(dmz->dev))
+ return DM_MAPIO_KILL;
dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
bio_op(bio), (unsigned long long)sector, nr_sectors,
@@ -584,7 +625,7 @@
bioctx->target = dmz;
bioctx->zone = NULL;
bioctx->bio = bio;
- atomic_set(&bioctx->ref, 1);
+ refcount_set(&bioctx->ref, 1);
/* Set the BIO pending in the flush list */
if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
@@ -601,8 +642,14 @@
dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
/* Now ready to handle this BIO */
- dmz_reclaim_bio_acc(dmz->reclaim);
- dmz_queue_chunk_work(dmz, bio);
+ ret = dmz_queue_chunk_work(dmz, bio);
+ if (ret) {
+ dmz_dev_debug(dmz->dev,
+ "BIO op %d, can't process chunk %llu, err %i\n",
+ bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio),
+ ret);
+ return DM_MAPIO_REQUEUE;
+ }
return DM_MAPIO_SUBMITTED;
}
@@ -643,7 +690,8 @@
q = bdev_get_queue(dev->bdev);
dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
- aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1);
+ aligned_capacity = dev->capacity &
+ ~((sector_t)blk_queue_zone_sectors(q) - 1);
if (ti->begin ||
((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
ti->error = "Partial mapping not supported";
@@ -657,8 +705,7 @@
dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
- dev->nr_zones = (dev->capacity + dev->zone_nr_sectors - 1)
- >> dev->zone_nr_sectors_shift;
+ dev->nr_zones = blkdev_nr_zones(dev->bdev);
dmz->dev = dev;
@@ -728,7 +775,6 @@
ti->per_io_data_size = sizeof(struct dmz_bioctx);
ti->flush_supported = true;
ti->discards_supported = true;
- ti->split_discard_bios = true;
/* The exposed capacity is the number of chunks that can be mapped */
ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
@@ -856,6 +902,9 @@
{
struct dmz_target *dmz = ti->private;
+ if (dmz_bdev_is_dying(dmz->dev))
+ return -ENODEV;
+
*bdev = dmz->dev->bdev;
return 0;
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index 12419f0..d8e70b0 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -56,6 +57,8 @@
unsigned int nr_zones;
+ unsigned int flags;
+
sector_t zone_nr_sectors;
unsigned int zone_nr_sectors_shift;
@@ -67,6 +70,9 @@
(dev)->zone_nr_sectors_shift)
#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1))
+/* Device flags. */
+#define DMZ_BDEV_DYING (1 << 0)
+
/*
* Zone descriptor.
*/
@@ -115,7 +121,6 @@
DMZ_BUF,
/* Zone internal state */
- DMZ_ACTIVE,
DMZ_RECLAIM,
DMZ_SEQ_WRITE_ERR,
};
@@ -128,7 +133,6 @@
#define dmz_is_empty(z) ((z)->wp_block == 0)
#define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags)
#define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags)
-#define dmz_is_active(z) test_bit(DMZ_ACTIVE, &(z)->flags)
#define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags)
#define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags)
@@ -188,8 +192,30 @@
unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
-void dmz_activate_zone(struct dm_zone *zone);
-void dmz_deactivate_zone(struct dm_zone *zone);
+/*
+ * Activate a zone (increment its reference count).
+ */
+static inline void dmz_activate_zone(struct dm_zone *zone)
+{
+ atomic_inc(&zone->refcount);
+}
+
+/*
+ * Deactivate a zone. This decrements the zone reference counter; a count
+ * of 0 indicates that all BIOs to the zone have completed.
+ */
+static inline void dmz_deactivate_zone(struct dm_zone *zone)
+{
+ atomic_dec(&zone->refcount);
+}
+
+/*
+ * Test if a zone is active, that is, has a refcount > 0.
+ */
+static inline bool dmz_is_active(struct dm_zone *zone)
+{
+ return atomic_read(&zone->refcount);
+}
int dmz_lock_zone_reclaim(struct dm_zone *zone);
void dmz_unlock_zone_reclaim(struct dm_zone *zone);
@@ -225,4 +251,9 @@
void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
+/*
+ * Functions defined in dm-zoned-target.c
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
+
#endif /* DM_ZONED_H */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 07d2949..1a5e328 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -158,9 +158,6 @@
struct dm_dev dm_dev;
};
-static struct kmem_cache *_rq_tio_cache;
-static struct kmem_cache *_rq_cache;
-
/*
* Bio-based DM's mempools' reserved IOs set by the user.
*/
@@ -222,20 +219,11 @@
static int __init local_init(void)
{
- int r = -ENOMEM;
-
- _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
- if (!_rq_tio_cache)
- return r;
-
- _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
- __alignof__(struct request), 0, NULL);
- if (!_rq_cache)
- goto out_free_rq_tio_cache;
+ int r;
r = dm_uevent_init();
if (r)
- goto out_free_rq_cache;
+ return r;
deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
if (!deferred_remove_workqueue) {
@@ -257,10 +245,6 @@
destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
dm_uevent_exit();
-out_free_rq_cache:
- kmem_cache_destroy(_rq_cache);
-out_free_rq_tio_cache:
- kmem_cache_destroy(_rq_tio_cache);
return r;
}
@@ -270,8 +254,6 @@
flush_scheduled_work();
destroy_workqueue(deferred_remove_workqueue);
- kmem_cache_destroy(_rq_cache);
- kmem_cache_destroy(_rq_tio_cache);
unregister_blkdev(_major, _name);
dm_uevent_exit();
@@ -458,6 +440,55 @@
return dm_get_geometry(md, geo);
}
+/*
+ * Report zones of a mapped device (installed as the ->report_zones method
+ * of dm_blk_dops).
+ *
+ * Looks up the target that maps @sector in the live table and forwards the
+ * request to that target's report_zones method.
+ *
+ * Returns the target's result, -EAGAIN if the device is suspended, -EIO if
+ * there is no live table, no target maps @sector or the target has no
+ * report_zones method, and -ENOTSUPP when CONFIG_BLK_DEV_ZONED is not set.
+ */
+static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
+ struct blk_zone *zones, unsigned int *nr_zones)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+ struct mapped_device *md = disk->private_data;
+ struct dm_target *tgt;
+ struct dm_table *map;
+ int srcu_idx, ret;
+
+ if (dm_suspended_md(md))
+ return -EAGAIN;
+
+ /*
+ * dm_get_live_table() enters the SRCU read-side section even when it
+ * returns NULL, so every exit past this point must go through
+ * dm_put_live_table() at the out label.
+ */
+ map = dm_get_live_table(md, &srcu_idx);
+ if (!map) {
+ ret = -EIO;
+ goto out;
+ }
+
+ tgt = dm_table_find_target(map, sector);
+ if (!tgt) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * If we are executing this, we already know that the block device
+ * is a zoned device and so each target should have support for that
+ * type of drive. A missing report_zones method means that the target
+ * driver has a problem.
+ */
+ if (WARN_ON(!tgt->type->report_zones)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * blkdev_report_zones() will loop and call this again to cover all the
+ * zones of the target, eventually moving on to the next target.
+ * So there is no need to loop here trying to fill the entire array
+ * of zones.
+ */
+ ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
+
+out:
+ dm_put_live_table(md, srcu_idx);
+ return ret;
+#else
+ return -ENOTSUPP;
+#endif
+}
+
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
struct block_device **bdev)
__acquires(md->io_barrier)
@@ -595,26 +626,38 @@
bio_put(&tio->clone);
}
-int md_in_flight(struct mapped_device *md)
+static bool md_in_flight_bios(struct mapped_device *md)
{
- return atomic_read(&md->pending[READ]) +
- atomic_read(&md->pending[WRITE]);
+ int cpu;
+ struct hd_struct *part = &dm_disk(md)->part0;
+ long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+ sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+ }
+
+ return sum != 0;
+}
+
+static bool md_in_flight(struct mapped_device *md)
+{
+ if (queue_is_mq(md->queue))
+ return blk_mq_queue_inflight(md->queue);
+ else
+ return md_in_flight_bios(md);
}
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
- int rw = bio_data_dir(bio);
io->start_time = jiffies;
generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
&dm_disk(md)->part0);
- atomic_set(&dm_disk(md)->part0.in_flight[rw],
- atomic_inc_return(&md->pending[rw]));
-
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -626,8 +669,6 @@
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
unsigned long duration = jiffies - io->start_time;
- int pending;
- int rw = bio_data_dir(bio);
generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
io->start_time);
@@ -637,16 +678,8 @@
bio->bi_iter.bi_sector, bio_sectors(bio),
true, duration, &io->stats_aux);
- /*
- * After this is decremented the bio must not be touched if it is
- * a flush.
- */
- pending = atomic_dec_return(&md->pending[rw]);
- atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
- pending += atomic_read(&md->pending[rw^0x1]);
-
/* nudge anyone waiting on suspend queue */
- if (!pending)
+ if (unlikely(wq_has_sleeper(&md->wait)))
wake_up(&md->wait);
}
@@ -746,7 +779,8 @@
}
static struct table_device *find_table_device(struct list_head *l, dev_t dev,
- fmode_t mode) {
+ fmode_t mode)
+{
struct table_device *td;
list_for_each_entry(td, l, list)
@@ -757,7 +791,8 @@
}
int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
- struct dm_dev **result) {
+ struct dm_dev **result)
+{
int r;
struct table_device *td;
@@ -910,6 +945,15 @@
}
}
+/*
+ * Turn off DISCARD support for @md: zero the max_discard_sectors queue
+ * limit and clear QUEUE_FLAG_DISCARD on the device's queue.
+ * Called from the clone end_io path when a REQ_OP_DISCARD bio fails with
+ * BLK_STS_TARGET and the queue it was sent to reports no discard sectors.
+ */
+void disable_discard(struct mapped_device *md)
+{
+ struct queue_limits *limits = dm_get_queue_limits(md);
+
+ /* device doesn't really support DISCARD, disable it */
+ limits->max_discard_sectors = 0;
+ blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
+}
+
void disable_write_same(struct mapped_device *md)
{
struct queue_limits *limits = dm_get_queue_limits(md);
@@ -935,11 +979,14 @@
dm_endio_fn endio = tio->ti->type->end_io;
if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
- if (bio_op(bio) == REQ_OP_WRITE_SAME &&
- !bio->bi_disk->queue->limits.max_write_same_sectors)
+ if (bio_op(bio) == REQ_OP_DISCARD &&
+ !bio->bi_disk->queue->limits.max_discard_sectors)
+ disable_discard(md);
+ else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+ !bio->bi_disk->queue->limits.max_write_same_sectors)
disable_write_same(md);
- if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
- !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
+ else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+ !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
disable_write_zeroes(md);
}
@@ -1007,15 +1054,7 @@
return -EINVAL;
}
- /*
- * BIO based queue uses its own splitting. When multipage bvecs
- * is switched on, size of the incoming bio may be too big to
- * be handled in some targets, such as crypt.
- *
- * When these targets are ready for the big bio, we can remove
- * the limit.
- */
- ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);
+ ti->max_io_len = (uint32_t) len;
return 0;
}
@@ -1033,7 +1072,7 @@
return NULL;
ti = dm_table_find_target(map, sector);
- if (!dm_target_is_valid(ti))
+ if (!ti)
return NULL;
return ti;
@@ -1066,6 +1105,25 @@
return ret;
}
+/*
+ * ->dax_supported method of dm_dax_ops: report whether the live table of
+ * the mapped device behind @dax_dev supports DAX for @blocksize.
+ *
+ * Returns the result of dm_table_supports_dax() on the live table, or
+ * false when the device has no live table. @bdev, @start and @len are not
+ * used here.
+ */
+static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
+ int blocksize, sector_t start, sector_t len)
+{
+ struct mapped_device *md = dax_get_private(dax_dev);
+ struct dm_table *map;
+ int srcu_idx;
+ bool ret = false;
+
+ /*
+ * dm_get_live_table() enters the SRCU read-side section even when it
+ * returns NULL, so dm_put_live_table() must run on every path.
+ */
+ map = dm_get_live_table(md, &srcu_idx);
+ if (map)
+ ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
+
+ dm_put_live_table(md, srcu_idx);
+
+ return ret;
+}
+
static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
@@ -1155,93 +1213,49 @@
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
/*
- * The zone descriptors obtained with a zone report indicate zone positions
- * within the target backing device, regardless of that device is a partition
- * and regardless of the target mapping start sector on the device or partition.
- * The zone descriptors start sector and write pointer position must be adjusted
- * to match their relative position within the dm device.
- * A target may call dm_remap_zone_report() after completion of a
- * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained from the
- * backing device.
+ * The zone descriptors obtained with a zone report indicate
+ * zone positions within the underlying device of the target. The zone
+ * descriptors must be remapped to match their position within the dm device.
+ * The caller target should obtain the zones information using
+ * blkdev_report_zones() to ensure that remapping for partition offset is
+ * already handled.
*/
-void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
+void dm_remap_zone_report(struct dm_target *ti, sector_t start,
+ struct blk_zone *zones, unsigned int *nr_zones)
{
#ifdef CONFIG_BLK_DEV_ZONED
- struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
- struct bio *report_bio = tio->io->orig_bio;
- struct blk_zone_report_hdr *hdr = NULL;
struct blk_zone *zone;
- unsigned int nr_rep = 0;
- unsigned int ofst;
- sector_t part_offset;
- struct bio_vec bvec;
- struct bvec_iter iter;
- void *addr;
-
- if (bio->bi_status)
- return;
+ unsigned int nrz = *nr_zones;
+ int i;
/*
- * bio sector was incremented by the request size on completion. Taking
- * into account the original request sector, the target start offset on
- * the backing device and the target mapping offset (ti->begin), the
- * start sector of the backing device. The partition offset is always 0
- * if the target uses a whole device.
+ * Remap the start sector and write pointer position of the zones in
+ * the array. Since we may have obtained more zones from the target's
+ * underlying device than the target size covers, also adjust the
+ * number of zones.
*/
- part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
-
- /*
- * Remap the start sector of the reported zones. For sequential zones,
- * also remap the write pointer position.
- */
- bio_for_each_segment(bvec, report_bio, iter) {
- addr = kmap_atomic(bvec.bv_page);
-
- /* Remember the report header in the first page */
- if (!hdr) {
- hdr = addr;
- ofst = sizeof(struct blk_zone_report_hdr);
- } else
- ofst = 0;
-
- /* Set zones start sector */
- while (hdr->nr_zones && ofst < bvec.bv_len) {
- zone = addr + ofst;
- zone->start -= part_offset;
- if (zone->start >= start + ti->len) {
- hdr->nr_zones = 0;
- break;
- }
- zone->start = zone->start + ti->begin - start;
- if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
- if (zone->cond == BLK_ZONE_COND_FULL)
- zone->wp = zone->start + zone->len;
- else if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->wp = zone->start;
- else
- zone->wp = zone->wp + ti->begin - start - part_offset;
- }
- ofst += sizeof(struct blk_zone);
- hdr->nr_zones--;
- nr_rep++;
+ for (i = 0; i < nrz; i++) {
+ zone = zones + i;
+ if (zone->start >= start + ti->len) {
+ memset(zone, 0, sizeof(struct blk_zone) * (nrz - i));
+ break;
}
- if (addr != hdr)
- kunmap_atomic(addr);
+ zone->start = zone->start + ti->begin - start;
+ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+ continue;
- if (!hdr->nr_zones)
- break;
+ if (zone->cond == BLK_ZONE_COND_FULL)
+ zone->wp = zone->start + zone->len;
+ else if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->wp = zone->start;
+ else
+ zone->wp = zone->wp + ti->begin - start;
}
- if (hdr) {
- hdr->nr_zones = nr_rep;
- kunmap_atomic(hdr);
- }
-
- bio_advance(report_bio, report_bio->bi_iter.bi_size);
-
+ *nr_zones = i;
#else /* !CONFIG_BLK_DEV_ZONED */
- bio->bi_status = BLK_STS_NOTSUPP;
+ *nr_zones = 0;
#endif
}
EXPORT_SYMBOL_GPL(dm_remap_zone_report);
@@ -1311,7 +1325,7 @@
__bio_clone_fast(clone, bio);
- if (unlikely(bio_integrity(bio) != NULL)) {
+ if (bio_integrity(bio)) {
int r;
if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
@@ -1327,11 +1341,10 @@
return r;
}
- if (bio_op(bio) != REQ_OP_ZONE_REPORT)
- bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+ bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
clone->bi_iter.bi_size = to_bytes(len);
- if (unlikely(bio_integrity(bio) != NULL))
+ if (bio_integrity(bio))
bio_integrity_trim(clone);
return 0;
@@ -1411,10 +1424,21 @@
unsigned target_nr = 0;
struct dm_target *ti;
+ /*
+ * Empty flush uses a statically initialized bio, as the base for
+ * cloning. However, blkg association requires that a bdev is
+ * associated with a gendisk, which doesn't happen until the bdev is
+ * opened. So, blkg association is done at issue time of the flush
+ * rather than when the device is created in alloc_dev().
+ */
+ bio_set_dev(ci->bio, ci->io->md->bdev);
+
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+ bio_disassociate_blkg(ci->bio);
+
return 0;
}
@@ -1459,19 +1483,10 @@
return ti->num_write_zeroes_bios;
}
-typedef bool (*is_split_required_fn)(struct dm_target *ti);
-
-static bool is_split_required_for_discard(struct dm_target *ti)
-{
- return ti->split_discard_bios;
-}
-
static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
- get_num_bios_fn get_num_bios,
- is_split_required_fn is_split_required)
+ unsigned num_bios)
{
unsigned len;
- unsigned num_bios;
/*
* Even though the device advertised support for this type of
@@ -1479,14 +1494,10 @@
* reconfiguration might also have changed that since the
* check was performed.
*/
- num_bios = get_num_bios ? get_num_bios(ti) : 0;
if (!num_bios)
return -EOPNOTSUPP;
- if (is_split_required && !is_split_required(ti))
- len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
- else
- len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
+ len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
__send_duplicate_bios(ci, ti, num_bios, &len);
@@ -1498,23 +1509,38 @@
static int __send_discard(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, ti, get_num_discard_bios,
- is_split_required_for_discard);
+ return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti));
}
static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
+ return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti));
}
static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
+ return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti));
}
static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
{
- return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
+ return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti));
+}
+
+/*
+ * Return true if @bio is one of the "abnormal" operations handled by the
+ * __send_discard/__send_secure_erase/__send_write_same/__send_write_zeroes
+ * helpers (discard, secure erase, write same, write zeroes); false for
+ * every other operation.
+ */
+static bool is_abnormal_io(struct bio *bio)
+{
+ bool r = false;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE_ZEROES:
+ r = true;
+ break;
+ }
+
+ return r;
}
static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
@@ -1541,23 +1567,18 @@
*/
static int __split_and_process_non_flush(struct clone_info *ci)
{
- struct bio *bio = ci->bio;
struct dm_target *ti;
unsigned len;
int r;
ti = dm_table_find_target(ci->map, ci->sector);
- if (!dm_target_is_valid(ti))
+ if (!ti)
return -EIO;
- if (unlikely(__process_abnormal_io(ci, ti, &r)))
+ if (__process_abnormal_io(ci, ti, &r))
return r;
- if (bio_op(bio) == REQ_OP_ZONE_REPORT)
- len = ci->sector_count;
- else
- len = min_t(sector_t, max_io_len(ci->sector, ti),
- ci->sector_count);
+ len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
if (r < 0)
@@ -1577,6 +1598,9 @@
ci->sector = bio->bi_iter.bi_sector;
}
+#define __dm_part_stat_sub(part, field, subnd) \
+ (part_stat_get(part, field) -= (subnd))
+
/*
* Entry point to split a bio into clones and submit them to the targets.
*/
@@ -1587,17 +1611,19 @@
blk_qc_t ret = BLK_QC_T_NONE;
int error = 0;
- if (unlikely(!map)) {
- bio_io_error(bio);
- return ret;
- }
-
- blk_queue_split(md->queue, &bio);
-
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- ci.bio = &ci.io->md->flush_bio;
+ struct bio flush_bio;
+
+ /*
+ * Use an on-stack bio for this, it's safe since we don't
+ * need to reference it after submit. It's just used as
+ * the basis for the clone(s).
+ */
+ bio_init(&flush_bio, NULL, 0);
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1618,14 +1644,25 @@
* We take a clone of the original to store in
* ci.io->orig_bio to be used by end_io_acct() and
* for dec_pending to use for completion handling.
- * As this path is not used for REQ_OP_ZONE_REPORT,
- * the usage of io->orig_bio in dm_remap_zone_report()
- * won't be affected by this reassignment.
*/
struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
GFP_NOIO, &md->queue->bio_split);
ci.io->orig_bio = b;
+
+ /*
+ * Adjust IO stats for each split, otherwise upon queue
+ * reentry there will be redundant IO accounting.
+ * NOTE: this is a stop-gap fix, a proper fix involves
+ * significant refactoring of DM core's bio splitting
+ * (by eliminating DM's splitting and just using bio_split)
+ */
+ part_stat_lock();
+ __dm_part_stat_sub(&dm_disk(md)->part0,
+ sectors[op_stat_group(bio_op(bio))], ci.sector_count);
+ part_stat_unlock();
+
bio_chain(b, bio);
+ trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
ret = generic_make_request(bio);
break;
}
@@ -1641,41 +1678,35 @@
* Optimized variant of __split_and_process_bio that leverages the
* fact that targets that use it do _not_ have a need to split bios.
*/
-static blk_qc_t __process_bio(struct mapped_device *md,
- struct dm_table *map, struct bio *bio)
+static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map,
+ struct bio *bio, struct dm_target *ti)
{
struct clone_info ci;
blk_qc_t ret = BLK_QC_T_NONE;
int error = 0;
- if (unlikely(!map)) {
- bio_io_error(bio);
- return ret;
- }
-
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- ci.bio = &ci.io->md->flush_bio;
+ struct bio flush_bio;
+
+ /*
+ * Use an on-stack bio for this, it's safe since we don't
+ * need to reference it after submit. It's just used as
+ * the basis for the clone(s).
+ */
+ bio_init(&flush_bio, NULL, 0);
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
} else {
- struct dm_target *ti = md->immutable_target;
struct dm_target_io *tio;
- /*
- * Defend against IO still getting in during teardown
- * - as was seen for a time with nvme-fcloop
- */
- if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
- error = -EIO;
- goto out;
- }
-
ci.bio = bio;
ci.sector_count = bio_sectors(bio);
- if (unlikely(__process_abnormal_io(&ci, ti, &error)))
+ if (__process_abnormal_io(&ci, ti, &error))
goto out;
tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
@@ -1687,10 +1718,60 @@
return ret;
}
-typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
+static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struct bio **bio)
+{
+ unsigned len, sector_count;
-static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
- process_bio_fn process_bio)
+ sector_count = bio_sectors(*bio);
+ len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti), sector_count);
+
+ if (sector_count > len) {
+ struct bio *split = bio_split(*bio, len, GFP_NOIO, &md->queue->bio_split);
+
+ bio_chain(split, *bio);
+ trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector);
+ generic_make_request(*bio);
+ *bio = split;
+ }
+}
+
+/*
+ * Common bio entry point for bio-based DM, used by dm_make_request() and
+ * by the deferred-bio worker.
+ *
+ * Fails @bio with an I/O error if there is no live table (@map is NULL) or
+ * no target maps the bio's start sector. Otherwise splits the bio as
+ * needed and hands it to __process_bio() for DM_TYPE_NVME_BIO_BASED
+ * devices or to __split_and_process_bio() for all other types.
+ *
+ * Returns the blk_qc_t of the chosen handler, or BLK_QC_T_NONE when the
+ * bio was failed here.
+ */
+static blk_qc_t dm_process_bio(struct mapped_device *md,
+ struct dm_table *map, struct bio *bio)
+{
+ blk_qc_t ret = BLK_QC_T_NONE;
+ struct dm_target *ti = md->immutable_target;
+
+ if (unlikely(!map)) {
+ bio_io_error(bio);
+ return ret;
+ }
+
+ /* No immutable target: look up the target by the bio's start sector. */
+ if (!ti) {
+ ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
+ if (unlikely(!ti)) {
+ bio_io_error(bio);
+ return ret;
+ }
+ }
+
+ /*
+ * If in ->make_request_fn we need to use blk_queue_split(), otherwise
+ * queue_limits for abnormal requests (e.g. discard, writesame, etc)
+ * won't be imposed.
+ */
+ if (current->bio_list) {
+ blk_queue_split(md->queue, &bio);
+ /* Normal IO is additionally split to the target's max_io_len. */
+ if (!is_abnormal_io(bio))
+ dm_queue_split(md, ti, &bio);
+ }
+
+ if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
+ return __process_bio(md, map, bio, ti);
+ else
+ return __split_and_process_bio(md, map, bio);
+}
+
+static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
{
struct mapped_device *md = q->queuedata;
blk_qc_t ret = BLK_QC_T_NONE;
@@ -1710,26 +1791,12 @@
return ret;
}
- ret = process_bio(md, map, bio);
+ ret = dm_process_bio(md, map, bio);
dm_put_live_table(md, srcu_idx);
return ret;
}
-/*
- * The request function that remaps the bio to one target and
- * splits off any remainder.
- */
-static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
-{
- return __dm_make_request(q, bio, __split_and_process_bio);
-}
-
-static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
-{
- return __dm_make_request(q, bio, __process_bio);
-}
-
static int dm_any_congested(void *congested_data, int bdi_bits)
{
int r = bdi_bits;
@@ -1810,8 +1877,6 @@
static void dm_init_normal_md_queue(struct mapped_device *md)
{
- md->use_blk_mq = false;
-
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
@@ -1822,8 +1887,6 @@
{
if (md->wq)
destroy_workqueue(md->wq);
- if (md->kworker_task)
- kthread_stop(md->kworker_task);
bioset_exit(&md->bs);
bioset_exit(&md->io_bs);
@@ -1864,7 +1927,6 @@
static struct mapped_device *alloc_dev(int minor)
{
int r, numa_node_id = dm_get_numa_node();
- struct dax_device *dax_dev = NULL;
struct mapped_device *md;
void *old_md;
@@ -1890,7 +1952,6 @@
goto bad_io_barrier;
md->numa_node_id = numa_node_id;
- md->use_blk_mq = dm_use_blk_mq_default();
md->init_tio_pdu = false;
md->type = DM_TYPE_NONE;
mutex_init(&md->suspend_lock);
@@ -1905,7 +1966,7 @@
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
- md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
+ md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
if (!md->queue)
goto bad;
md->queue->queuedata = md;
@@ -1915,13 +1976,10 @@
if (!md->disk)
goto bad;
- atomic_set(&md->pending[0], 0);
- atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
init_completion(&md->kobj_holder.completion);
- md->kworker_task = NULL;
md->disk->major = _major;
md->disk->first_minor = minor;
@@ -1931,11 +1989,11 @@
sprintf(md->disk->disk_name, "dm-%d", minor);
if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
- dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
- if (!dax_dev)
+ md->dax_dev = alloc_dax(md, md->disk->disk_name,
+ &dm_dax_ops, 0);
+ if (!md->dax_dev)
goto bad;
}
- md->dax_dev = dax_dev;
add_disk_no_queue_reg(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
@@ -1948,10 +2006,6 @@
if (!md->bdev)
goto bad;
- bio_init(&md->flush_bio, NULL, 0);
- bio_set_dev(&md->flush_bio, md->bdev);
- md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
-
dm_stats_init(&md->stats);
/* Populate the mapping, nobody knows we exist yet */
@@ -2221,14 +2275,6 @@
switch (type) {
case DM_TYPE_REQUEST_BASED:
- dm_init_normal_md_queue(md);
- r = dm_old_init_request_queue(md, t);
- if (r) {
- DMERR("Cannot initialize queue for request-based mapped device");
- return r;
- }
- break;
- case DM_TYPE_MQ_REQUEST_BASED:
r = dm_mq_init_request_queue(md, t);
if (r) {
DMERR("Cannot initialize queue for request-based dm-mq mapped device");
@@ -2237,12 +2283,9 @@
break;
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
- dm_init_normal_md_queue(md);
- blk_queue_make_request(md->queue, dm_make_request);
- break;
case DM_TYPE_NVME_BIO_BASED:
dm_init_normal_md_queue(md);
- blk_queue_make_request(md->queue, dm_make_request_nvme);
+ blk_queue_make_request(md->queue, dm_make_request);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
@@ -2333,9 +2376,6 @@
blk_set_queue_dying(md->queue);
- if (dm_request_based(md) && md->kworker_task)
- kthread_flush_worker(&md->kworker);
-
/*
* Take suspend_lock so that presuspend and postsuspend methods
* do not race with internal suspend.
@@ -2429,9 +2469,9 @@
break;
if (dm_request_based(md))
- generic_make_request(c);
+ (void) generic_make_request(c);
else
- __split_and_process_bio(md, map, c);
+ (void) dm_process_bio(md, map, c);
}
dm_put_live_table(md, srcu_idx);
@@ -2588,11 +2628,8 @@
* Stop md->queue before flushing md->wq in case request-based
* dm defers requests to md->wq from md->queue.
*/
- if (dm_request_based(md)) {
+ if (dm_request_based(md))
dm_stop_queue(md->queue);
- if (md->kworker_task)
- kthread_flush_worker(&md->kworker);
- }
flush_workqueue(md->wq);
@@ -2967,7 +3004,6 @@
goto out;
break;
case DM_TYPE_REQUEST_BASED:
- case DM_TYPE_MQ_REQUEST_BASED:
pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_io_data_size is used for blk-mq pdu at queue allocation */
@@ -3169,12 +3205,14 @@
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
.getgeo = dm_blk_getgeo,
+ .report_zones = dm_blk_report_zones,
.pr_ops = &dm_pr_ops,
.owner = THIS_MODULE
};
static const struct dax_operations dm_dax_ops = {
.direct_access = dm_dax_direct_access,
+ .dax_supported = dm_dax_supported,
.copy_from_iter = dm_dax_copy_from_iter,
.copy_to_iter = dm_dax_copy_to_iter,
};
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 114a81b..d7c4f66 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -70,9 +70,12 @@
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
bool dm_table_bio_based(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
-bool dm_table_all_blk_mq_devices(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
+bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn,
+ int *blocksize);
+int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data);
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
@@ -83,11 +86,6 @@
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
/*
- * To check the return value from dm_table_find_target().
- */
-#define dm_target_is_valid(t) ((t)->table)
-
-/*
* To check whether the target type is bio-based or not (request-based).
*/
#define dm_target_bio_based(t) ((t)->type->map != NULL)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 2fc8c11..b092c7b 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
*
@@ -490,10 +491,10 @@
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
pr_debug(" version: %d\n", le32_to_cpu(sb->version));
pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
- le32_to_cpu(*(__u32 *)(sb->uuid+0)),
- le32_to_cpu(*(__u32 *)(sb->uuid+4)),
- le32_to_cpu(*(__u32 *)(sb->uuid+8)),
- le32_to_cpu(*(__u32 *)(sb->uuid+12)));
+ le32_to_cpu(*(__le32 *)(sb->uuid+0)),
+ le32_to_cpu(*(__le32 *)(sb->uuid+4)),
+ le32_to_cpu(*(__le32 *)(sb->uuid+8)),
+ le32_to_cpu(*(__le32 *)(sb->uuid+12)));
pr_debug(" events: %llu\n",
(unsigned long long) le64_to_cpu(sb->events));
pr_debug("events cleared: %llu\n",
@@ -1789,6 +1790,8 @@
return;
md_bitmap_wait_behind_writes(mddev);
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1899,10 +1902,14 @@
sector_t start = 0;
sector_t sector = 0;
struct bitmap *bitmap = mddev->bitmap;
+ struct md_rdev *rdev;
if (!bitmap)
goto out;
+ rdev_for_each(rdev, mddev)
+ mddev_create_wb_pool(mddev, rdev, true);
+
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2288,9 +2295,9 @@
goto out;
}
if (mddev->pers) {
- mddev->pers->quiesce(mddev, 1);
+ mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev->pers->quiesce(mddev, 0);
+ mddev_resume(mddev);
}
mddev->bitmap_info.offset = 0;
if (mddev->bitmap_info.file) {
@@ -2327,8 +2334,8 @@
mddev->bitmap_info.offset = offset;
if (mddev->pers) {
struct bitmap *bitmap;
- mddev->pers->quiesce(mddev, 1);
bitmap = md_bitmap_create(mddev, -1);
+ mddev_suspend(mddev);
if (IS_ERR(bitmap))
rv = PTR_ERR(bitmap);
else {
@@ -2337,11 +2344,12 @@
if (rv)
mddev->bitmap_info.offset = 0;
}
- mddev->pers->quiesce(mddev, 0);
if (rv) {
md_bitmap_destroy(mddev);
+ mddev_resume(mddev);
goto out;
}
+ mddev_resume(mddev);
}
}
}
@@ -2460,12 +2468,26 @@
backlog_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long backlog;
+ unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
int rv = kstrtoul(buf, 10, &backlog);
if (rv)
return rv;
if (backlog > COUNTER_MAX)
return -EINVAL;
mddev->bitmap_info.max_write_behind = backlog;
+ if (!backlog && mddev->wb_info_pool) {
+ /* wb_info_pool is not needed if backlog is zero */
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
+ } else if (backlog && !mddev->wb_info_pool) {
+ /* wb_info_pool is needed since backlog is not zero */
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ mddev_create_wb_pool(mddev, rdev, false);
+ }
+ if (old_mwb != backlog)
+ md_bitmap_update_sb(mddev->bitmap);
return len;
}
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 0b2af6e..813a99f 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1,11 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2015, SUSE
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
*/
@@ -33,13 +28,6 @@
int mode;
};
-struct suspend_info {
- int slot;
- sector_t lo;
- sector_t hi;
- struct list_head list;
-};
-
struct resync_info {
__le64 lo;
__le64 hi;
@@ -80,7 +68,13 @@
struct dlm_lock_resource **other_bitmap_lockres;
struct dlm_lock_resource *resync_lockres;
struct list_head suspend_list;
+
spinlock_t suspend_lock;
+ /* record the region which write should be suspended */
+ sector_t suspend_lo;
+ sector_t suspend_hi;
+ int suspend_from; /* the slot which broadcast suspend_lo/hi */
+
struct md_thread *recovery_thread;
unsigned long recovery_map;
/* communication loc resources */
@@ -105,6 +99,7 @@
RE_ADD,
BITMAP_NEEDS_SYNC,
CHANGE_CAPACITY,
+ BITMAP_RESIZE,
};
struct cluster_msg {
@@ -270,25 +265,22 @@
ri->hi = cpu_to_le64(hi);
}
-static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
+static int read_resync_info(struct mddev *mddev,
+ struct dlm_lock_resource *lockres)
{
struct resync_info ri;
- struct suspend_info *s = NULL;
- sector_t hi = 0;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ int ret = 0;
dlm_lock_sync(lockres, DLM_LOCK_CR);
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
- hi = le64_to_cpu(ri.hi);
- if (hi > 0) {
- s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
- if (!s)
- goto out;
- s->hi = hi;
- s->lo = le64_to_cpu(ri.lo);
+ if (le64_to_cpu(ri.hi) > 0) {
+ cinfo->suspend_hi = le64_to_cpu(ri.hi);
+ cinfo->suspend_lo = le64_to_cpu(ri.lo);
+ ret = 1;
}
dlm_unlock_sync(lockres);
-out:
- return s;
+ return ret;
}
static void recover_bitmaps(struct md_thread *thread)
@@ -298,7 +290,6 @@
struct dlm_lock_resource *bm_lockres;
char str[64];
int slot, ret;
- struct suspend_info *s, *tmp;
sector_t lo, hi;
while (cinfo->recovery_map) {
@@ -325,13 +316,17 @@
/* Clear suspend_area associated with the bitmap */
spin_lock_irq(&cinfo->suspend_lock);
- list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
- if (slot == s->slot) {
- list_del(&s->list);
- kfree(s);
- }
+ cinfo->suspend_hi = 0;
+ cinfo->suspend_lo = 0;
+ cinfo->suspend_from = -1;
spin_unlock_irq(&cinfo->suspend_lock);
+ /* Kick off a reshape if needed */
+ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+ test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ mddev->reshape_position != MaxSector)
+ md_wakeup_thread(mddev->sync_thread);
+
if (hi > 0) {
if (lo < mddev->recovery_cp)
mddev->recovery_cp = lo;
@@ -434,34 +429,23 @@
}
}
-static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
-{
- struct suspend_info *s, *tmp;
-
- list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
- if (slot == s->slot) {
- list_del(&s->list);
- kfree(s);
- break;
- }
-}
-
static void remove_suspend_info(struct mddev *mddev, int slot)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
mddev->pers->quiesce(mddev, 1);
spin_lock_irq(&cinfo->suspend_lock);
- __remove_suspend_info(cinfo, slot);
+ cinfo->suspend_hi = 0;
+ cinfo->suspend_lo = 0;
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 0);
}
-
static void process_suspend_info(struct mddev *mddev,
int slot, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
- struct suspend_info *s;
+ struct mdp_superblock_1 *sb = NULL;
+ struct md_rdev *rdev;
if (!hi) {
/*
@@ -475,6 +459,12 @@
return;
}
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
+ sb = page_address(rdev->sb_page);
+ break;
+ }
+
/*
* The bitmaps are not same for different nodes
* if RESYNCING is happening in one node, then
@@ -487,26 +477,26 @@
* sync_low/hi is used to record the region which
* arrived in the previous RESYNCING message,
*
- * Call bitmap_sync_with_cluster to clear
- * NEEDED_MASK and set RESYNC_MASK since
- * resync thread is running in another node,
- * so we don't need to do the resync again
- * with the same section */
- md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, cinfo->sync_hi, lo, hi);
+ * Call md_bitmap_sync_with_cluster to clear NEEDED_MASK
+ * and set RESYNC_MASK since resync thread is running
+ * in another node, so we don't need to do the resync
+ * again with the same section.
+ *
+ * Skip md_bitmap_sync_with_cluster while a reshape is
+ * in progress, because the reshaping region is small and
+ * we don't want to trigger lots of WARNs.
+ */
+ if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
+ md_bitmap_sync_with_cluster(mddev, cinfo->sync_low,
+ cinfo->sync_hi, lo, hi);
cinfo->sync_low = lo;
cinfo->sync_hi = hi;
- s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
- if (!s)
- return;
- s->slot = slot;
- s->lo = lo;
- s->hi = hi;
mddev->pers->quiesce(mddev, 1);
spin_lock_irq(&cinfo->suspend_lock);
- /* Remove existing entry (if exists) before adding */
- __remove_suspend_info(cinfo, slot);
- list_add(&s->list, &cinfo->suspend_list);
+ cinfo->suspend_from = slot;
+ cinfo->suspend_lo = lo;
+ cinfo->suspend_hi = hi;
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 0);
}
@@ -612,6 +602,11 @@
case BITMAP_NEEDS_SYNC:
__recover_slot(mddev, le32_to_cpu(msg->slot));
break;
+ case BITMAP_RESIZE:
+ if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
+ ret = md_bitmap_resize(mddev->bitmap,
+ le64_to_cpu(msg->high), 0, 0);
+ break;
default:
ret = -1;
pr_warn("%s:%d Received unknown message from %d\n",
@@ -800,7 +795,6 @@
struct md_cluster_info *cinfo = mddev->cluster_info;
int i, ret = 0;
struct dlm_lock_resource *bm_lockres;
- struct suspend_info *s;
char str[64];
sector_t lo, hi;
@@ -819,16 +813,13 @@
bm_lockres->flags |= DLM_LKF_NOQUEUE;
ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (ret == -EAGAIN) {
- s = read_resync_info(mddev, bm_lockres);
- if (s) {
+ if (read_resync_info(mddev, bm_lockres)) {
pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
__func__, __LINE__,
- (unsigned long long) s->lo,
- (unsigned long long) s->hi, i);
- spin_lock_irq(&cinfo->suspend_lock);
- s->slot = i;
- list_add(&s->list, &cinfo->suspend_list);
- spin_unlock_irq(&cinfo->suspend_lock);
+ (unsigned long long) cinfo->suspend_lo,
+ (unsigned long long) cinfo->suspend_hi,
+ i);
+ cinfo->suspend_from = i;
}
ret = 0;
lockres_free(bm_lockres);
@@ -1001,10 +992,17 @@
if (!cinfo)
return 0;
- /* BITMAP_NEEDS_SYNC message should be sent when node
+ /*
+ * BITMAP_NEEDS_SYNC message should be sent when node
* is leaving the cluster with dirty bitmap, also we
- * can only deliver it when dlm connection is available */
- if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
+ * can only deliver it when dlm connection is available.
+ *
+ * Also, we should send BITMAP_NEEDS_SYNC message in
+ * case reshaping is interrupted.
+ */
+ if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) ||
+ (mddev->reshape_position != MaxSector &&
+ test_bit(MD_CLOSING, &mddev->flags)))
resync_bitmap(mddev);
set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
@@ -1102,6 +1100,80 @@
unlock_comm(cinfo);
}
+static int update_bitmap_size(struct mddev *mddev, sector_t size)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg = {0};
+ int ret;
+
+ cmsg.type = cpu_to_le32(BITMAP_RESIZE);
+ cmsg.high = cpu_to_le64(size);
+ ret = sendmsg(cinfo, &cmsg, 0);
+ if (ret)
+ pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n",
+ __func__, __LINE__, ret);
+ return ret;
+}
+
+/* Broadcast newsize to the cluster; roll back to oldsize if a slot check fails. */
+static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
+{
+	struct bitmap_counts *counts;
+	char str[64];
+	struct dlm_lock_resource *bm_lockres;
+	struct bitmap *bitmap = mddev->bitmap;
+	unsigned long my_pages = bitmap->counts.pages;
+	int i, rv;
+
+	/*
+	 * We need to ensure all the nodes can grow to a larger
+	 * bitmap size before starting the reshape.
+	 */
+	rv = update_bitmap_size(mddev, newsize);
+	if (rv)
+		return rv;
+
+	for (i = 0; i < mddev->bitmap_info.nodes; i++) {
+		if (i == md_cluster_ops->slot_number(mddev))
+			continue;
+
+		bitmap = get_bitmap_from_slot(mddev, i);
+		if (IS_ERR(bitmap)) {
+			pr_err("can't get bitmap from slot %d\n", i);
+			/* "out" frees bitmap; never hand it an ERR_PTR */
+			bitmap = NULL;
+			goto out;
+		}
+		counts = &bitmap->counts;
+
+		/*
+		 * If we can hold the bitmap lock of one node then
+		 * the slot is not occupied, update the pages.
+		 */
+		snprintf(str, 64, "bitmap%04d", i);
+		bm_lockres = lockres_init(mddev, str, NULL, 1);
+		if (!bm_lockres) {
+			pr_err("Cannot initialize %s lock\n", str);
+			goto out;
+		}
+		bm_lockres->flags |= DLM_LKF_NOQUEUE;
+		rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+		if (!rv)
+			counts->pages = my_pages;
+		lockres_free(bm_lockres);
+
+		if (my_pages != counts->pages)
+			/* Revert the bitmap size if one node can't resize */
+			goto out;
+	}
+
+	return 0;
+out:
+	md_bitmap_free(bitmap);
+	update_bitmap_size(mddev, oldsize);
+	return -1;
+}
+
/*
* return 0 if all the bitmaps have the same sync_size
*/
@@ -1243,6 +1315,16 @@
return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev);
}
+static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ spin_lock_irq(&cinfo->suspend_lock);
+ *lo = cinfo->suspend_lo;
+ *hi = cinfo->suspend_hi;
+ spin_unlock_irq(&cinfo->suspend_lock);
+}
+
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1295,21 +1377,14 @@
{
struct md_cluster_info *cinfo = mddev->cluster_info;
int ret = 0;
- struct suspend_info *s;
if ((direction == READ) &&
test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
return 1;
spin_lock_irq(&cinfo->suspend_lock);
- if (list_empty(&cinfo->suspend_list))
- goto out;
- list_for_each_entry(s, &cinfo->suspend_list, list)
- if (hi > s->lo && lo < s->hi) {
- ret = 1;
- break;
- }
-out:
+ if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi)
+ ret = 1;
spin_unlock_irq(&cinfo->suspend_lock);
return ret;
}
@@ -1482,6 +1557,7 @@
.resync_start = resync_start,
.resync_finish = resync_finish,
.resync_info_update = resync_info_update,
+ .resync_info_get = resync_info_get,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
.metadata_update_cancel = metadata_update_cancel,
@@ -1492,6 +1568,7 @@
.remove_disk = remove_disk,
.load_bitmaps = load_bitmaps,
.gather_bitmaps = gather_bitmaps,
+ .resize_bitmaps = resize_bitmaps,
.lock_all_bitmaps = lock_all_bitmaps,
.unlock_all_bitmaps = unlock_all_bitmaps,
.update_size = update_size,
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index c024070..a78e302 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -14,6 +14,7 @@
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+ void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
void (*metadata_update_cancel)(struct mddev *mddev);
@@ -26,6 +27,7 @@
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
void (*load_bitmaps)(struct mddev *mddev, int total_slots);
int (*gather_bitmaps)(struct md_rdev *rdev);
+ int (*resize_bitmaps)(struct mddev *mddev, sector_t newsize, sector_t oldsize);
int (*lock_all_bitmaps)(struct mddev *mddev);
void (*unlock_all_bitmaps)(struct mddev *mddev);
void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index c2fdf89..50ad4ba 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -1,19 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* faulty.c : Multiple Devices driver for Linux
*
* Copyright (C) 2004 Neil Brown
*
* fautly-device-simulator personality for md
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index d45c697..c766c55 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
linear.c : Multiple Devices driver for Linux
Copyright (C) 1994-96 Marc ZYNGIER
@@ -6,14 +7,6 @@
Linear mode management functions.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- You should have received a copy of the GNU General Public License
- (for example /usr/src/linux/COPYING); if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/blkdev.h>
@@ -96,8 +89,7 @@
int i, cnt;
bool discard_supported = false;
- conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
- GFP_KERNEL);
+ conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
if (!conf)
return NULL;
@@ -266,6 +258,11 @@
bio_sector < start_sector))
goto out_of_bounds;
+ if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) {
+ bio_io_error(bio);
+ return true;
+ }
+
if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to split it */
struct bio *split = bio_split(bio, end_sector - bio_sector,
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 881487d..6780938 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* multipath.c : Multiple Devices driver for Linux
*
@@ -8,15 +9,6 @@
* MULTIPATH management functions.
*
* derived from raid1.c.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/blkdev.h>
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8668793..1be7abe 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
md.c : Multiple Devices driver for Linux
Copyright (C) 1998, 1999, 2000 Ingo Molnar
@@ -22,14 +23,6 @@
- persistent bitmap code
Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- You should have received a copy of the GNU General Public License
- (for example /usr/src/linux/COPYING); if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
Errors, Warnings, etc.
Please use:
@@ -44,6 +37,7 @@
*/
+#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
@@ -88,8 +82,7 @@
struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
-struct module *md_cluster_mod;
-EXPORT_SYMBOL(md_cluster_mod);
+static struct module *md_cluster_mod;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
@@ -132,22 +125,75 @@
mddev->sync_speed_max : sysctl_speed_limit_max;
}
-static void * flush_info_alloc(gfp_t gfp_flags, void *data)
+static int rdev_init_wb(struct md_rdev *rdev)
{
- return kzalloc(sizeof(struct flush_info), gfp_flags);
-}
-static void flush_info_free(void *flush_info, void *data)
-{
- kfree(flush_info);
+ if (rdev->bdev->bd_queue->nr_hw_queues == 1)
+ return 0;
+
+ spin_lock_init(&rdev->wb_list_lock);
+ INIT_LIST_HEAD(&rdev->wb_list);
+ init_waitqueue_head(&rdev->wb_io_wait);
+ set_bit(WBCollisionCheck, &rdev->flags);
+
+ return 1;
}
-static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
+/*
+ * Create wb_info_pool if rdev is the first multi-queue device flagged
+ * with writemostly and write-behind mode is enabled.
+ */
+void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend)
{
- return kzalloc(sizeof(struct flush_bio), gfp_flags);
+ if (mddev->bitmap_info.max_write_behind == 0)
+ return;
+
+ if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
+ return;
+
+ if (mddev->wb_info_pool == NULL) {
+ unsigned int noio_flag;
+
+ if (!is_suspend)
+ mddev_suspend(mddev);
+ noio_flag = memalloc_noio_save();
+ mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
+ sizeof(struct wb_info));
+ memalloc_noio_restore(noio_flag);
+ if (!mddev->wb_info_pool)
+ pr_err("can't alloc memory pool for writemostly\n");
+ if (!is_suspend)
+ mddev_resume(mddev);
+ }
}
-static void flush_bio_free(void *flush_bio, void *data)
+EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
+
+/*
+ * Destroy wb_info_pool if rdev is the last device flagged with WBCollisionCheck.
+ */
+static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
{
- kfree(flush_bio);
+ if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
+ return;
+
+ if (mddev->wb_info_pool) {
+ struct md_rdev *temp;
+ int num = 0;
+
+ /*
+ * Check if other rdevs need wb_info_pool.
+ */
+ rdev_for_each(temp, mddev)
+ if (temp != rdev &&
+ test_bit(WBCollisionCheck, &temp->flags))
+ num++;
+ if (!num) {
+ mddev_suspend(rdev->mddev);
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
+ mddev_resume(rdev->mddev);
+ }
+ }
}
static struct ctl_table_header *raid_table_header;
@@ -207,15 +253,10 @@
struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev)
{
- struct bio *b;
-
if (!mddev || !bioset_initialized(&mddev->bio_set))
return bio_alloc(gfp_mask, nr_iovecs);
- b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
- if (!b)
- return NULL;
- return b;
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);
@@ -334,7 +375,11 @@
const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = q->queuedata;
unsigned int sectors;
- int cpu;
+
+ if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
blk_queue_split(q, &bio);
@@ -359,9 +404,9 @@
md_handle_request(mddev, bio);
- cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
+ part_stat_lock();
+ part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
+ part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();
return BLK_QC_T_NONE;
@@ -429,54 +474,31 @@
/*
* Generic flush handling for md
*/
-static void submit_flushes(struct work_struct *ws)
+
+static void md_end_flush(struct bio *bio)
{
- struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
- struct mddev *mddev = fi->mddev;
- struct bio *bio = fi->bio;
-
- bio->bi_opf &= ~REQ_PREFLUSH;
- md_handle_request(mddev, bio);
-
- mempool_free(fi, mddev->flush_pool);
-}
-
-static void md_end_flush(struct bio *fbio)
-{
- struct flush_bio *fb = fbio->bi_private;
- struct md_rdev *rdev = fb->rdev;
- struct flush_info *fi = fb->fi;
- struct bio *bio = fi->bio;
- struct mddev *mddev = fi->mddev;
+ struct md_rdev *rdev = bio->bi_private;
+ struct mddev *mddev = rdev->mddev;
rdev_dec_pending(rdev, mddev);
- if (atomic_dec_and_test(&fi->flush_pending)) {
- if (bio->bi_iter.bi_size == 0) {
- /* an empty barrier - all done */
- bio_endio(bio);
- mempool_free(fi, mddev->flush_pool);
- } else {
- INIT_WORK(&fi->flush_work, submit_flushes);
- queue_work(md_wq, &fi->flush_work);
- }
+ if (atomic_dec_and_test(&mddev->flush_pending)) {
+ /* The pre-request flush has finished */
+ queue_work(md_wq, &mddev->flush_work);
}
-
- mempool_free(fb, mddev->flush_bio_pool);
- bio_put(fbio);
+ bio_put(bio);
}
-void md_flush_request(struct mddev *mddev, struct bio *bio)
+static void md_submit_flush_data(struct work_struct *ws);
+
+static void submit_flushes(struct work_struct *ws)
{
+ struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;
- struct flush_info *fi;
- fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
-
- fi->bio = bio;
- fi->mddev = mddev;
- atomic_set(&fi->flush_pending, 1);
-
+ mddev->start_flush = ktime_get_boottime();
+ INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+ atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
@@ -486,37 +508,74 @@
* we reclaim rcu_read_lock
*/
struct bio *bi;
- struct flush_bio *fb;
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
-
- fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
- fb->fi = fi;
- fb->rdev = rdev;
-
bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
- bio_set_dev(bi, rdev->bdev);
bi->bi_end_io = md_end_flush;
- bi->bi_private = fb;
+ bi->bi_private = rdev;
+ bio_set_dev(bi, rdev->bdev);
bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
- atomic_inc(&fi->flush_pending);
+ atomic_inc(&mddev->flush_pending);
submit_bio(bi);
-
rcu_read_lock();
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
+ if (atomic_dec_and_test(&mddev->flush_pending))
+ queue_work(md_wq, &mddev->flush_work);
+}
- if (atomic_dec_and_test(&fi->flush_pending)) {
- if (bio->bi_iter.bi_size == 0) {
+static void md_submit_flush_data(struct work_struct *ws)
+{
+ struct mddev *mddev = container_of(ws, struct mddev, flush_work);
+ struct bio *bio = mddev->flush_bio;
+
+ /*
+ * must reset flush_bio before calling into md_handle_request to avoid a
+ * deadlock, because other bios passed md_handle_request suspend check
+ * could wait for this and below md_handle_request could wait for those
+ * bios because of suspend check
+ */
+ mddev->last_flush = mddev->start_flush;
+ mddev->flush_bio = NULL;
+ wake_up(&mddev->sb_wait);
+
+ if (bio->bi_iter.bi_size == 0) {
+ /* an empty barrier - all done */
+ bio_endio(bio);
+ } else {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ md_handle_request(mddev, bio);
+ }
+}
+
+void md_flush_request(struct mddev *mddev, struct bio *bio)
+{
+ ktime_t start = ktime_get_boottime();
+ spin_lock_irq(&mddev->lock);
+ wait_event_lock_irq(mddev->sb_wait,
+ !mddev->flush_bio ||
+ ktime_after(mddev->last_flush, start),
+ mddev->lock);
+ if (!ktime_after(mddev->last_flush, start)) {
+ WARN_ON(mddev->flush_bio);
+ mddev->flush_bio = bio;
+ bio = NULL;
+ }
+ spin_unlock_irq(&mddev->lock);
+
+ if (!bio) {
+ INIT_WORK(&mddev->flush_work, submit_flushes);
+ queue_work(md_wq, &mddev->flush_work);
+ } else {
+ /* flush was performed for some other bio while we waited. */
+ if (bio->bi_iter.bi_size == 0)
/* an empty barrier - all done */
bio_endio(bio);
- mempool_free(fi, mddev->flush_pool);
- } else {
- INIT_WORK(&fi->flush_work, submit_flushes);
- queue_work(md_wq, &fi->flush_work);
+ else {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ mddev->pers->make_request(mddev, bio);
}
}
}
@@ -566,6 +625,7 @@
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
spin_lock_init(&mddev->lock);
+ atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
@@ -1115,8 +1175,7 @@
* (not needed for Linear and RAID0 as metadata doesn't
* record this size)
*/
- if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
- sb->level >= 1)
+ if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
rdev->sectors = (sector_t)(2ULL << 32) - 2;
if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
@@ -1178,6 +1237,8 @@
mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0)
+ mddev->layout = -1;
if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;
@@ -1414,8 +1475,7 @@
/* Limit to 4TB as metadata cannot record more than that.
* 4TB == 2^32 KB, or 2*2^32 sectors.
*/
- if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
- rdev->mddev->level >= 1)
+ if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
num_sectors = (sector_t)(2ULL << 32) - 2;
do {
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -1559,7 +1619,7 @@
*/
s32 offset;
sector_t bb_sector;
- u64 *bbp;
+ __le64 *bbp;
int i;
int sectors = le16_to_cpu(sb->bblog_size);
if (sectors > (PAGE_SIZE / 512))
@@ -1571,7 +1631,7 @@
if (!sync_page_io(rdev, bb_sector, sectors << 9,
rdev->bb_page, REQ_OP_READ, 0, true))
return -EIO;
- bbp = (u64 *)page_address(rdev->bb_page);
+ bbp = (__le64 *)page_address(rdev->bb_page);
rdev->badblocks.shift = sb->bblog_shift;
for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
u64 bb = le64_to_cpu(*bbp);
@@ -1594,6 +1654,10 @@
rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
}
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
+ sb->level != 0)
+ return -EINVAL;
+
if (!refdev) {
ret = 1;
} else {
@@ -1704,6 +1768,10 @@
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0 &&
+ !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
+ mddev->layout = -1;
+
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
@@ -1773,8 +1841,15 @@
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
- } else
- set_bit(In_sync, &rdev->flags);
+ } else {
+ /*
+ * If the array is FROZEN, then the device can't
+ * be in_sync with rest of array.
+ */
+ if (!test_bit(MD_RECOVERY_FROZEN,
+ &mddev->recovery))
+ set_bit(In_sync, &rdev->flags);
+ }
rdev->raid_disk = role;
break;
}
@@ -1883,7 +1958,7 @@
md_error(mddev, rdev);
else {
struct badblocks *bb = &rdev->badblocks;
- u64 *bbp = (u64 *)page_address(rdev->bb_page);
+ __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
u64 *p = bb->page;
sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
if (bb->changed) {
@@ -2148,14 +2223,12 @@
*/
int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
- struct blk_integrity *bi_rdev;
struct blk_integrity *bi_mddev;
char name[BDEVNAME_SIZE];
if (!mddev->gendisk)
return 0;
- bi_rdev = bdev_get_integrity(rdev->bdev);
bi_mddev = blk_get_integrity(mddev->gendisk);
if (!bi_mddev) /* nothing to do */
@@ -2231,6 +2304,9 @@
rdev->mddev = mddev;
pr_debug("md: bind<%s>\n", b);
+ if (mddev->raid_disks)
+ mddev_create_wb_pool(mddev, rdev, false);
+
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2267,6 +2343,7 @@
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
+ mddev_destroy_wb_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2779,8 +2856,10 @@
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
+ mddev_create_wb_pool(rdev->mddev, rdev, false);
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
+ mddev_destroy_wb_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "blocked")) {
@@ -2863,8 +2942,10 @@
err = 0;
}
} else if (cmd_match(buf, "re-add")) {
- if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
- rdev->saved_raid_disk >= 0) {
+ if (!rdev->mddev->pers)
+ err = -EINVAL;
+ else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
+ rdev->saved_raid_disk >= 0) {
/* clear_bit is performed _after_ all the devices
* have their local Faulty bit cleared. If any writes
* happen in the meantime in the local node, they
@@ -3375,7 +3456,7 @@
if (!entry->show)
return -EIO;
if (!rdev->mddev)
- return -EBUSY;
+ return -ENODEV;
return entry->show(rdev, page);
}
@@ -3392,10 +3473,10 @@
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- rv = mddev ? mddev_lock(mddev): -EBUSY;
+ rv = mddev ? mddev_lock(mddev) : -ENODEV;
if (!rv) {
if (rdev->mddev == NULL)
- rv = -EBUSY;
+ rv = -ENODEV;
else
rv = entry->store(rdev, page, length);
mddev_unlock(mddev);
@@ -3605,11 +3686,7 @@
return -EINVAL;
if (decimals < 0)
decimals = 0;
- while (decimals < scale) {
- result *= 10;
- decimals ++;
- }
- *res = result;
+ *res = result * int_pow(10, scale - decimals);
return 0;
}
@@ -4096,12 +4173,17 @@
* active-idle
* like active, but no writes have been seen for a while (100msec).
*
+ * broken
+ * RAID0/LINEAR-only: same as clean, but array is missing a member.
+ * It's useful because RAID0/LINEAR mounted-arrays aren't stopped
+ * when a member is gone, so this state will at least alert the
+ * user that something is wrong.
*/
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
- write_pending, active_idle, bad_word};
+ write_pending, active_idle, broken, bad_word};
static char *array_states[] = {
"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
- "write-pending", "active-idle", NULL };
+ "write-pending", "active-idle", "broken", NULL };
static int match_word(const char *word, char **list)
{
@@ -4117,7 +4199,7 @@
{
enum array_state st = inactive;
- if (mddev->pers)
+ if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
switch(mddev->ro) {
case 1:
st = readonly;
@@ -4137,7 +4219,10 @@
st = active;
spin_unlock(&mddev->lock);
}
- else {
+
+ if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
+ st = broken;
+ } else {
if (list_empty(&mddev->disks) &&
mddev->raid_disks == 0 &&
mddev->dev_sectors == 0)
@@ -4251,6 +4336,7 @@
break;
case write_pending:
case active_idle:
+ case broken:
/* these cannot be set */
break;
}
@@ -5123,6 +5209,34 @@
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
consistency_policy_store);
+static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->fail_last_dev);
+}
+
+/*
+ * Set fail_last_dev to true to allow the last device to be forcibly removed
+ * from RAID1/RAID10.
+ */
+static ssize_t
+fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int ret;
+ bool value;
+
+ ret = kstrtobool(buf, &value);
+ if (ret)
+ return ret;
+
+ if (value != mddev->fail_last_dev)
+ mddev->fail_last_dev = value;
+
+ return len;
+}
+static struct md_sysfs_entry md_fail_last_dev =
+__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
+ fail_last_dev_store);
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
@@ -5139,6 +5253,7 @@
&md_array_size.attr,
&max_corr_read_errors.attr,
&md_consistency_policy.attr,
+ &md_fail_last_dev.attr,
NULL,
};
@@ -5257,7 +5372,8 @@
{
if (mddev->writes_pending.percpu_count_ptr)
return 0;
- if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+ if (percpu_ref_init(&mddev->writes_pending, no_op,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
return -ENOMEM;
/* We want to start with the refcount at zero */
percpu_ref_put(&mddev->writes_pending);
@@ -5519,22 +5635,6 @@
if (err)
return err;
}
- if (mddev->flush_pool == NULL) {
- mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
- flush_info_free, mddev);
- if (!mddev->flush_pool) {
- err = -ENOMEM;
- goto abort;
- }
- }
- if (mddev->flush_bio_pool == NULL) {
- mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
- flush_bio_free, mddev);
- if (!mddev->flush_bio_pool) {
- err = -ENOMEM;
- goto abort;
- }
- }
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
@@ -5623,15 +5723,28 @@
mddev->bitmap = bitmap;
}
- if (err) {
- mddev_detach(mddev);
- if (mddev->private)
- pers->free(mddev, mddev->private);
- mddev->private = NULL;
- module_put(pers->owner);
- md_bitmap_destroy(mddev);
- goto abort;
+ if (err)
+ goto bitmap_abort;
+
+ if (mddev->bitmap_info.max_write_behind > 0) {
+ bool creat_pool = false;
+
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(WriteMostly, &rdev->flags) &&
+ rdev_init_wb(rdev))
+ creat_pool = true;
+ }
+ if (creat_pool && mddev->wb_info_pool == NULL) {
+ mddev->wb_info_pool =
+ mempool_create_kmalloc_pool(NR_WB_INFOS,
+ sizeof(struct wb_info));
+ if (!mddev->wb_info_pool) {
+ err = -ENOMEM;
+ goto bitmap_abort;
+ }
+ }
}
+
if (mddev->queue) {
bool nonrot = true;
@@ -5674,8 +5787,7 @@
spin_unlock(&mddev->lock);
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
- if (sysfs_link_rdev(mddev, rdev))
- /* failure here is OK */;
+ sysfs_link_rdev(mddev, rdev); /* failure here is OK */
if (mddev->degraded && !mddev->ro)
/* This ensures that recovering status is reported immediately
@@ -5688,21 +5800,18 @@
md_update_sb(mddev, 0);
md_new_event(mddev);
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
+bitmap_abort:
+ mddev_detach(mddev);
+ if (mddev->private)
+ pers->free(mddev, mddev->private);
+ mddev->private = NULL;
+ module_put(pers->owner);
+ md_bitmap_destroy(mddev);
abort:
- if (mddev->flush_bio_pool) {
- mempool_destroy(mddev->flush_bio_pool);
- mddev->flush_bio_pool = NULL;
- }
- if (mddev->flush_pool){
- mempool_destroy(mddev->flush_pool);
- mddev->flush_pool = NULL;
- }
-
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -5711,6 +5820,7 @@
{
int err;
+ set_bit(MD_NOT_READY, &mddev->flags);
err = md_run(mddev);
if (err)
goto out;
@@ -5731,9 +5841,14 @@
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
+ clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
out:
+ clear_bit(MD_NOT_READY, &mddev->flags);
return err;
}
@@ -5868,6 +5983,8 @@
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
}
void md_stop_writes(struct mddev *mddev)
@@ -5906,14 +6023,6 @@
mddev->to_remove = &md_redundancy_group;
module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- if (mddev->flush_bio_pool) {
- mempool_destroy(mddev->flush_bio_pool);
- mddev->flush_bio_pool = NULL;
- }
- if (mddev->flush_pool) {
- mempool_destroy(mddev->flush_pool);
- mddev->flush_pool = NULL;
- }
}
void md_stop(struct mddev *mddev)
@@ -6799,6 +6908,9 @@
mddev->external = 0;
mddev->layout = info->layout;
+ if (mddev->level == 0)
+ /* Cannot trust RAID0 layout info here */
+ mddev->layout = -1;
mddev->chunk_sectors = info->chunk_size >> 9;
if (mddev->persistent) {
@@ -7657,9 +7769,9 @@
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
- unsigned long dt, db;
- sector_t rt;
- int scale;
+ unsigned long dt, db = 0;
+ sector_t rt, curr_mark_cnt, resync_mark_cnt;
+ int scale, recovery_active;
unsigned int per_milli;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -7748,22 +7860,30 @@
* db: blocks written from mark until now
* rt: remaining time
*
- * rt is a sector_t, so could be 32bit or 64bit.
- * So we divide before multiply in case it is 32bit and close
- * to the limit.
- * We scale the divisor (db) by 32 to avoid losing precision
- * near the end of resync when the number of remaining sectors
- * is close to 'db'.
- * We then divide rt by 32 after multiplying by db to compensate.
- * The '+1' avoids division by zero if db is very small.
+ * rt is a sector_t, which is always 64bit now. We are keeping
+ * the original algorithm, but it is not really necessary.
+ *
+ * Original algorithm:
+ * So we divide before multiply in case it is 32bit and close
+ * to the limit.
+ * We scale the divisor (db) by 32 to avoid losing precision
+ * near the end of resync when the number of remaining sectors
+ * is close to 'db'.
+ * We then divide rt by 32 after multiplying by db to compensate.
+ * The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
- db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
- - mddev->resync_mark_cnt;
+
+ curr_mark_cnt = mddev->curr_mark_cnt;
+ recovery_active = atomic_read(&mddev->recovery_active);
+ resync_mark_cnt = mddev->resync_mark_cnt;
+
+ if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
+ db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
rt = max_sectors - resync; /* number of remaining sectors */
- sector_div(rt, db/32+1);
+ rt = div64_u64(rt, db/32+1);
rt *= dt;
rt >>= 5;
@@ -8240,8 +8360,7 @@
{
struct mddev *mddev = thread->mddev;
struct mddev *mddev2;
- unsigned int currspeed = 0,
- window;
+ unsigned int currspeed = 0, window;
sector_t max_sectors,j, io_sectors, recovery_done;
unsigned long mark[SYNC_MARKS];
unsigned long update_time;
@@ -8298,7 +8417,7 @@
* 0 == not engaged in resync at all
* 2 == checking that there is no conflict with another sync
* 1 == like 2, but have yielded to allow conflicting resync to
- * commense
+ * commence
* other == active in resync - this many blocks
*
* Before starting a resync we must have set curr_resync to
@@ -8372,9 +8491,17 @@
else if (!mddev->bitmap)
j = mddev->recovery_cp;
- } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
max_sectors = mddev->resync_max_sectors;
- else {
+ /*
+ * If the original node aborts the reshape then we continue it
+ * here, so set j again to avoid restarting the reshape from the
+ * very beginning.
+ */
+ if (mddev_is_clustered(mddev) &&
+ mddev->reshape_position != MaxSector)
+ j = mddev->reshape_position;
+ } else {
/* recovery follows the physical size of devices */
max_sectors = mddev->dev_sectors;
j = MaxSector;
@@ -8421,7 +8548,7 @@
/*
* Tune reconstruction:
*/
- window = 32*(PAGE_SIZE/512);
+ window = 32 * (PAGE_SIZE / 512);
pr_debug("md: using %dk window, over a total of %lluk.\n",
window/2, (unsigned long long)max_sectors/2);
@@ -8625,8 +8752,10 @@
mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev);
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ if (!mddev_is_clustered(mddev)) {
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ }
}
spin_lock(&mddev->lock);
@@ -8792,6 +8921,18 @@
*/
void md_check_recovery(struct mddev *mddev)
{
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
+ /* Write superblock - thread that called mddev_suspend()
+ * holds reconfig_mutex for us.
+ */
+ set_bit(MD_UPDATING_SB, &mddev->flags);
+ smp_mb__after_atomic();
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
+ md_update_sb(mddev, 0);
+ clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ }
+
if (mddev->suspended)
return;
@@ -8821,6 +8962,7 @@
if (mddev_trylock(mddev)) {
int spares = 0;
+ bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
@@ -8866,7 +9008,7 @@
}
}
- if (!mddev->external && !mddev->in_sync) {
+ if (try_set_sync && !mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
set_in_sync(mddev);
spin_unlock(&mddev->lock);
@@ -8951,16 +9093,6 @@
unlock:
wake_up(&mddev->sb_wait);
mddev_unlock(mddev);
- } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
- /* Write superblock - thread that called mddev_suspend()
- * holds reconfig_mutex for us.
- */
- set_bit(MD_UPDATING_SB, &mddev->flags);
- smp_mb__after_atomic();
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
- md_update_sb(mddev, 0);
- clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
- wake_up(&mddev->sb_wait);
}
}
EXPORT_SYMBOL(md_check_recovery);
@@ -8968,11 +9100,14 @@
void md_reap_sync_thread(struct mddev *mddev)
{
struct md_rdev *rdev;
+ sector_t old_dev_sectors = mddev->dev_sectors;
+ bool is_reshaped = false;
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ mddev->degraded != mddev->raid_disks) {
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
@@ -8982,8 +9117,11 @@
}
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape) {
mddev->pers->finish_reshape(mddev);
+ if (mddev_is_clustered(mddev))
+ is_reshaped = true;
+ }
/* If array is no-longer degraded, then any saved_raid_disk
* information must be scrapped.
@@ -9004,6 +9142,14 @@
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ /*
+ * We call md_cluster_ops->update_size here because sync_size could
+ * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
+ * so it is time to update size across cluster.
+ */
+ if (mddev_is_clustered(mddev) && is_reshaped
+ && !test_bit(MD_CLOSING, &mddev->flags))
+ md_cluster_ops->update_size(mddev, old_dev_sectors);
wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -9203,8 +9349,12 @@
}
if (role != rdev2->raid_disk) {
- /* got activated */
- if (rdev2->raid_disk == -1 && role != 0xffff) {
+ /*
+ * The device got activated, unless a reshape is in progress.
+ */
+ if (rdev2->raid_disk == -1 && role != 0xffff &&
+ !(le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RESHAPE_ACTIVE)) {
rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %s\n",
@@ -9213,7 +9363,6 @@
* perform resync with the new activated disk */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
-
}
/* device faulty
* We just want to do the minimum to mark the disk
@@ -9230,6 +9379,30 @@
if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+ /*
+ * mddev->delta_disks has already been updated in update_raid_disks(),
+ * so it is now time to check for a reshape.
+ */
+ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+ (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+ /*
+ * reshape is happening in the remote node, we need to
+ * update reshape_position and call start_reshape.
+ */
+ mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+ if (mddev->pers->update_reshape_pos)
+ mddev->pers->update_reshape_pos(mddev);
+ if (mddev->pers->start_reshape)
+ mddev->pers->start_reshape(mddev);
+ } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+ mddev->reshape_position != MaxSector &&
+ !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+ /* reshape is just done in another node. */
+ mddev->reshape_position = MaxSector;
+ if (mddev->pers->update_reshape_pos)
+ mddev->pers->update_reshape_pos(mddev);
+ }
+
/* Finally set the event to be up to date */
mddev->events = le64_to_cpu(sb->events);
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8afd6bf..c5e3ff3 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -1,15 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
md.h : kernel internal structure of the Linux MD driver
Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- You should have received a copy of the GNU General Public License
- (for example /usr/src/linux/COPYING); if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _MD_MD_H
@@ -116,6 +109,14 @@
* for reporting to userspace and storing
* in superblock.
*/
+
+ /*
+ * Members used to check for collisions between write-behind I/Os.
+ */
+ struct list_head wb_list;
+ spinlock_t wb_list_lock;
+ wait_queue_head_t wb_io_wait;
+
struct work_struct del_work; /* used for delayed sysfs removal */
struct kernfs_node *sysfs_state; /* handle for 'state'
@@ -200,6 +201,10 @@
* it didn't fail, so don't use FailFast
* any more for metadata
*/
+ WBCollisionCheck, /*
+ * multiqueue device should check if there
+ * is collision between write behind bios.
+ */
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -243,6 +248,12 @@
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
+ MD_NOT_READY, /* do_md_run() is active, so 'array_state'
+ * must not report that array is ready yet
+ */
+ MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop
+ * I/O in case an array member is gone/failed.
+ */
};
enum mddev_sb_flags {
@@ -252,17 +263,12 @@
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};
-#define NR_FLUSH_INFOS 8
-#define NR_FLUSH_BIOS 64
-struct flush_info {
- struct bio *bio;
- struct mddev *mddev;
- struct work_struct flush_work;
- atomic_t flush_pending;
-};
-struct flush_bio {
- struct flush_info *fi;
- struct md_rdev *rdev;
+/* Element count passed to mempool_create_kmalloc_pool() for wb_info_pool. */
+#define NR_WB_INFOS 8
+/*
+ * Record the sector range of one write-behind I/O.  Entries are linked on
+ * rdev->wb_list and compared as half-open ranges [lo, hi) when checking
+ * for collisions.
+ */
+struct wb_info {
+ sector_t lo; /* first sector of the range */
+ sector_t hi; /* one past the last sector (callers pass sector + sectors) */
+ struct list_head list; /* link in rdev->wb_list */
};
struct mddev {
@@ -470,14 +476,24 @@
* metadata and bitmap writes
*/
- mempool_t *flush_pool;
- mempool_t *flush_bio_pool;
+ /* Generic flush handling.
+ * The last to finish preflush schedules a worker to submit
+ * the rest of the request (without the REQ_PREFLUSH flag).
+ */
+ struct bio *flush_bio;
+ atomic_t flush_pending;
+ ktime_t start_flush, last_flush; /* last_flush is when the last completed
+ * flush was started.
+ */
+ struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
+ mempool_t *wb_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */
bool has_superblocks:1;
+ bool fail_last_dev:1;
};
enum recovery_flags {
@@ -557,6 +573,7 @@
int (*check_reshape) (struct mddev *mddev);
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
+ void (*update_reshape_pos) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
@@ -720,9 +737,24 @@
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
+extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
+/*
+ * Check whether the disk behind @rdev has lost GENHD_FL_UP.
+ *
+ * Returns false while the flag is set.  Otherwise marks the array
+ * MD_BROKEN and returns true; the warning is printed only by the caller
+ * that actually flips MD_BROKEN from clear to set.  @md_type is the
+ * personality name used in that message.
+ */
+static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type)
+{
+	int disk_flags = rdev->bdev->bd_disk->flags;
+
+	if (disk_flags & GENHD_FL_UP)
+		return false;
+
+	if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
+		pr_warn("md: %s: %s array has a missing/failed member\n",
+			mdname(rdev->mddev), md_type);
+
+	return true;
+}
+
static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{
int faulty = test_bit(Faulty, &rdev->flags);
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index a53cbc9..baaec1a 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config DM_PERSISTENT_DATA
tristate
depends on BLK_DEV_DM
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 492a3f8..749ec26 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -35,7 +35,10 @@
#define MAX_HOLDERS 4
#define MAX_STACK 10
-typedef unsigned long stack_entries[MAX_STACK];
+/*
+ * One saved stack trace per lock holder: entries[] is filled by
+ * stack_trace_save() and nr_entries holds the value it returned, which is
+ * later handed to stack_trace_print() together with entries[].
+ */
+struct stack_store {
+ unsigned int nr_entries;
+ unsigned long entries[MAX_STACK];
+};
struct block_lock {
spinlock_t lock;
@@ -44,8 +47,7 @@
struct task_struct *holders[MAX_HOLDERS];
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
- struct stack_trace traces[MAX_HOLDERS];
- stack_entries entries[MAX_HOLDERS];
+ struct stack_store traces[MAX_HOLDERS];
#endif
};
@@ -73,7 +75,7 @@
{
unsigned h = __find_holder(lock, NULL);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
- struct stack_trace *t;
+ struct stack_store *t;
#endif
get_task_struct(task);
@@ -81,11 +83,7 @@
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
t = lock->traces + h;
- t->nr_entries = 0;
- t->max_entries = MAX_STACK;
- t->entries = lock->entries[h];
- t->skip = 2;
- save_stack_trace(t);
+ t->nr_entries = stack_trace_save(t->entries, MAX_STACK, 2);
#endif
}
@@ -106,7 +104,8 @@
DMERR("recursive lock detected in metadata");
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
DMERR("previously held here:");
- print_stack_trace(lock->traces + i, 4);
+ stack_trace_print(lock->traces[i].entries,
+ lock->traces[i].nr_entries, 4);
DMERR("subsequent acquisition attempted here:");
dump_stack();
@@ -462,7 +461,7 @@
int r;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -498,7 +497,7 @@
return -EPERM;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -531,7 +530,7 @@
int r;
p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
if (unlikely(!p))
return -EWOULDBLOCK;
@@ -567,7 +566,7 @@
return -EPERM;
p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
memset(p, 0, dm_bm_block_size(bm));
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 58b3197..8aae062 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -628,39 +628,40 @@
new_parent = shadow_current(s);
+ pn = dm_block_data(new_parent);
+ size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
+ sizeof(__le64) : s->info->value_type.size;
+
+ /* create & init the left block */
r = new_block(s->info, &left);
if (r < 0)
return r;
+ ln = dm_block_data(left);
+ nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
+
+ ln->header.flags = pn->header.flags;
+ ln->header.nr_entries = cpu_to_le32(nr_left);
+ ln->header.max_entries = pn->header.max_entries;
+ ln->header.value_size = pn->header.value_size;
+ memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
+ memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+
+ /* create & init the right block */
r = new_block(s->info, &right);
if (r < 0) {
unlock_block(s->info, left);
return r;
}
- pn = dm_block_data(new_parent);
- ln = dm_block_data(left);
rn = dm_block_data(right);
-
- nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
- ln->header.flags = pn->header.flags;
- ln->header.nr_entries = cpu_to_le32(nr_left);
- ln->header.max_entries = pn->header.max_entries;
- ln->header.value_size = pn->header.value_size;
-
rn->header.flags = pn->header.flags;
rn->header.nr_entries = cpu_to_le32(nr_right);
rn->header.max_entries = pn->header.max_entries;
rn->header.value_size = pn->header.value_size;
-
- memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
-
- size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
- sizeof(__le64) : s->info->value_type.size;
- memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
nr_right * size);
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 0a3b8ae..bd68f6f 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -190,6 +190,8 @@
static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
{
+ memset(ll, 0, sizeof(struct ll_disk));
+
ll->tm = tm;
ll->bitmap_info.tm = tm;
@@ -367,10 +369,6 @@
*/
dm_tm_unlock(ll->tm, blk);
continue;
-
- } else if (r < 0) {
- dm_tm_unlock(ll->tm, blk);
- return r;
}
dm_tm_unlock(ll->tm, blk);
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index aec4492..2532858 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -249,7 +249,7 @@
}
if (smm->recursion_count == 1)
- apply_bops(smm);
+ r = apply_bops(smm);
smm->recursion_count--;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ac1cffd..1e77228 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
raid0.c : Multiple Devices driver for Linux
Copyright (C) 1994-96 Marc ZYNGIER
@@ -7,14 +8,6 @@
RAID-0 management functions.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- You should have received a copy of the GNU General Public License
- (for example /usr/src/linux/COPYING); if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/blkdev.h>
@@ -26,6 +19,9 @@
#include "raid0.h"
#include "raid5.h"
+/*
+ * Fallback layout for multi-zone arrays whose mddev->layout is not one of
+ * the two enum r0layout values.  It must itself be set to 1 or 2 for such
+ * an array to assemble; any other value makes assembly fail.
+ */
+static int default_layout = 0;
+module_param(default_layout, int, 0644);
+
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
@@ -146,6 +142,22 @@
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
+
+ if (conf->nr_strip_zones == 1) {
+ conf->layout = RAID0_ORIG_LAYOUT;
+ } else if (mddev->layout == RAID0_ORIG_LAYOUT ||
+ mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = mddev->layout;
+ } else if (default_layout == RAID0_ORIG_LAYOUT ||
+ default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = default_layout;
+ } else {
+ pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
+ mdname(mddev));
+ pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
+ err = -ENOTSUPP;
+ goto abort;
+ }
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
@@ -542,7 +554,7 @@
!discard_bio)
continue;
bio_chain(discard_bio, bio);
- bio_clone_blkcg_association(discard_bio, bio);
+ bio_clone_blkg_association(discard_bio, bio);
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(rdev->bdev),
discard_bio, disk_devt(mddev->gendisk),
@@ -554,10 +566,12 @@
static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
{
+ struct r0conf *conf = mddev->private;
struct strip_zone *zone;
struct md_rdev *tmp_dev;
sector_t bio_sector;
sector_t sector;
+ sector_t orig_sector;
unsigned chunk_sects;
unsigned sectors;
@@ -591,8 +605,26 @@
bio = split;
}
+ /* find_zone() is handed &sector and may rewrite it; keep the original. */
+ orig_sector = sector;
zone = find_zone(mddev->private, &sector);
- tmp_dev = map_sector(mddev, zone, sector, &sector);
+ switch (conf->layout) {
+ case RAID0_ORIG_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
+ break;
+ case RAID0_ALT_MULTIZONE_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ break;
+ default:
+ /* WARN() takes the condition first; 1 makes the warning unconditional. */
+ WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
+ bio_io_error(bio);
+ return true;
+ }
+
+ if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) {
+ bio_io_error(bio);
+ return true;
+ }
+
bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 540e65d..3816e54 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -8,11 +8,25 @@
int nb_dev; /* # of devices attached to the zone */
};
+/* Linux 3.14 (20d0189b101) made an unintended change to
+ * the RAID0 layout for multi-zone arrays (where devices aren't all
+ * the same size).
+ * RAID0_ORIG_LAYOUT restores the original layout
+ * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout
+ * The layouts are identical when there is only one zone (all
+ * devices the same size).
+ *
+ * The numeric values are the ones accepted in mddev->layout and in the
+ * raid0.default_layout module parameter ("1 or 2").
+ */
+
+enum r0layout {
+ RAID0_ORIG_LAYOUT = 1,
+ RAID0_ALT_MULTIZONE_LAYOUT = 2,
+};
struct r0conf {
struct strip_zone *strip_zone;
struct md_rdev **devlist; /* lists of rdevs, pointed to
* by strip_zone->dev */
int nr_strip_zones;
+ enum r0layout layout;
};
#endif
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 400001b..54db341 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -3,12 +3,42 @@
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+/*
+ * Number of guaranteed raid bios in case of extreme VM load:
+ */
+#define NR_RAID_BIOS 256
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
+/* When there are this many requests queued to be written by
+ * the raid thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
+
/* for managing resync I/O pages */
struct resync_pages {
void *raid_bio;
struct page *pages[RESYNC_PAGES];
};
+/*
+ * Release a raid bio with kfree().  @data is accepted but not used.
+ * raid1.c calls this for its r1bio objects in place of the former
+ * r1bio_pool_free().
+ */
+static void rbio_pool_free(void *rbio, void *data)
+{
+ kfree(rbio);
+}
+
static inline int resync_alloc_pages(struct resync_pages *rp,
gfp_t gfp_flags)
{
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1d54109..0466ee2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* raid1.c : Multiple Devices driver for Linux
*
@@ -20,15 +21,6 @@
*
* Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
* - persistent bitmap code
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
@@ -50,31 +42,6 @@
(1L << MD_HAS_PPL) | \
(1L << MD_HAS_MULTIPLE_PPLS))
-/*
- * Number of guaranteed r1bios in case of extreme VM load:
- */
-#define NR_RAID1_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queue to be written by
- * the raid1 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
@@ -83,6 +50,57 @@
#include "raid1-10.c"
+/*
+ * Try to record the write-behind range [lo, hi) on @rdev.
+ *
+ * Returns 0 once the range has been added to rdev->wb_list, or -EBUSY if
+ * it overlaps a range that is already recorded there (nothing is added in
+ * that case and the preallocated entry goes back to the pool).
+ */
+static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+	struct mddev *mddev = rdev->mddev;
+	struct wb_info *new_wi, *cur;
+	unsigned long irq_flags;
+	int ret = 0;
+
+	/* Allocate before taking the lock; the entry is handed back on collision. */
+	new_wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);
+
+	spin_lock_irqsave(&rdev->wb_list_lock, irq_flags);
+	list_for_each_entry(cur, &rdev->wb_list, list) {
+		/* disjoint from this entry, keep looking */
+		if (hi <= cur->lo || lo >= cur->hi)
+			continue;
+		/* collision happened */
+		ret = -EBUSY;
+		break;
+	}
+
+	if (ret) {
+		mempool_free(new_wi, mddev->wb_info_pool);
+	} else {
+		new_wi->lo = lo;
+		new_wi->hi = hi;
+		list_add(&new_wi->list, &rdev->wb_list);
+	}
+	spin_unlock_irqrestore(&rdev->wb_list_lock, irq_flags);
+
+	return ret;
+}
+
+/*
+ * Drop the recorded write-behind range that matches [lo, hi) exactly from
+ * rdev->wb_list and return its entry to the pool, then wake any waiter on
+ * rdev->wb_io_wait.  Warns if no such range was recorded.
+ */
+static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+	struct mddev *mddev = rdev->mddev;
+	struct wb_info *cur;
+	unsigned long irq_flags;
+	bool removed = false;
+
+	spin_lock_irqsave(&rdev->wb_list_lock, irq_flags);
+	list_for_each_entry(cur, &rdev->wb_list, list) {
+		if (cur->hi != hi || cur->lo != lo)
+			continue;
+		/* Safe without the _safe iterator: we leave the loop right away. */
+		list_del(&cur->list);
+		mempool_free(cur, mddev->wb_info_pool);
+		removed = true;
+		break;
+	}
+
+	WARN(!removed, "The write behind IO is not recorded\n");
+	spin_unlock_irqrestore(&rdev->wb_list_lock, irq_flags);
+	wake_up(&rdev->wb_io_wait);
+}
+
/*
* for resync bio, r1bio pointer can be retrieved from the per-bio
* 'struct resync_pages'.
@@ -101,11 +119,6 @@
return kzalloc(size, gfp_flags);
}
-static void r1bio_pool_free(void *r1_bio, void *data)
-{
- kfree(r1_bio);
-}
-
#define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
@@ -181,7 +194,7 @@
kfree(rps);
out_free_r1bio:
- r1bio_pool_free(r1_bio, data);
+ rbio_pool_free(r1_bio, data);
return NULL;
}
@@ -201,7 +214,7 @@
/* resync pages array stored in the 1st bio's .bi_private */
kfree(rp);
- r1bio_pool_free(r1bio, data);
+ rbio_pool_free(r1bio, data);
}
static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
@@ -434,19 +447,21 @@
/* We never try FailFast to WriteMostly devices */
!test_bit(WriteMostly, &rdev->flags)) {
md_error(r1_bio->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R1BIO_WriteError, &r1_bio->state);
- else {
- /* Finished with this branch */
- r1_bio->bios[mirror] = NULL;
- to_put = bio;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, it is not necessary to
+ * handle write error.
+ * For failfast, this is the only remaining device,
+ * We need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
+ else {
+ /* Finished with this branch */
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ }
} else {
/*
* Set R1BIO_Uptodate in our master bio, so that we
@@ -484,6 +499,12 @@
}
if (behind) {
+ if (test_bit(WBCollisionCheck, &rdev->flags)) {
+ sector_t lo = r1_bio->sector;
+ sector_t hi = r1_bio->sector + r1_bio->sectors;
+
+ remove_wb(rdev, lo, hi);
+ }
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -853,8 +874,11 @@
* backgroup IO calls must call raise_barrier. Once that returns
* there is no normal IO happeing. It must arrange to call
* lower_barrier when the particular background IO completes.
+ *
+ * If resync/recovery is interrupted, returns -EINTR;
+ * Otherwise, returns 0.
*/
-static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
+static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
int idx = sector_to_idx(sector_nr);
@@ -1457,7 +1481,6 @@
if (!r1_bio->bios[i])
continue;
-
if (first_clone) {
/* do behind I/O ?
* Not if there are too many, or cannot
@@ -1482,7 +1505,16 @@
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) {
- if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+ if (test_bit(WBCollisionCheck, &rdev->flags)) {
+ sector_t lo = r1_bio->sector;
+ sector_t hi = r1_bio->sector + r1_bio->sectors;
+
+ wait_event(rdev->wb_io_wait,
+ check_and_add_wb(rdev, lo, hi) == 0);
+ }
+ if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
@@ -1585,12 +1617,12 @@
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disks with "fail_last_dev == false",
+ * ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& (conf->raid_disks - mddev->degraded) == 1) {
/*
* Don't fail the drive, act as though we were just a
@@ -1603,11 +1635,9 @@
return;
}
set_bit(Blocked, &rdev->flags);
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
- set_bit(Faulty, &rdev->flags);
- } else
- set_bit(Faulty, &rdev->flags);
+ set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
@@ -1739,9 +1769,8 @@
first = last = rdev->saved_raid_disk;
for (mirror = first; mirror <= last; mirror++) {
- p = conf->mirrors+mirror;
+ p = conf->mirrors + mirror;
if (!p->rdev) {
-
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -1863,6 +1892,36 @@
reschedule_retry(r1_bio);
}
+static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
+{ /* Run md_bitmap_end_sync() (last argument 1) across r1_bio's sector range. */
+ sector_t sync_blocks = 0;
+ sector_t s = r1_bio->sector;
+ long sectors_to_go = r1_bio->sectors;
+
+ /* make sure these bits don't get cleared. */
+ do {
+ md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
+ s += sync_blocks; /* sync_blocks was handed to the call above by address */
+ sectors_to_go -= sync_blocks;
+ } while (sectors_to_go > 0); /* stop once the whole range is accounted for */
+}
+
+static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+{ /* Decrement r1_bio->remaining; at zero, either retry or complete the r1_bio. */
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ struct mddev *mddev = r1_bio->mddev;
+ int s = r1_bio->sectors; /* captured before put_buf(r1_bio) below */
+
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio); /* MadeGood or WriteError is set */
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate); /* report s sectors with the caller's uptodate */
+ }
+ }
+}
+
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_status;
@@ -1874,15 +1933,7 @@
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
if (!uptodate) {
- sector_t sync_blocks = 0;
- sector_t s = r1_bio->sector;
- long sectors_to_go = r1_bio->sectors;
- /* make sure these bits doesn't get cleared. */
- do {
- md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
- s += sync_blocks;
- sectors_to_go -= sync_blocks;
- } while (sectors_to_go > 0);
+ abort_sync_write(mddev, r1_bio);
set_bit(WriteErrorSeen, &rdev->flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED, &
@@ -1897,16 +1948,7 @@
)
set_bit(R1BIO_MadeGood, &r1_bio->state);
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, uptodate);
- }
- }
+ put_sync_write_buf(r1_bio, uptodate);
}
static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2106,7 +2148,7 @@
}
r1_bio->read_disk = primary;
for (i = 0; i < conf->raid_disks * 2; i++) {
- int j;
+ int j = 0;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
blk_status_t status = sbio->bi_status;
@@ -2114,14 +2156,15 @@
struct page **spages = get_resync_pages(sbio)->pages;
struct bio_vec *bi;
int page_len[RESYNC_PAGES] = { 0 };
+ struct bvec_iter_all iter_all;
if (sbio->bi_end_io != end_sync_read)
continue;
/* Now we can 'fixup' the error value */
sbio->bi_status = 0;
- bio_for_each_segment_all(bi, sbio, j)
- page_len[j] = bi->bv_len;
+ bio_for_each_segment_all(bi, sbio, iter_all)
+ page_len[j++] = bi->bv_len;
if (!status) {
for (j = vcnt; j-- ; ) {
@@ -2172,8 +2215,10 @@
(i == r1_bio->read_disk ||
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
continue;
- if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+ if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
+ abort_sync_write(mddev, r1_bio);
continue;
+ }
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
@@ -2186,17 +2231,7 @@
generic_make_request(wbio);
}
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- /* if we're here, all write(s) have completed, so clean up */
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, 1);
- }
- }
+ put_sync_write_buf(r1_bio, 1);
}
/*
@@ -2889,7 +2924,6 @@
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
generic_make_request(bio);
-
}
return nr_sectors;
}
@@ -2948,8 +2982,8 @@
if (!conf->poolinfo)
goto abort;
conf->poolinfo->raid_disks = mddev->raid_disks * 2;
- err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free, conf->poolinfo);
+ err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
+ rbio_pool_free, conf->poolinfo);
if (err)
goto abort;
@@ -3090,11 +3124,18 @@
}
mddev->degraded = 0;
- for (i=0; i < conf->raid_disks; i++)
+ for (i = 0; i < conf->raid_disks; i++)
if (conf->mirrors[i].rdev == NULL ||
!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
test_bit(Faulty, &conf->mirrors[i].rdev->flags))
mddev->degraded++;
+ /*
+ * RAID1 needs at least one active disk
+ */
+ if (conf->raid_disks - mddev->degraded < 1) {
+ ret = -EINVAL;
+ goto abort;
+ }
if (conf->raid_disks - mddev->degraded == 1)
mddev->recovery_cp = MaxSector;
@@ -3125,11 +3166,15 @@
mddev->queue);
}
- ret = md_integrity_register(mddev);
+ ret = md_integrity_register(mddev);
if (ret) {
md_unregister_thread(&mddev->thread);
- raid1_free(mddev, conf);
+ goto abort;
}
+ return 0;
+
+abort:
+ raid1_free(mddev, conf);
return ret;
}
@@ -3233,8 +3278,8 @@
newpoolinfo->mddev = mddev;
newpoolinfo->raid_disks = raid_disks * 2;
- ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free, newpoolinfo);
+ ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
+ rbio_pool_free, newpoolinfo);
if (ret) {
kfree(newpoolinfo);
return ret;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 811427e..8a62c92 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* raid10.c : Multiple Devices driver for Linux
*
@@ -6,16 +7,6 @@
* RAID-10 support for md.
*
* Base on code in raid1.c. See raid1.c for further copyright information.
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
@@ -25,6 +16,7 @@
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
+#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
@@ -72,31 +64,6 @@
* [B A] [D C] [B A] [E C D]
*/
-/*
- * Number of guaranteed r10bios in case of extreme VM load:
- */
-#define NR_RAID10_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queued to be written by
- * the raid10 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
@@ -131,11 +98,6 @@
return kzalloc(size, gfp_flags);
}
-static void r10bio_pool_free(void *r10_bio, void *data)
-{
- kfree(r10_bio);
-}
-
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
@@ -229,7 +191,7 @@
out_free_pages:
while (--j >= 0)
- resync_free_pages(&rps[j * 2]);
+ resync_free_pages(&rps[j]);
j = 0;
out_free_bio:
@@ -241,7 +203,7 @@
}
kfree(rps);
out_free_r10bio:
- r10bio_pool_free(r10_bio, conf);
+ rbio_pool_free(r10_bio, conf);
return NULL;
}
@@ -269,7 +231,7 @@
/* resync pages array stored in the 1st bio's .bi_private */
kfree(rp);
- r10bio_pool_free(r10bio, conf);
+ rbio_pool_free(r10bio, conf);
}
static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
@@ -503,19 +465,21 @@
if (test_bit(FailFast, &rdev->flags) &&
(bio->bi_opf & MD_FAILFAST)) {
md_error(rdev->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R10BIO_WriteError, &r10_bio->state);
- else {
- r10_bio->devs[slot].bio = NULL;
- to_put = bio;
- dec_rdev = 1;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, it is not necessary to
+ * handle write error.
+ * For failfast, this is the only remaining device,
+ * We need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_WriteError, &r10_bio->state);
+ else {
+ r10_bio->devs[slot].bio = NULL;
+ to_put = bio;
+ dec_rdev = 1;
+ }
}
} else {
/*
@@ -745,15 +709,19 @@
int sectors = r10_bio->sectors;
int best_good_sectors;
sector_t new_distance, best_dist;
- struct md_rdev *best_rdev, *rdev = NULL;
+ struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
int do_balance;
- int best_slot;
+ int best_dist_slot, best_pending_slot;
+ bool has_nonrot_disk = false;
+ unsigned int min_pending;
struct geom *geo = &conf->geo;
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
- best_slot = -1;
- best_rdev = NULL;
+ best_dist_slot = -1;
+ min_pending = UINT_MAX;
+ best_dist_rdev = NULL;
+ best_pending_rdev = NULL;
best_dist = MaxSector;
best_good_sectors = 0;
do_balance = 1;
@@ -775,6 +743,8 @@
sector_t first_bad;
int bad_sectors;
sector_t dev_sector;
+ unsigned int pending;
+ bool nonrot;
if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue;
@@ -811,8 +781,8 @@
first_bad - dev_sector;
if (good_sectors > best_good_sectors) {
best_good_sectors = good_sectors;
- best_slot = slot;
- best_rdev = rdev;
+ best_dist_slot = slot;
+ best_dist_rdev = rdev;
}
if (!do_balance)
/* Must read from here */
@@ -825,14 +795,23 @@
if (!do_balance)
break;
- if (best_slot >= 0)
+ nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+ has_nonrot_disk |= nonrot;
+ pending = atomic_read(&rdev->nr_pending);
+ if (min_pending > pending && nonrot) {
+ min_pending = pending;
+ best_pending_slot = slot;
+ best_pending_rdev = rdev;
+ }
+
+ if (best_dist_slot >= 0)
/* At least 2 disks to choose from so failfast is OK */
set_bit(R10BIO_FailFast, &r10_bio->state);
/* This optimisation is debatable, and completely destroys
* sequential read speed for 'far copies' arrays. So only
* keep it for 'near' arrays, and review those later.
*/
- if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+ if (geo->near_copies > 1 && !pending)
new_distance = 0;
/* for far > 1 always use the lowest address */
@@ -841,15 +820,21 @@
else
new_distance = abs(r10_bio->devs[slot].addr -
conf->mirrors[disk].head_position);
+
if (new_distance < best_dist) {
best_dist = new_distance;
- best_slot = slot;
- best_rdev = rdev;
+ best_dist_slot = slot;
+ best_dist_rdev = rdev;
}
}
if (slot >= conf->copies) {
- slot = best_slot;
- rdev = best_rdev;
+ if (has_nonrot_disk) {
+ slot = best_pending_slot;
+ rdev = best_pending_rdev;
+ } else {
+ slot = best_dist_slot;
+ rdev = best_dist_rdev;
+ }
}
if (slot >= 0) {
@@ -1123,6 +1108,29 @@
kfree(plug);
}
+/*
+ * 1. Register the new request and wait if the reconstruction thread has put
+ * up a bar for new requests. Continue immediately if no resync is active
+ * currently.
+ * 2. If the IO spans the reshape position, wait for the reshape to pass.
+ */
+static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
+ struct bio *bio, sector_t sectors)
+{
+ wait_barrier(conf);
+ while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ bio->bi_iter.bi_sector < conf->reshape_progress &&
+ bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
+ raid10_log(conf->mddev, "wait reshape");
+ allow_barrier(conf); /* paired with the wait_barrier() after the sleep */
+ wait_event(conf->wait_barrier,
+ conf->reshape_progress <= bio->bi_iter.bi_sector ||
+ conf->reshape_progress >= bio->bi_iter.bi_sector +
+ sectors); /* until reshape_progress is outside (start, start + sectors) */
+ wait_barrier(conf);
+ }
+}
+
static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
{
@@ -1131,7 +1139,6 @@
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
int max_sectors;
- sector_t sectors;
struct md_rdev *rdev;
char b[BDEVNAME_SIZE];
int slot = r10_bio->read_slot;
@@ -1165,30 +1172,8 @@
}
rcu_read_unlock();
}
- /*
- * Register the new request and wait if the reconstruction
- * thread has put up a bar for new requests.
- * Continue immediately if no resync is active currently.
- */
- wait_barrier(conf);
- sectors = r10_bio->sectors;
- while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- bio->bi_iter.bi_sector < conf->reshape_progress &&
- bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
- /*
- * IO spans the reshape position. Need to wait for reshape to
- * pass
- */
- raid10_log(conf->mddev, "wait reshape");
- allow_barrier(conf);
- wait_event(conf->wait_barrier,
- conf->reshape_progress <= bio->bi_iter.bi_sector ||
- conf->reshape_progress >= bio->bi_iter.bi_sector +
- sectors);
- wait_barrier(conf);
- }
-
+ regular_request_wait(mddev, conf, bio, r10_bio->sectors);
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
if (err_rdev) {
@@ -1208,7 +1193,9 @@
struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split);
bio_chain(split, bio);
+ allow_barrier(conf);
generic_make_request(bio);
+ wait_barrier(conf);
bio = split;
r10_bio->master_bio = bio;
r10_bio->sectors = max_sectors;
@@ -1331,30 +1318,8 @@
finish_wait(&conf->wait_barrier, &w);
}
- /*
- * Register the new request and wait if the reconstruction
- * thread has put up a bar for new requests.
- * Continue immediately if no resync is active currently.
- */
- wait_barrier(conf);
-
sectors = r10_bio->sectors;
- while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- bio->bi_iter.bi_sector < conf->reshape_progress &&
- bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
- /*
- * IO spans the reshape position. Need to wait for reshape to
- * pass
- */
- raid10_log(conf->mddev, "wait reshape");
- allow_barrier(conf);
- wait_event(conf->wait_barrier,
- conf->reshape_progress <= bio->bi_iter.bi_sector ||
- conf->reshape_progress >= bio->bi_iter.bi_sector +
- sectors);
- wait_barrier(conf);
- }
-
+ regular_request_wait(mddev, conf, bio, sectors);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
(mddev->reshape_backwards
? (bio->bi_iter.bi_sector < conf->reshape_safe &&
@@ -1513,7 +1478,9 @@
struct bio *split = bio_split(bio, r10_bio->sectors,
GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
+ allow_barrier(conf);
generic_make_request(bio);
+ wait_barrier(conf);
bio = split;
r10_bio->master_bio = bio;
}
@@ -1673,12 +1640,12 @@
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disk with "fail_last_dev == false",
+ * ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& !enough(conf, rdev->raid_disk)) {
/*
* Don't fail the drive, just return an IO error.
@@ -3080,6 +3047,8 @@
sector_t sect;
int must_sync;
int any_working;
+ int need_recover = 0;
+ int need_replace = 0;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
@@ -3087,11 +3056,15 @@
mrdev = rcu_dereference(mirror->rdev);
mreplace = rcu_dereference(mirror->replacement);
- if ((mrdev == NULL ||
- test_bit(Faulty, &mrdev->flags) ||
- test_bit(In_sync, &mrdev->flags)) &&
- (mreplace == NULL ||
- test_bit(Faulty, &mreplace->flags))) {
+ if (mrdev != NULL &&
+ !test_bit(Faulty, &mrdev->flags) &&
+ !test_bit(In_sync, &mrdev->flags))
+ need_recover = 1;
+ if (mreplace != NULL &&
+ !test_bit(Faulty, &mreplace->flags))
+ need_replace = 1;
+
+ if (!need_recover && !need_replace) {
rcu_read_unlock();
continue;
}
@@ -3214,7 +3187,7 @@
r10_bio->devs[1].devnum = i;
r10_bio->devs[1].addr = to_addr;
- if (!test_bit(In_sync, &mrdev->flags)) {
+ if (need_recover) {
bio = r10_bio->devs[1].bio;
bio->bi_next = biolist;
biolist = bio;
@@ -3231,16 +3204,11 @@
bio = r10_bio->devs[1].repl_bio;
if (bio)
bio->bi_end_io = NULL;
- /* Note: if mreplace != NULL, then bio
+ /* Note: if need_replace, then bio
* cannot be NULL as r10buf_pool_alloc will
* have allocated it.
- * So the second test here is pointless.
- * But it keeps semantic-checkers happy, and
- * this comment keeps human reviewers
- * happy.
*/
- if (mreplace == NULL || bio == NULL ||
- test_bit(Faulty, &mreplace->flags))
+ if (!need_replace)
break;
bio->bi_next = biolist;
biolist = bio;
@@ -3700,8 +3668,8 @@
conf->geo = geo;
conf->copies = copies;
- err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
- r10bio_pool_free, conf);
+ err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
+ rbio_pool_free, conf);
if (err)
goto out;
@@ -3955,6 +3923,8 @@
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
+ if (!mddev->sync_thread)
+ goto out_free_conf;
}
return 0;
@@ -4287,12 +4257,46 @@
spin_unlock_irq(&conf->device_lock);
if (mddev->delta_disks && mddev->bitmap) {
- ret = md_bitmap_resize(mddev->bitmap,
- raid10_size(mddev, 0, conf->geo.raid_disks),
- 0, 0);
+ struct mdp_superblock_1 *sb = NULL;
+ sector_t oldsize, newsize;
+
+ oldsize = raid10_size(mddev, 0, 0);
+ newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
+
+ if (!mddev_is_clustered(mddev)) {
+ ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+ if (ret)
+ goto abort;
+ else
+ goto out;
+ }
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk > -1 &&
+ !test_bit(Faulty, &rdev->flags))
+ sb = page_address(rdev->sb_page);
+ }
+
+ /*
+ * some node is already performing reshape, and no need to
+ * call md_bitmap_resize again since it should be called when
+ * receiving BITMAP_RESIZE msg
+ */
+ if ((sb && (le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
+ goto out;
+
+ ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
if (ret)
goto abort;
+
+ ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+ if (ret) {
+ md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+ goto abort;
+ }
}
+out:
if (mddev->delta_disks > 0) {
rdev_for_each(rdev, mddev)
if (rdev->raid_disk < 0 &&
@@ -4569,6 +4573,32 @@
r10_bio->master_bio = read_bio;
r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
+ /*
+ * Broadcast a RESYNC message to the other nodes so that none of them
+ * writes to the region, avoiding conflicts.
+ */
+ if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
+ struct mdp_superblock_1 *sb = NULL;
+ sector_t sb_reshape_pos = 0; /* not int: must hold the le64 value untruncated */
+
+ conf->cluster_sync_low = sector_nr;
+ conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
+ sb = page_address(rdev->sb_page);
+ if (sb) {
+ sb_reshape_pos = le64_to_cpu(sb->reshape_position);
+ /*
+ * Set cluster_sync_low again if the next address for the array
+ * reshape is less than cluster_sync_low, since we can't
+ * update cluster_sync_low until the reshape has finished.
+ */
+ if (sb_reshape_pos < conf->cluster_sync_low)
+ conf->cluster_sync_low = sb_reshape_pos;
+ }
+
+ md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+ conf->cluster_sync_high);
+ }
+
/* Now find the locations in the new layout */
__raid10_find_phys(&conf->geo, r10_bio);
@@ -4626,7 +4656,6 @@
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
generic_make_request(read_bio);
- sector_nr += nr_sectors;
sectors_done += nr_sectors;
if (sector_nr <= last)
goto read_more;
@@ -4720,6 +4749,19 @@
conf->fullsync = 0;
}
+static void raid10_update_reshape_pos(struct mddev *mddev)
+{ /* Copy mddev->reshape_position into conf->reshape_progress when it is acceptable. */
+ struct r10conf *conf = mddev->private;
+ sector_t lo, hi;
+
+ md_cluster_ops->resync_info_get(mddev, &lo, &hi); /* lo/hi passed by address */
+ if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
+ || mddev->reshape_position == MaxSector)
+ conf->reshape_progress = mddev->reshape_position; /* within [lo, hi] or MaxSector */
+ else
+ WARN_ON_ONCE(1); /* any other position: warn, reshape_progress is left as is */
+}
+
static int handle_reshape_read_error(struct mddev *mddev,
struct r10bio *r10_bio)
{
@@ -4731,8 +4773,7 @@
int idx = 0;
struct page **pages;
- r10b = kmalloc(sizeof(*r10b) +
- sizeof(struct r10dev) * conf->copies, GFP_NOIO);
+ r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
if (!r10b) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return -ENOMEM;
@@ -4888,6 +4929,7 @@
.check_reshape = raid10_check_reshape,
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
+ .update_reshape_pos = raid10_update_reshape_pos,
.congested = raid10_congested,
};
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index e6e925a..9b6da75 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2015 Shaohua Li <shli@fb.com>
* Copyright (C) 2016 Song Liu <songliubraving@fb.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
*/
#include <linux/kernel.h>
#include <linux/wait.h>
@@ -1935,12 +1926,14 @@
}
static struct stripe_head *
-r5c_recovery_alloc_stripe(struct r5conf *conf,
- sector_t stripe_sect)
+r5c_recovery_alloc_stripe(
+ struct r5conf *conf,
+ sector_t stripe_sect,
+ int noblock)
{
struct stripe_head *sh;
- sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
+ sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);
if (!sh)
return NULL; /* no more stripe available */
@@ -2150,7 +2143,7 @@
stripe_sect);
if (!sh) {
- sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
+ sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
/*
* cannot get stripe from raid5_get_active_stripe
* try replay some stripes
@@ -2159,20 +2152,29 @@
r5c_recovery_replay_stripes(
cached_stripe_list, ctx);
sh = r5c_recovery_alloc_stripe(
- conf, stripe_sect);
+ conf, stripe_sect, 1);
}
if (!sh) {
+ int new_size = conf->min_nr_stripes * 2;
pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
mdname(mddev),
- conf->min_nr_stripes * 2);
- raid5_set_cache_size(mddev,
- conf->min_nr_stripes * 2);
- sh = r5c_recovery_alloc_stripe(conf,
- stripe_sect);
+ new_size);
+ ret = raid5_set_cache_size(mddev, new_size);
+ if (conf->min_nr_stripes <= new_size / 2) {
+ pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
+ mdname(mddev),
+ ret,
+ new_size,
+ conf->min_nr_stripes,
+ conf->max_nr_stripes);
+ return -ENOMEM;
+ }
+ sh = r5c_recovery_alloc_stripe(
+ conf, stripe_sect, 0);
}
if (!sh) {
pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
- mdname(mddev));
+ mdname(mddev));
return -ENOMEM;
}
list_add_tail(&sh->lru, cached_stripe_list);
@@ -3151,8 +3153,6 @@
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
- rcu_assign_pointer(conf->log, NULL);
- md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
mempool_exit(&log->meta_pool);
out_mempool:
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index bfb8114..43c714a 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -45,6 +45,7 @@
extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
extern void ppl_quiesce(struct r5conf *conf, int quiesce);
extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern struct md_sysfs_entry ppl_write_hint;
static inline bool raid5_has_log(struct r5conf *conf)
{
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 3a7c363..18a4064 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1,26 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Partial Parity Log for closing the RAID5 write hole
* Copyright (c) 2017, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
-#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"
+#include "raid5-log.h"
/*
* PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
@@ -116,6 +108,8 @@
/* stripes to retry if failed to allocate io_unit */
struct list_head no_mem_stripes;
spinlock_t no_mem_stripes_lock;
+
+ unsigned short write_hint;
};
struct ppl_log {
@@ -165,7 +159,7 @@
struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
- struct page **srcs = flex_array_get(percpu->scribble, 0);
+ struct page **srcs = percpu->scribble;
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
@@ -196,8 +190,7 @@
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
- NULL, sh, flex_array_get(percpu->scribble, 0)
- + sizeof(struct page *) * (sh->disks + 2));
+ NULL, sh, (void *) (srcs + sh->disks + 2));
if (count == 1)
tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
@@ -476,6 +469,7 @@
bio_set_dev(bio, log->rdev->bdev);
bio->bi_iter.bi_sector = log->next_io_sector;
bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
+ bio->bi_write_hint = ppl_conf->write_hint;
pr_debug("%s: log->current_io_sector: %llu\n", __func__,
(unsigned long long)log->next_io_sector);
@@ -505,6 +499,7 @@
bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
&ppl_conf->bs);
bio->bi_opf = prev->bi_opf;
+ bio->bi_write_hint = prev->bi_write_hint;
bio_copy_dev(bio, prev);
bio->bi_iter.bi_sector = bio_end_sector(prev);
bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
@@ -1409,6 +1404,7 @@
atomic64_set(&ppl_conf->seq, 0);
INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
spin_lock_init(&ppl_conf->no_mem_stripes_lock);
+ ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET;
if (!mddev->external) {
ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
@@ -1503,3 +1499,60 @@
return ret;
}
+
+static ssize_t
+ppl_write_hint_show(struct mddev *mddev, char *buf)
+{ /* Print the PPL write_hint into buf, or 0 when no ppl_conf is available. */
+ size_t ret = 0;
+ struct r5conf *conf;
+ struct ppl_conf *ppl_conf = NULL;
+
+ spin_lock(&mddev->lock); /* held across the read of mddev->private */
+ conf = mddev->private;
+ if (conf && raid5_has_ppl(conf))
+ ppl_conf = conf->log_private;
+ ret = sprintf(buf, "%d\n", ppl_conf ? ppl_conf->write_hint : 0);
+ spin_unlock(&mddev->lock);
+
+ return ret; /* the value returned by sprintf() */
+}
+
+static ssize_t
+ppl_write_hint_store(struct mddev *mddev, const char *page, size_t len)
+{ /* Parse a decimal u16 from page and store it as the PPL write_hint. */
+ struct r5conf *conf;
+ struct ppl_conf *ppl_conf;
+ int err = 0;
+ unsigned short new;
+
+ if (len >= PAGE_SIZE) /* reject input that is PAGE_SIZE bytes or longer */
+ return -EINVAL;
+ if (kstrtou16(page, 10, &new)) /* must be a base-10 value that fits in u16 */
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ conf = mddev->private;
+ if (!conf) {
+ err = -ENODEV;
+ } else if (raid5_has_ppl(conf)) {
+ ppl_conf = conf->log_private;
+ if (!ppl_conf)
+ err = -EINVAL;
+ else
+ ppl_conf->write_hint = new;
+ } else {
+ err = -EINVAL; /* conf exists but raid5_has_ppl() is false */
+ }
+
+ mddev_unlock(mddev);
+
+ return err ?: len; /* err if non-zero, otherwise len */
+}
+
+struct md_sysfs_entry
+ppl_write_hint = __ATTR(ppl_write_hint, S_IRUGO | S_IWUSR,
+ ppl_write_hint_show,
+ ppl_write_hint_store); /* binds the ppl_write_hint show/store handlers */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e4e98f4..223e97a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* raid5.c : Multiple Devices driver for Linux
* Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
@@ -7,15 +8,6 @@
* RAID-4/5/6 management functions.
* Thanks to Penguin Computing for making the RAID-6 development possible
* by donating a test server!
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
@@ -54,7 +46,6 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
-#include <linux/flex_array.h>
#include <trace/events/block.h>
#include <linux/list_sort.h>
@@ -712,6 +703,8 @@
}
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+ __acquires(&sh1->stripe_lock)
+ __acquires(&sh2->stripe_lock)
{
if (sh1 > sh2) {
spin_lock_irq(&sh2->stripe_lock);
@@ -723,6 +716,8 @@
}
static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+ __releases(&sh1->stripe_lock)
+ __releases(&sh2->stripe_lock)
{
spin_unlock(&sh1->stripe_lock);
spin_unlock_irq(&sh2->stripe_lock);
@@ -1394,22 +1389,16 @@
}
/* return a pointer to the address conversion region of the scribble buffer */
-static addr_conv_t *to_addr_conv(struct stripe_head *sh,
- struct raid5_percpu *percpu, int i)
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
{
- void *addr;
-
- addr = flex_array_get(percpu->scribble, i);
- return addr + sizeof(struct page *) * (sh->disks + 2);
+ return percpu->scribble + i * percpu->scribble_obj_size;
}
/* return a pointer to the address conversion region of the scribble buffer */
-static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
+ struct raid5_percpu *percpu, int i)
{
- void *addr;
-
- addr = flex_array_get(percpu->scribble, i);
- return addr;
+ return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
}
static struct dma_async_tx_descriptor *
@@ -2238,21 +2227,23 @@
* calculate over all devices (not just the data blocks), using zeros in place
* of the P and Q blocks.
*/
-static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
+static int scribble_alloc(struct raid5_percpu *percpu,
+ int num, int cnt, gfp_t flags)
{
- struct flex_array *ret;
- size_t len;
+ size_t obj_size =
+ sizeof(struct page *) * (num+2) +
+ sizeof(addr_conv_t) * (num+2);
+ void *scribble;
- len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
- ret = flex_array_alloc(len, cnt, flags);
- if (!ret)
- return NULL;
- /* always prealloc all elements, so no locking is required */
- if (flex_array_prealloc(ret, 0, cnt, flags)) {
- flex_array_free(ret);
- return NULL;
- }
- return ret;
+ scribble = kvmalloc_array(cnt, obj_size, flags);
+ if (!scribble)
+ return -ENOMEM;
+
+ kvfree(percpu->scribble);
+
+ percpu->scribble = scribble;
+ percpu->scribble_obj_size = obj_size;
+ return 0;
}
static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
@@ -2270,23 +2261,18 @@
return 0;
mddev_suspend(conf->mddev);
get_online_cpus();
+
for_each_present_cpu(cpu) {
struct raid5_percpu *percpu;
- struct flex_array *scribble;
percpu = per_cpu_ptr(conf->percpu, cpu);
- scribble = scribble_alloc(new_disks,
- new_sectors / STRIPE_SECTORS,
- GFP_NOIO);
-
- if (scribble) {
- flex_array_free(percpu->scribble);
- percpu->scribble = scribble;
- } else {
- err = -ENOMEM;
+ err = scribble_alloc(percpu, new_disks,
+ new_sectors / STRIPE_SECTORS,
+ GFP_NOIO);
+ if (err)
break;
- }
}
+
put_online_cpus();
mddev_resume(conf->mddev);
if (!err) {
@@ -2540,7 +2526,8 @@
int set_bad = 0;
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&rdev->read_errors);
+ if (!(bi->bi_status == BLK_STS_PROTECTION))
+ atomic_inc(&rdev->read_errors);
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
pr_warn_ratelimited(
"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
@@ -2563,16 +2550,24 @@
(unsigned long long)s,
bdn);
} else if (atomic_read(&rdev->read_errors)
- > conf->max_nr_stripes)
- pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
- mdname(conf->mddev), bdn);
- else
+ > conf->max_nr_stripes) {
+ if (!test_bit(Faulty, &rdev->flags)) {
+ pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
+ mdname(conf->mddev),
+ atomic_read(&rdev->read_errors),
+ conf->max_nr_stripes);
+ pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
+ mdname(conf->mddev), bdn);
+ }
+ } else
retry = 1;
if (set_bad && test_bit(In_sync, &rdev->flags)
&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
retry = 1;
if (retry)
- if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+ if (sh->qd_idx >= 0 && sh->pd_idx == i)
+ set_bit(R5_ReadError, &sh->dev[i].flags);
+ else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
set_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
} else
@@ -2681,6 +2676,18 @@
pr_debug("raid456: error called\n");
spin_lock_irqsave(&conf->device_lock, flags);
+
+ if (test_bit(In_sync, &rdev->flags) &&
+ mddev->degraded == conf->max_degraded) {
+ /*
+ * Don't allow to achieve failed state
+ * Don't try to recover this device
+ */
+ conf->recovery_disabled = mddev->recovery_disabled;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ return;
+ }
+
set_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
mddev->degraded = raid5_calc_degraded(conf);
@@ -4185,7 +4192,7 @@
/* now write out any block on a failed drive,
* or P or Q if they were recomputed
*/
- BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
+ dev = NULL;
if (s->failed == 2) {
dev = &sh->dev[s->failed_num[1]];
s->locked++;
@@ -4210,6 +4217,14 @@
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
}
+ if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
+ "%s: disk%td not up to date\n",
+ mdname(conf->mddev),
+ dev - (struct r5dev *) &sh->dev)) {
+ clear_bit(R5_LOCKED, &dev->flags);
+ clear_bit(R5_Wantwrite, &dev->flags);
+ s->locked--;
+ }
clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
@@ -4606,7 +4621,6 @@
(1 << STRIPE_FULL_WRITE) |
(1 << STRIPE_BIOFILL_RUN) |
(1 << STRIPE_COMPUTE_RUN) |
- (1 << STRIPE_OPS_REQ_PENDING) |
(1 << STRIPE_DISCARD) |
(1 << STRIPE_BATCH_READY) |
(1 << STRIPE_BATCH_ERR) |
@@ -5245,7 +5259,6 @@
rcu_read_unlock();
raid_bio->bi_next = (void*)rdev;
bio_set_dev(align_bi, rdev->bdev);
- bio_clear_flag(align_bi, BIO_SEG_VALID);
if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
bio_sectors(align_bi),
@@ -5486,7 +5499,7 @@
return;
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
+ last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
@@ -5713,7 +5726,8 @@
do_flush = false;
}
- set_bit(STRIPE_HANDLE, &sh->state);
+ if (!sh->batch_head)
+ set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
@@ -6164,6 +6178,8 @@
static int handle_active_stripes(struct r5conf *conf, int group,
struct r5worker *worker,
struct list_head *temp_inactive_list)
+ __releases(&conf->device_lock)
+ __acquires(&conf->device_lock)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
int i, batch_size = 0, hash;
@@ -6357,6 +6373,7 @@
int
raid5_set_cache_size(struct mddev *mddev, int size)
{
+ int result = 0;
struct r5conf *conf = mddev->private;
if (size <= 16 || size > 32768)
@@ -6373,11 +6390,14 @@
mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
- if (!grow_one_stripe(conf, GFP_KERNEL))
+ if (!grow_one_stripe(conf, GFP_KERNEL)) {
+ conf->min_nr_stripes = conf->max_nr_stripes;
+ result = -ENOMEM;
break;
+ }
mutex_unlock(&conf->cache_size_mutex);
- return 0;
+ return result;
}
EXPORT_SYMBOL(raid5_set_cache_size);
@@ -6644,6 +6664,7 @@
&raid5_skip_copy.attr,
&raid5_rmw_level.attr,
&r5c_journal_mode.attr,
+ &ppl_write_hint.attr,
NULL,
};
static struct attribute_group raid5_attrs_group = {
@@ -6726,25 +6747,26 @@
static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
safe_put_page(percpu->spare_page);
- if (percpu->scribble)
- flex_array_free(percpu->scribble);
percpu->spare_page = NULL;
+ kvfree(percpu->scribble);
percpu->scribble = NULL;
}
static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
- if (conf->level == 6 && !percpu->spare_page)
+ if (conf->level == 6 && !percpu->spare_page) {
percpu->spare_page = alloc_page(GFP_KERNEL);
- if (!percpu->scribble)
- percpu->scribble = scribble_alloc(max(conf->raid_disks,
- conf->previous_raid_disks),
- max(conf->chunk_sectors,
- conf->prev_chunk_sectors)
- / STRIPE_SECTORS,
- GFP_KERNEL);
+ if (!percpu->spare_page)
+ return -ENOMEM;
+ }
- if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
+ if (scribble_alloc(percpu,
+ max(conf->raid_disks,
+ conf->previous_raid_disks),
+ max(conf->chunk_sectors,
+ conf->prev_chunk_sectors)
+ / STRIPE_SECTORS,
+ GFP_KERNEL)) {
free_scratch_buffer(conf, percpu);
return -ENOMEM;
}
@@ -7386,6 +7408,8 @@
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
+ if (!mddev->sync_thread)
+ goto abort;
}
/* Ok, everything is just fine now */
@@ -7656,7 +7680,7 @@
static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
- int err = -EEXIST;
+ int ret, err = -EEXIST;
int disk;
struct disk_info *p;
int first = 0;
@@ -7671,7 +7695,14 @@
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- log_init(conf, rdev, false);
+ ret = log_init(conf, rdev, false);
+ if (ret)
+ return ret;
+
+ ret = r5l_start(conf->log);
+ if (ret)
+ return ret;
+
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8474c22..f90e070 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -357,7 +357,6 @@
STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
STRIPE_BIOFILL_RUN,
STRIPE_COMPUTE_RUN,
- STRIPE_OPS_REQ_PENDING,
STRIPE_ON_UNPLUG_LIST,
STRIPE_DISCARD,
STRIPE_ON_RELEASE_LIST,
@@ -493,9 +492,7 @@
*/
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
- int sectors = bio_sectors(bio);
-
- if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
+ if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
return bio->bi_next;
else
return NULL;
@@ -638,10 +635,11 @@
/* per cpu variables */
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */
- struct flex_array *scribble; /* space for constructing buffer
- * lists and performing address
- * conversions
- */
+ void *scribble; /* space for constructing buffer
+ * lists and performing address
+ * conversions
+ */
+ int scribble_obj_size;
} __percpu *percpu;
int scribble_disks;
int scribble_sectors;