Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
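
Most of the drivers/md/Kconfig churn below is the mechanical upstream
conversion of the legacy "---help---" marker to the bare "help" keyword,
plus http -> https link updates; the functional additions are the new
DM_EBS, DM_MULTIPATH_HST and BCACHE_ASYNC_REGISTRATION options and the
bcache rework toward a single cache device per cache set. A minimal
sketch of the help-keyword conversion, using a hypothetical DM_EXAMPLE
symbol that is not part of this patch:

    config DM_EXAMPLE
            tristate "Example target"
            depends on BLK_DEV_DM
            help
              The help text is now introduced by the plain "help"
              keyword; the legacy "---help---" spelling is converted
              throughout this update.
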
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 7dd6e98..2cefb07 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -15,7 +15,7 @@
config BLK_DEV_MD
tristate "RAID support"
- ---help---
+ help
This driver lets you combine several hard disk partitions into one
logical block device. This can be used to simply append one
partition to another one or to combine several redundant hard disks
@@ -27,7 +27,7 @@
More information about Software RAID on Linux is contained in the
Software RAID mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. There you will also learn
+ <https://www.tldp.org/docs.html#howto>. There you will also learn
where to get the supporting user space utilities raidtools.
If unsure, say N.
@@ -36,11 +36,11 @@
bool "Autodetect RAID arrays during kernel boot"
depends on BLK_DEV_MD=y
default y
- ---help---
+ help
If you say Y here, then the kernel will try to autodetect raid
- arrays as part of its boot process.
+ arrays as part of its boot process.
- If you don't use raid and say Y, this autodetection can cause
+ If you don't use raid and say Y, this autodetection can cause
a several-second delay in the boot time due to various
synchronisation steps that are part of this step.
@@ -49,7 +49,7 @@
config MD_LINEAR
tristate "Linear (append) mode"
depends on BLK_DEV_MD
- ---help---
+ help
If you say Y here, then your multiple devices driver will be able to
use the so-called linear mode, i.e. it will combine the hard disk
partitions by simply appending one to the other.
@@ -62,7 +62,7 @@
config MD_RAID0
tristate "RAID-0 (striping) mode"
depends on BLK_DEV_MD
- ---help---
+ help
If you say Y here, then your multiple devices driver will be able to
use the so-called raid0 mode, i.e. it will combine the hard disk
partitions into one logical device in such a fashion as to fill them
@@ -71,7 +71,7 @@
Information about Software RAID on Linux is contained in the
Software-RAID mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. There you will also
+ <https://www.tldp.org/docs.html#howto>. There you will also
learn where to get the supporting user space utilities raidtools.
To compile this as a module, choose M here: the module
@@ -82,7 +82,7 @@
config MD_RAID1
tristate "RAID-1 (mirroring) mode"
depends on BLK_DEV_MD
- ---help---
+ help
A RAID-1 set consists of several disk drives which are exact copies
of each other. In the event of a mirror failure, the RAID driver
will continue to use the operational mirrors in the set, providing
@@ -93,7 +93,7 @@
Information about Software RAID on Linux is contained in the
Software-RAID mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. There you will also
+ <https://www.tldp.org/docs.html#howto>. There you will also
learn where to get the supporting user space utilities raidtools.
If you want to use such a RAID-1 set, say Y. To compile this code
@@ -104,7 +104,7 @@
config MD_RAID10
tristate "RAID-10 (mirrored striping) mode"
depends on BLK_DEV_MD
- ---help---
+ help
RAID-10 provides a combination of striping (RAID-0) and
mirroring (RAID-1) with easier configuration and more flexible
layout.
@@ -129,7 +129,7 @@
select ASYNC_XOR
select ASYNC_PQ
select ASYNC_RAID6_RECOV
- ---help---
+ help
A RAID-5 set of N drives with a capacity of C MB per drive provides
the capacity of C * (N - 1) MB, and protects against a failure
of a single drive. For a given sector (row) number, (N - 1) drives
@@ -148,7 +148,7 @@
Information about Software RAID on Linux is contained in the
Software-RAID mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. There you will also
+ <https://www.tldp.org/docs.html#howto>. There you will also
learn where to get the supporting user space utilities raidtools.
If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To
@@ -183,7 +183,7 @@
depends on BLK_DEV_MD
depends on DLM
default n
- ---help---
+ help
Clustering support for MD devices. This enables locking and
synchronization across multiple systems on the cluster, so all
nodes in the cluster can access the MD devices simultaneously.
@@ -203,7 +203,7 @@
tristate "Device mapper support"
select BLK_DEV_DM_BUILTIN
depends on DAX || DAX=n
- ---help---
+ help
Device-mapper is a low level volume manager. It works by allowing
people to specify mappings for ranges of logical sectors. Various
mapping types are available, in addition people may write their own
@@ -219,7 +219,7 @@
config DM_DEBUG
bool "Device mapper debugging support"
depends on BLK_DEV_DM
- ---help---
+ help
Enable this for messages that may help debug device-mapper problems.
If unsure, say N.
@@ -227,7 +227,7 @@
config DM_BUFIO
tristate
depends on BLK_DEV_DM
- ---help---
+ help
This interface allows you to do buffered I/O on a device and acts
as a cache, holding recently-read blocks in memory and performing
delayed writes.
@@ -235,7 +235,7 @@
config DM_DEBUG_BLOCK_MANAGER_LOCKING
bool "Block manager locking"
depends on DM_BUFIO
- ---help---
+ help
Block manager locking can catch various metadata corruption issues.
If unsure, say N.
@@ -244,7 +244,7 @@
bool "Keep stack trace of persistent data block lock holders"
depends on STACKTRACE_SUPPORT && DM_DEBUG_BLOCK_MANAGER_LOCKING
select STACKTRACE
- ---help---
+ help
Enable this for messages that may help debug problems with the
block manager locking used by thin provisioning and caching.
@@ -253,7 +253,7 @@
config DM_BIO_PRISON
tristate
depends on BLK_DEV_DM
- ---help---
+ help
Some bio locking schemes used by other device-mapper targets
including thin provisioning.
@@ -262,17 +262,18 @@
config DM_UNSTRIPED
tristate "Unstriped target"
depends on BLK_DEV_DM
- ---help---
+ help
Unstripes I/O so it is issued solely on a single drive in a HW
RAID0 or dm-striped target.
config DM_CRYPT
tristate "Crypt target support"
depends on BLK_DEV_DM
+ depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
select CRYPTO
select CRYPTO_CBC
select CRYPTO_ESSIV
- ---help---
+ help
This device-mapper target allows you to create a device that
transparently encrypts the data on it. You'll need to activate
the ciphers you're going to use in the cryptoapi configuration.
@@ -289,16 +290,16 @@
tristate "Snapshot target"
depends on BLK_DEV_DM
select DM_BUFIO
- ---help---
- Allow volume managers to take writable snapshots of a device.
+ help
+ Allow volume managers to take writable snapshots of a device.
config DM_THIN_PROVISIONING
tristate "Thin provisioning target"
depends on BLK_DEV_DM
select DM_PERSISTENT_DATA
select DM_BIO_PRISON
- ---help---
- Provides thin provisioning and snapshots that share a data store.
+ help
+ Provides thin provisioning and snapshots that share a data store.
config DM_CACHE
tristate "Cache target (EXPERIMENTAL)"
@@ -306,29 +307,29 @@
default n
select DM_PERSISTENT_DATA
select DM_BIO_PRISON
- ---help---
- dm-cache attempts to improve performance of a block device by
- moving frequently used data to a smaller, higher performance
- device. Different 'policy' plugins can be used to change the
- algorithms used to select which blocks are promoted, demoted,
- cleaned etc. It supports writeback and writethrough modes.
+ help
+ dm-cache attempts to improve performance of a block device by
+ moving frequently used data to a smaller, higher performance
+ device. Different 'policy' plugins can be used to change the
+ algorithms used to select which blocks are promoted, demoted,
+ cleaned etc. It supports writeback and writethrough modes.
config DM_CACHE_SMQ
tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
depends on DM_CACHE
default y
- ---help---
- A cache policy that uses a multiqueue ordered by recent hits
- to select which blocks should be promoted and demoted.
- This is meant to be a general purpose policy. It prioritises
- reads over writes. This SMQ policy (vs MQ) offers the promise
- of less memory utilization, improved performance and increased
- adaptability in the face of changing workloads.
+ help
+ A cache policy that uses a multiqueue ordered by recent hits
+ to select which blocks should be promoted and demoted.
+ This is meant to be a general purpose policy. It prioritises
+ reads over writes. This SMQ policy (vs MQ) offers the promise
+ of less memory utilization, improved performance and increased
+ adaptability in the face of changing workloads.
config DM_WRITECACHE
tristate "Writecache target"
depends on BLK_DEV_DM
- ---help---
+ help
The writecache target caches writes on persistent memory or SSD.
It is intended for databases or other programs that need extremely
low commit latency.
@@ -336,43 +337,51 @@
The writecache target doesn't cache reads because reads are supposed
to be cached in standard RAM.
+config DM_EBS
+ tristate "Emulated block size target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ select DM_BUFIO
+ help
+ dm-ebs emulates smaller logical block size on backing devices
+ with larger ones (e.g. 512 byte sectors on 4K native disks).
+
config DM_ERA
tristate "Era target (EXPERIMENTAL)"
depends on BLK_DEV_DM
default n
select DM_PERSISTENT_DATA
select DM_BIO_PRISON
- ---help---
- dm-era tracks which parts of a block device are written to
- over time. Useful for maintaining cache coherency when using
- vendor snapshots.
+ help
+ dm-era tracks which parts of a block device are written to
+ over time. Useful for maintaining cache coherency when using
+ vendor snapshots.
config DM_CLONE
tristate "Clone target (EXPERIMENTAL)"
depends on BLK_DEV_DM
default n
select DM_PERSISTENT_DATA
- ---help---
- dm-clone produces a one-to-one copy of an existing, read-only source
- device into a writable destination device. The cloned device is
- visible/mountable immediately and the copy of the source device to the
- destination device happens in the background, in parallel with user
- I/O.
+ help
+ dm-clone produces a one-to-one copy of an existing, read-only source
+ device into a writable destination device. The cloned device is
+ visible/mountable immediately and the copy of the source device to the
+ destination device happens in the background, in parallel with user
+ I/O.
- If unsure, say N.
+ If unsure, say N.
config DM_MIRROR
tristate "Mirror target"
depends on BLK_DEV_DM
- ---help---
- Allow volume managers to mirror logical volumes, also
- needed for live data migration tools such as 'pvmove'.
+ help
+ Allow volume managers to mirror logical volumes, also
+ needed for live data migration tools such as 'pvmove'.
config DM_LOG_USERSPACE
tristate "Mirror userspace logging"
depends on DM_MIRROR && NET
select CONNECTOR
- ---help---
+ help
The userspace logging module provides a mechanism for
relaying the dm-dirty-log API to userspace. Log designs
which are more suited to userspace implementation (e.g.
@@ -387,7 +396,7 @@
select MD_RAID10
select MD_RAID456
select BLK_DEV_MD
- ---help---
+ help
A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
A RAID-5 set of N drives with a capacity of C MB per drive provides
@@ -409,7 +418,7 @@
config DM_ZERO
tristate "Zero target"
depends on BLK_DEV_DM
- ---help---
+ help
A target that discards writes, and returns all zeroes for
reads. Useful in some recovery situations.
@@ -421,13 +430,13 @@
# it is, DM_MULTIPATH must depend on it. We get a build
# error if SCSI_DH=m and DM_MULTIPATH=y
depends on !SCSI_DH || SCSI
- ---help---
+ help
Allow volume managers to support multipath hardware.
config DM_MULTIPATH_QL
tristate "I/O Path Selector based on the number of in-flight I/Os"
depends on DM_MULTIPATH
- ---help---
+ help
This path selector is a dynamic load balancer which selects
the path with the least number of in-flight I/Os.
@@ -436,17 +445,28 @@
config DM_MULTIPATH_ST
tristate "I/O Path Selector based on the service time"
depends on DM_MULTIPATH
- ---help---
+ help
This path selector is a dynamic load balancer which selects
the path expected to complete the incoming I/O in the shortest
time.
If unsure, say N.
+config DM_MULTIPATH_HST
+ tristate "I/O Path Selector based on historical service time"
+ depends on DM_MULTIPATH
+ help
+ This path selector is a dynamic load balancer which selects
+ the path expected to complete the incoming I/O in the shortest
+ time by comparing estimated service time (based on historical
+ service time).
+
+ If unsure, say N.
+
config DM_DELAY
tristate "I/O delaying target"
depends on BLK_DEV_DM
- ---help---
+ help
A target that delays reads and/or writes and can send
them to different devices. Useful for testing.
@@ -455,7 +475,7 @@
config DM_DUST
tristate "Bad sector simulation target"
depends on BLK_DEV_DM
- ---help---
+ help
A target that simulates bad sector behavior.
Useful for testing.
@@ -464,7 +484,7 @@
config DM_INIT
bool "DM \"dm-mod.create=\" parameter support"
depends on BLK_DEV_DM=y
- ---help---
+ help
Enable "dm-mod.create=" parameter to create mapped devices at init time.
This option is useful to allow mounting rootfs without requiring an
initramfs.
@@ -476,14 +496,14 @@
config DM_UEVENT
bool "DM uevents"
depends on BLK_DEV_DM
- ---help---
+ help
Generate udev events for DM events.
config DM_FLAKEY
tristate "Flakey target"
depends on BLK_DEV_DM
- ---help---
- A target that intermittently fails I/O for debugging purposes.
+ help
+ A target that intermittently fails I/O for debugging purposes.
config DM_VERITY
tristate "Verity target support"
@@ -491,7 +511,7 @@
select CRYPTO
select CRYPTO_HASH
select DM_BUFIO
- ---help---
+ help
This device-mapper target creates a read-only device that
transparently validates the data on one underlying device against
a pre-generated tree of cryptographic checksums stored on a second
@@ -522,7 +542,7 @@
depends on DM_VERITY
select REED_SOLOMON
select REED_SOLOMON_DEC8
- ---help---
+ help
Add forward error correction support to dm-verity. This option
makes it possible to use pre-generated error correction data to
recover from corrupted blocks.
@@ -532,7 +552,7 @@
config DM_SWITCH
tristate "Switch target support (EXPERIMENTAL)"
depends on BLK_DEV_DM
- ---help---
+ help
This device-mapper target creates a device that supports an arbitrary
mapping of fixed-size regions of I/O across a fixed set of paths.
The path used for any specific region can be switched dynamically
@@ -546,7 +566,7 @@
config DM_LOG_WRITES
tristate "Log writes target support"
depends on BLK_DEV_DM
- ---help---
+ help
This device-mapper target takes two devices, one device to use
normally, one to log all write operations done to the first device.
This is for use by file system developers wishing to verify that
@@ -567,7 +587,7 @@
select CRYPTO
select CRYPTO_SKCIPHER
select ASYNC_XOR
- ---help---
+ help
This device-mapper target emulates a block device that has
additional per-sector tags that can be used for storing
integrity information.
@@ -583,7 +603,8 @@
tristate "Drive-managed zoned block device target support"
depends on BLK_DEV_DM
depends on BLK_DEV_ZONED
- ---help---
+ select CRC32
+ help
This device-mapper target takes a host-managed or host-aware zoned
block device and exposes most of its capacity as a regular block
device (drive-managed zoned block device) without any write
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d91a7ed..6d3e234 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -17,6 +17,7 @@
dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
dm-cache-background-tracker.o
dm-cache-smq-y += dm-cache-policy-smq.o
+dm-ebs-y += dm-ebs-target.o
dm-era-y += dm-era-target.o
dm-clone-y += dm-clone-target.o dm-clone-metadata.o
dm-verity-y += dm-verity-target.o
@@ -42,6 +43,9 @@
obj-$(CONFIG_MD_CLUSTER) += md-cluster.o
obj-$(CONFIG_BCACHE) += bcache/
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
+ifeq ($(CONFIG_BLK_DEV_MD),y)
+obj-y += md-autodetect.o
+endif
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o
obj-$(CONFIG_DM_UNSTRIPED) += dm-unstripe.o
@@ -54,6 +58,7 @@
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
+obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o
obj-$(CONFIG_DM_SWITCH) += dm-switch.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
@@ -65,6 +70,7 @@
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
+obj-$(CONFIG_DM_EBS) += dm-ebs.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_CLONE) += dm-clone.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 6dfa653..d1ca4d0 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -26,3 +26,12 @@
Keeps all active closures in a linked list and provides a debugfs
interface to list them, which makes it possible to see asynchronous
operations that get stuck.
+
+config BCACHE_ASYNC_REGISTRATION
+ bool "Asynchronous device registration (EXPERIMENTAL)"
+ depends on BCACHE
+ help
+ Add a sysfs file /sys/fs/bcache/register_async. Writing a device
+ path into this file returns immediately, and the real registration
+ work is handled asynchronously by a kernel workqueue.
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index d26b351..5b87e59 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -4,6 +4,4 @@
bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
- util.o writeback.o
-
-CFLAGS_request.o += -Iblock
+ util.o writeback.o features.o
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a1df0d9..8c371d5 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -49,7 +49,7 @@
*
* bch_bucket_alloc() allocates a single bucket from a specific cache.
*
- * bch_bucket_alloc_set() allocates one or more buckets from different caches
+ * bch_bucket_alloc_set() allocates one bucket from different caches
* out of a cache set.
*
* free_some_buckets() drives all the processes described above. It's called
@@ -87,8 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
{
struct cache *ca;
struct bucket *b;
- unsigned int next = c->nbuckets * c->sb.bucket_size / 1024;
- unsigned int i;
+ unsigned long next = c->nbuckets * c->cache->sb.bucket_size / 1024;
int r;
atomic_sub(sectors, &c->rescale);
@@ -104,14 +103,14 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
c->min_prio = USHRT_MAX;
- for_each_cache(ca, c, i)
- for_each_bucket(b, ca)
- if (b->prio &&
- b->prio != BTREE_PRIO &&
- !atomic_read(&b->pin)) {
- b->prio--;
- c->min_prio = min(c->min_prio, b->prio);
- }
+ ca = c->cache;
+ for_each_bucket(b, ca)
+ if (b->prio &&
+ b->prio != BTREE_PRIO &&
+ !atomic_read(&b->pin)) {
+ b->prio--;
+ c->min_prio = min(c->min_prio, b->prio);
+ }
mutex_unlock(&c->bucket_lock);
}
@@ -362,7 +361,7 @@ static int bch_allocator_thread(void *arg)
* new stuff to them:
*/
allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
- if (CACHE_SYNC(&ca->set->sb)) {
+ if (CACHE_SYNC(&ca->sb)) {
/*
* This could deadlock if an allocation with a btree
* node locked ever blocked - having the btree node
@@ -488,34 +487,29 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
}
int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
- struct bkey *k, int n, bool wait)
+ struct bkey *k, bool wait)
{
- int i;
+ struct cache *ca;
+ long b;
/* No allocation if CACHE_SET_IO_DISABLE bit is set */
if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
return -1;
lockdep_assert_held(&c->bucket_lock);
- BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET);
bkey_init(k);
- /* sort by free space/prio of oldest data in caches */
+ ca = c->cache;
+ b = bch_bucket_alloc(ca, reserve, wait);
+ if (b == -1)
+ goto err;
- for (i = 0; i < n; i++) {
- struct cache *ca = c->cache_by_alloc[i];
- long b = bch_bucket_alloc(ca, reserve, wait);
+ k->ptr[0] = MAKE_PTR(ca->buckets[b].gen,
+ bucket_to_sector(c, b),
+ ca->sb.nr_this_dev);
- if (b == -1)
- goto err;
-
- k->ptr[i] = MAKE_PTR(ca->buckets[b].gen,
- bucket_to_sector(c, b),
- ca->sb.nr_this_dev);
-
- SET_KEY_PTRS(k, i + 1);
- }
+ SET_KEY_PTRS(k, 1);
return 0;
err:
@@ -525,12 +519,12 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
}
int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
- struct bkey *k, int n, bool wait)
+ struct bkey *k, bool wait)
{
int ret;
mutex_lock(&c->bucket_lock);
- ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
+ ret = __bch_bucket_alloc_set(c, reserve, k, wait);
mutex_unlock(&c->bucket_lock);
return ret;
}
@@ -589,7 +583,7 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
struct open_bucket, list);
found:
if (!ret->sectors_free && KEY_PTRS(alloc)) {
- ret->sectors_free = c->sb.bucket_size;
+ ret->sectors_free = c->cache->sb.bucket_size;
bkey_copy(&ret->key, alloc);
bkey_init(alloc);
}
@@ -638,7 +632,7 @@ bool bch_alloc_sectors(struct cache_set *c,
spin_unlock(&c->data_bucket_lock);
- if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
+ if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait))
return false;
spin_lock(&c->data_bucket_lock);
@@ -683,7 +677,7 @@ bool bch_alloc_sectors(struct cache_set *c,
&PTR_CACHE(c, &b->key, i)->sectors_written);
}
- if (b->sectors_free < c->sb.block_size)
+ if (b->sectors_free < c->cache->sb.block_size)
b->sectors_free = 0;
/*
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 36de6f7..e8bf4f7 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -176,7 +176,7 @@
* - updates to non leaf nodes just happen synchronously (see btree_split()).
*/
-#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+#define pr_fmt(fmt) "bcache: %s() " fmt, __func__
#include <linux/bcache.h>
#include <linux/bio.h>
@@ -301,6 +301,7 @@ struct cached_dev {
struct block_device *bdev;
struct cache_sb sb;
+ struct cache_sb_disk *sb_disk;
struct bio sb_bio;
struct bio_vec sb_bv[1];
struct closure sb_write;
@@ -406,6 +407,7 @@ enum alloc_reserve {
struct cache {
struct cache_set *set;
struct cache_sb sb;
+ struct cache_sb_disk *sb_disk;
struct bio sb_bio;
struct bio_vec sb_bv[1];
@@ -515,11 +517,7 @@ struct cache_set {
atomic_t idle_counter;
atomic_t at_max_writeback_rate;
- struct cache_sb sb;
-
- struct cache *cache[MAX_CACHES_PER_SET];
- struct cache *cache_by_alloc[MAX_CACHES_PER_SET];
- int caches_loaded;
+ struct cache *cache;
struct bcache_device **devices;
unsigned int devices_max_used;
@@ -668,6 +666,7 @@ struct cache_set {
struct mutex verify_lock;
#endif
+ uint8_t set_uuid[16];
unsigned int nr_uuids;
struct uuid_entry *uuids;
BKEY_PADDED(uuid_bucket);
@@ -727,6 +726,7 @@ struct cache_set {
unsigned int gc_always_rewrite:1;
unsigned int shrinker_disabled:1;
unsigned int copy_gc_enabled:1;
+ unsigned int idle_max_writeback_rate_enabled:1;
#define BUCKET_HASH_BITS 12
struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
@@ -755,15 +755,35 @@ struct bbio {
#define btree_default_blocks(c) \
((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
-#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS)
-#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
-#define block_bytes(c) ((c)->sb.block_size << 9)
+#define bucket_bytes(ca) ((ca)->sb.bucket_size << 9)
+#define block_bytes(ca) ((ca)->sb.block_size << 9)
-#define prios_per_bucket(c) \
- ((bucket_bytes(c) - sizeof(struct prio_set)) / \
+static inline unsigned int meta_bucket_pages(struct cache_sb *sb)
+{
+ unsigned int n, max_pages;
+
+ max_pages = min_t(unsigned int,
+ __rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS,
+ MAX_ORDER_NR_PAGES);
+
+ n = sb->bucket_size / PAGE_SECTORS;
+ if (n > max_pages)
+ n = max_pages;
+
+ return n;
+}
+
+static inline unsigned int meta_bucket_bytes(struct cache_sb *sb)
+{
+ return meta_bucket_pages(sb) << PAGE_SHIFT;
+}
+
+#define prios_per_bucket(ca) \
+ ((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) / \
sizeof(struct bucket_disk))
-#define prio_buckets(c) \
- DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
+
+#define prio_buckets(ca) \
+ DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca))
static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
{
@@ -777,14 +797,14 @@ static inline sector_t bucket_to_sector(struct cache_set *c, size_t b)
static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
{
- return s & (c->sb.bucket_size - 1);
+ return s & (c->cache->sb.bucket_size - 1);
}
static inline struct cache *PTR_CACHE(struct cache_set *c,
const struct bkey *k,
unsigned int ptr)
{
- return c->cache[PTR_DEV(k, ptr)];
+ return c->cache;
}
static inline size_t PTR_BUCKET_NR(struct cache_set *c,
@@ -865,9 +885,6 @@ do { \
/* Looping macros */
-#define for_each_cache(ca, cs, iter) \
- for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++)
-
#define for_each_bucket(b, ca) \
for (b = (ca)->buckets + (ca)->sb.first_bucket; \
b < (ca)->buckets + (ca)->sb.nbuckets; b++)
@@ -909,11 +926,9 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
static inline void wake_up_allocators(struct cache_set *c)
{
- struct cache *ca;
- unsigned int i;
+ struct cache *ca = c->cache;
- for_each_cache(ca, c, i)
- wake_up_process(ca->alloc_thread);
+ wake_up_process(ca->alloc_thread);
}
static inline void closure_bio_submit(struct cache_set *c,
@@ -926,7 +941,7 @@ static inline void closure_bio_submit(struct cache_set *c,
bio_endio(bio);
return;
}
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
/*
@@ -970,9 +985,9 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k);
long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait);
int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
- struct bkey *k, int n, bool wait);
+ struct bkey *k, bool wait);
int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
- struct bkey *k, int n, bool wait);
+ struct bkey *k, bool wait);
bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
unsigned int sectors, unsigned int write_point,
unsigned int write_prio, bool wait);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index fda68c0..67a2c47 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -6,7 +6,7 @@
* Copyright 2012 Google, Inc.
*/
-#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+#define pr_fmt(fmt) "bcache: %s() " fmt, __func__
#include "util.h"
#include "bset.h"
@@ -31,7 +31,7 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set)
if (b->ops->key_dump)
b->ops->key_dump(b, k);
else
- pr_err("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
+ pr_cont("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
if (next < bset_bkey_last(i) &&
bkey_cmp(k, b->ops->is_extents ?
@@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s)
return 0;
}
+/* Pop the top key of keylist by pointing l->top to its previous key */
struct bkey *bch_keylist_pop(struct keylist *l)
{
struct bkey *k = l->keys;
@@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
return l->top = k;
}
+/* Pop the bottom key of keylist and update l->top_p */
void bch_keylist_pop_front(struct keylist *l)
{
l->top_p -= bkey_u64s(l->keys);
@@ -309,7 +311,6 @@ void bch_btree_keys_free(struct btree_keys *b)
t->tree = NULL;
t->data = NULL;
}
-EXPORT_SYMBOL(bch_btree_keys_free);
int bch_btree_keys_alloc(struct btree_keys *b,
unsigned int page_order,
@@ -342,7 +343,6 @@ int bch_btree_keys_alloc(struct btree_keys *b,
bch_btree_keys_free(b);
return -ENOMEM;
}
-EXPORT_SYMBOL(bch_btree_keys_alloc);
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
bool *expensive_debug_checks)
@@ -361,7 +361,6 @@ void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
* any more.
*/
}
-EXPORT_SYMBOL(bch_btree_keys_init);
/* Binary tree stuff for auxiliary search trees */
@@ -678,7 +677,6 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
bch_bset_build_unwritten_tree(b);
}
-EXPORT_SYMBOL(bch_bset_init_next);
/*
* Build auxiliary binary tree 'struct bset_tree *t', this tree is used to
@@ -732,7 +730,6 @@ void bch_bset_build_written_tree(struct btree_keys *b)
j = inorder_next(j, t->size))
make_bfloat(t, j);
}
-EXPORT_SYMBOL(bch_bset_build_written_tree);
/* Insert */
@@ -780,7 +777,6 @@ fix_right: do {
j = j * 2 + 1;
} while (j < t->size);
}
-EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
static void bch_bset_fix_lookup_table(struct btree_keys *b,
struct bset_tree *t,
@@ -855,7 +851,6 @@ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
return b->ops->key_merge(b, l, r);
}
-EXPORT_SYMBOL(bch_bkey_try_merge);
void bch_bset_insert(struct btree_keys *b, struct bkey *where,
struct bkey *insert)
@@ -875,7 +870,6 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where,
bkey_copy(where, insert);
bch_bset_fix_lookup_table(b, t, where);
}
-EXPORT_SYMBOL(bch_bset_insert);
unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
struct bkey *replace_key)
@@ -931,7 +925,6 @@ copy: bkey_copy(m, k);
merged:
return status;
}
-EXPORT_SYMBOL(bch_btree_insert_key);
/* Lookup */
@@ -1077,7 +1070,6 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
return i.l;
}
-EXPORT_SYMBOL(__bch_bset_search);
/* Btree iterator */
@@ -1132,7 +1124,6 @@ struct bkey *bch_btree_iter_init(struct btree_keys *b,
{
return __bch_btree_iter_init(b, iter, search, b->set);
}
-EXPORT_SYMBOL(bch_btree_iter_init);
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
btree_iter_cmp_fn *cmp)
@@ -1165,7 +1156,6 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
return __bch_btree_iter_next(iter, btree_iter_cmp);
}
-EXPORT_SYMBOL(bch_btree_iter_next);
struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
struct btree_keys *b, ptr_filter_fn fn)
@@ -1196,7 +1186,6 @@ int bch_bset_sort_state_init(struct bset_sort_state *state,
return mempool_init_page_pool(&state->pool, 1, page_order);
}
-EXPORT_SYMBOL(bch_bset_sort_state_init);
static void btree_mergesort(struct btree_keys *b, struct bset *out,
struct btree_iter *iter,
@@ -1236,7 +1225,7 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out,
out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
- pr_debug("sorted %i keys", out->keys);
+ pr_debug("sorted %i keys\n", out->keys);
}
static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
@@ -1268,6 +1257,11 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
* Our temporary buffer is the same size as the btree node's
* buffer, we can just swap buffers instead of doing a big
* memcpy()
+ *
+ * Don't worry even if 'out' is allocated from the mempool, it can
+ * still be swapped here, because state->pool is a page mempool
+ * created by mempool_init_page_pool(), which allocates pages
+ * with alloc_pages().
*/
out->magic = b->set->data->magic;
@@ -1313,7 +1307,6 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
}
-EXPORT_SYMBOL(bch_btree_sort_partial);
void bch_btree_sort_and_fix_extents(struct btree_keys *b,
struct btree_iter *iter,
@@ -1366,7 +1359,6 @@ void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
out:
bch_bset_build_written_tree(b);
}
-EXPORT_SYMBOL(bch_btree_sort_lazy);
void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
{
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5a33910..fe6dce1 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -103,68 +103,10 @@ static struct workqueue_struct *btree_io_wq;
#define insert_lock(s, b) ((b)->level <= (s)->lock)
-/*
- * These macros are for recursing down the btree - they handle the details of
- * locking and looking up nodes in the cache for you. They're best treated as
- * mere syntax when reading code that uses them.
- *
- * op->lock determines whether we take a read or a write lock at a given depth.
- * If you've got a read lock and find that you need a write lock (i.e. you're
- * going to have to split), set op->lock and return -EINTR; btree_root() will
- * call you again and you'll have the correct lock.
- */
-
-/**
- * btree - recurse down the btree on a specified key
- * @fn: function to call, which will be passed the child node
- * @key: key to recurse on
- * @b: parent btree node
- * @op: pointer to struct btree_op
- */
-#define btree(fn, key, b, op, ...) \
-({ \
- int _r, l = (b)->level - 1; \
- bool _w = l <= (op)->lock; \
- struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \
- _w, b); \
- if (!IS_ERR(_child)) { \
- _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
- rw_unlock(_w, _child); \
- } else \
- _r = PTR_ERR(_child); \
- _r; \
-})
-
-/**
- * btree_root - call a function on the root of the btree
- * @fn: function to call, which will be passed the child node
- * @c: cache set
- * @op: pointer to struct btree_op
- */
-#define btree_root(fn, c, op, ...) \
-({ \
- int _r = -EINTR; \
- do { \
- struct btree *_b = (c)->root; \
- bool _w = insert_lock(op, _b); \
- rw_lock(_w, _b, _b->level); \
- if (_b == (c)->root && \
- _w == insert_lock(op, _b)) { \
- _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
- } \
- rw_unlock(_w, _b); \
- bch_cannibalize_unlock(c); \
- if (_r == -EINTR) \
- schedule(); \
- } while (_r == -EINTR); \
- \
- finish_wait(&(c)->btree_cache_wait, &(op)->wait); \
- _r; \
-})
static inline struct bset *write_block(struct btree *b)
{
- return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+ return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache);
}
static void bch_btree_init_next(struct btree *b)
@@ -177,7 +119,7 @@ static void bch_btree_init_next(struct btree *b)
if (b->written < btree_blocks(b))
bch_bset_init_next(&b->keys, write_block(b),
- bset_magic(&b->c->sb));
+ bset_magic(&b->c->cache->sb));
}
@@ -215,7 +157,7 @@ void bch_btree_node_read_done(struct btree *b)
* See the comment arount cache_set->fill_iter.
*/
iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
- iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
+ iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
iter->used = 0;
#ifdef CONFIG_BCACHE_DEBUG
@@ -233,12 +175,12 @@ void bch_btree_node_read_done(struct btree *b)
goto err;
err = "bad btree header";
- if (b->written + set_blocks(i, block_bytes(b->c)) >
+ if (b->written + set_blocks(i, block_bytes(b->c->cache)) >
btree_blocks(b))
goto err;
err = "bad magic";
- if (i->magic != bset_magic(&b->c->sb))
+ if (i->magic != bset_magic(&b->c->cache->sb))
goto err;
err = "bad checksum";
@@ -259,13 +201,13 @@ void bch_btree_node_read_done(struct btree *b)
bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
- b->written += set_blocks(i, block_bytes(b->c));
+ b->written += set_blocks(i, block_bytes(b->c->cache));
}
err = "corrupted btree";
for (i = write_block(b);
bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
- i = ((void *) i) + block_bytes(b->c))
+ i = ((void *) i) + block_bytes(b->c->cache))
if (i->seq == b->keys.set[0].data->seq)
goto err;
@@ -279,7 +221,7 @@ void bch_btree_node_read_done(struct btree *b)
if (b->written < btree_blocks(b))
bch_bset_init_next(&b->keys, write_block(b),
- bset_magic(&b->c->sb));
+ bset_magic(&b->c->cache->sb));
out:
mempool_free(iter, &b->c->fill_iter);
return;
@@ -407,7 +349,7 @@ static void do_btree_node_write(struct btree *b)
b->bio->bi_end_io = btree_node_write_endio;
b->bio->bi_private = cl;
- b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
+ b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache));
b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA;
bch_bio_map(b->bio, i);
@@ -483,10 +425,10 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent)
do_btree_node_write(b);
- atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
+ atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
- b->written += set_blocks(i, block_bytes(b->c));
+ b->written += set_blocks(i, block_bytes(b->c->cache));
}
void bch_btree_node_write(struct btree *b, struct closure *parent)
@@ -545,6 +487,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
set_btree_node_dirty(b);
+ /*
+ * w->journal is always the oldest journal pin of all bkeys
+ * in the leaf node, to make sure the oldest jset seq won't
+ * be increased before this btree node is flushed.
+ */
if (journal_ref) {
if (w->journal &&
journal_pin_cmp(b->c, w->journal, journal_ref)) {
@@ -569,7 +516,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
* mca -> memory cache
*/
-#define mca_reserve(c) (((c->root && c->root->level) \
+#define mca_reserve(c) (((!IS_ERR_OR_NULL(c->root) && c->root->level) \
? c->root->level : 1) * 8 + 16)
#define mca_can_free(c) \
max_t(int, 0, c->btree_cache_used - mca_reserve(c))
@@ -674,7 +621,7 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
* and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
*/
if (btree_node_journal_flush(b)) {
- pr_debug("bnode %p is flushing by journal, retry", b);
+ pr_debug("bnode %p is flushing by journal, retry\n", b);
mutex_unlock(&b->write_lock);
udelay(1);
goto retry;
@@ -731,34 +678,32 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
i = 0;
btree_cache_used = c->btree_cache_used;
- list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+ list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
if (nr <= 0)
goto out;
- if (++i > 3 &&
- !mca_reap(b, 0, false)) {
+ if (!mca_reap(b, 0, false)) {
mca_data_free(b);
rw_unlock(true, b);
freed++;
}
nr--;
+ i++;
}
- for (; (nr--) && i < btree_cache_used; i++) {
- if (list_empty(&c->btree_cache))
+ list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+ if (nr <= 0 || i >= btree_cache_used)
goto out;
- b = list_first_entry(&c->btree_cache, struct btree, list);
- list_rotate_left(&c->btree_cache);
-
- if (!b->accessed &&
- !mca_reap(b, 0, false)) {
+ if (!mca_reap(b, 0, false)) {
mca_bucket_free(b);
mca_data_free(b);
rw_unlock(true, b);
freed++;
- } else
- b->accessed = 0;
+ }
+
+ nr--;
+ i++;
}
out:
mutex_unlock(&c->bucket_lock);
@@ -795,7 +740,7 @@ void bch_btree_cache_free(struct cache_set *c)
if (c->verify_data)
list_move(&c->verify_data->list, &c->btree_cache);
- free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
+ free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb)));
#endif
list_splice(&c->btree_cache_freeable,
@@ -842,7 +787,16 @@ int bch_btree_cache_alloc(struct cache_set *c)
mutex_init(&c->verify_lock);
c->verify_ondisk = (void *)
- __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(bucket_pages(c)));
+ __get_free_pages(GFP_KERNEL|__GFP_COMP,
+ ilog2(meta_bucket_pages(&c->cache->sb)));
+ if (!c->verify_ondisk) {
+ /*
+ * Don't worry about the mca_rereserve buckets
+ * allocated in previous for-loop, they will be
+ * handled properly in bch_cache_set_unregister().
+ */
+ return -ENOMEM;
+ }
c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
@@ -859,7 +813,7 @@ int bch_btree_cache_alloc(struct cache_set *c)
c->shrink.batch = c->btree_pages * 2;
if (register_shrinker(&c->shrink))
- pr_warn("bcache: %s: could not register shrinker",
+ pr_warn("bcache: %s: could not register shrinker\n",
__func__);
return 0;
@@ -1016,7 +970,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
* bch_btree_node_get - find a btree node in the cache and lock it, reading it
* in from disk if necessary.
*
- * If IO is necessary and running under generic_make_request, returns -EAGAIN.
+ * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
*
* The btree node will have either a read or a write lock held, depending on
* level and op->lock.
@@ -1066,7 +1020,6 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
BUG_ON(!b->written);
b->parent = parent;
- b->accessed = 1;
for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
prefetch(b->keys.set[i].tree);
@@ -1112,7 +1065,7 @@ static void btree_node_free(struct btree *b)
*/
if (btree_node_journal_flush(b)) {
mutex_unlock(&b->write_lock);
- pr_debug("bnode %p journal_flush set, retry", b);
+ pr_debug("bnode %p journal_flush set, retry\n", b);
udelay(1);
goto retry;
}
@@ -1141,7 +1094,7 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
mutex_lock(&c->bucket_lock);
retry:
- if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+ if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
goto err;
bkey_put(c, &k.key);
@@ -1157,9 +1110,8 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
goto retry;
}
- b->accessed = 1;
b->parent = parent;
- bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
+ bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb));
mutex_unlock(&c->bucket_lock);
@@ -1218,19 +1170,18 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
static int btree_check_reserve(struct btree *b, struct btree_op *op)
{
struct cache_set *c = b->c;
- struct cache *ca;
- unsigned int i, reserve = (c->root->level - b->level) * 2 + 1;
+ struct cache *ca = c->cache;
+ unsigned int reserve = (c->root->level - b->level) * 2 + 1;
mutex_lock(&c->bucket_lock);
- for_each_cache(ca, c, i)
- if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
- if (op)
- prepare_to_wait(&c->btree_cache_wait, &op->wait,
- TASK_UNINTERRUPTIBLE);
- mutex_unlock(&c->bucket_lock);
- return -EINTR;
- }
+ if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
+ if (op)
+ prepare_to_wait(&c->btree_cache_wait, &op->wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&c->bucket_lock);
+ return -EINTR;
+ }
mutex_unlock(&c->bucket_lock);
@@ -1396,7 +1347,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
if (nodes < 2 ||
__set_blocks(b->keys.set[0].data, keys,
- block_bytes(b->c)) > blocks * (nodes - 1))
+ block_bytes(b->c->cache)) > blocks * (nodes - 1))
return 0;
for (i = 0; i < nodes; i++) {
@@ -1430,7 +1381,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
k = bkey_next(k)) {
if (__set_blocks(n1, n1->keys + keys +
bkey_u64s(k),
- block_bytes(b->c)) > blocks)
+ block_bytes(b->c->cache)) > blocks)
break;
last = k;
@@ -1446,7 +1397,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
* though)
*/
if (__set_blocks(n1, n1->keys + n2->keys,
- block_bytes(b->c)) >
+ block_bytes(b->c->cache)) >
btree_blocks(new_nodes[i]))
goto out_unlock_nocoalesce;
@@ -1455,7 +1406,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
last = &r->b->key;
}
- BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+ BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) >
btree_blocks(new_nodes[i]));
if (last)
@@ -1746,7 +1697,6 @@ static void btree_gc_start(struct cache_set *c)
{
struct cache *ca;
struct bucket *b;
- unsigned int i;
if (!c->gc_mark_valid)
return;
@@ -1756,14 +1706,14 @@ static void btree_gc_start(struct cache_set *c)
c->gc_mark_valid = 0;
c->gc_done = ZERO_KEY;
- for_each_cache(ca, c, i)
- for_each_bucket(b, ca) {
- b->last_gc = b->gen;
- if (!atomic_read(&b->pin)) {
- SET_GC_MARK(b, 0);
- SET_GC_SECTORS_USED(b, 0);
- }
+ ca = c->cache;
+ for_each_bucket(b, ca) {
+ b->last_gc = b->gen;
+ if (!atomic_read(&b->pin)) {
+ SET_GC_MARK(b, 0);
+ SET_GC_SECTORS_USED(b, 0);
}
+ }
mutex_unlock(&c->bucket_lock);
}
@@ -1772,7 +1722,8 @@ static void bch_btree_gc_finish(struct cache_set *c)
{
struct bucket *b;
struct cache *ca;
- unsigned int i;
+ unsigned int i, j;
+ uint64_t *k;
mutex_lock(&c->bucket_lock);
@@ -1790,7 +1741,6 @@ static void bch_btree_gc_finish(struct cache_set *c)
struct bcache_device *d = c->devices[i];
struct cached_dev *dc;
struct keybuf_key *w, *n;
- unsigned int j;
if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
continue;
@@ -1807,29 +1757,27 @@ static void bch_btree_gc_finish(struct cache_set *c)
rcu_read_unlock();
c->avail_nbuckets = 0;
- for_each_cache(ca, c, i) {
- uint64_t *i;
- ca->invalidate_needs_gc = 0;
+ ca = c->cache;
+ ca->invalidate_needs_gc = 0;
- for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
- SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+ for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+ SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
- for (i = ca->prio_buckets;
- i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
- SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+ for (k = ca->prio_buckets;
+ k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
+ SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
- for_each_bucket(b, ca) {
- c->need_gc = max(c->need_gc, bucket_gc_gen(b));
+ for_each_bucket(b, ca) {
+ c->need_gc = max(c->need_gc, bucket_gc_gen(b));
- if (atomic_read(&b->pin))
- continue;
+ if (atomic_read(&b->pin))
+ continue;
- BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+ BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
- if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
- c->avail_nbuckets++;
- }
+ if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
+ c->avail_nbuckets++;
}
mutex_unlock(&c->bucket_lock);
@@ -1853,7 +1801,7 @@ static void bch_btree_gc(struct cache_set *c)
/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
do {
- ret = btree_root(gc_root, c, &op, &writes, &stats);
+ ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
closure_sync(&writes);
cond_resched();
@@ -1861,7 +1809,7 @@ static void bch_btree_gc(struct cache_set *c)
schedule_timeout_interruptible(msecs_to_jiffies
(GC_SLEEP_MS));
else if (ret)
- pr_warn("gc failed!");
+ pr_warn("gc failed!\n");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
bch_btree_gc_finish(c);
@@ -1881,12 +1829,10 @@ static void bch_btree_gc(struct cache_set *c)
static bool gc_should_run(struct cache_set *c)
{
- struct cache *ca;
- unsigned int i;
+ struct cache *ca = c->cache;
- for_each_cache(ca, c, i)
- if (ca->invalidate_needs_gc)
- return true;
+ if (ca->invalidate_needs_gc)
+ return true;
if (atomic_read(&c->sectors_to_gc) < 0)
return true;
@@ -1951,7 +1897,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
}
if (p)
- ret = btree(check_recurse, p, b, op);
+ ret = bcache_btree(check_recurse, p, b, op);
p = k;
} while (p && !ret);
@@ -1960,20 +1906,180 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
return ret;
}
+
+static int bch_btree_check_thread(void *arg)
+{
+ int ret;
+ struct btree_check_info *info = arg;
+ struct btree_check_state *check_state = info->state;
+ struct cache_set *c = check_state->c;
+ struct btree_iter iter;
+ struct bkey *k, *p;
+ int cur_idx, prev_idx, skip_nr;
+
+ k = p = NULL;
+ cur_idx = prev_idx = 0;
+ ret = 0;
+
+ /* root node keys are checked before thread created */
+ bch_btree_iter_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ BUG_ON(!k);
+
+ p = k;
+ while (k) {
+ /*
+ * Fetch a root node key index, skip the keys which
+ * should be fetched by other threads, then check the
+ * sub-tree indexed by the fetched key.
+ */
+ spin_lock(&check_state->idx_lock);
+ cur_idx = check_state->key_idx;
+ check_state->key_idx++;
+ spin_unlock(&check_state->idx_lock);
+
+ skip_nr = cur_idx - prev_idx;
+
+ while (skip_nr) {
+ k = bch_btree_iter_next_filter(&iter,
+ &c->root->keys,
+ bch_ptr_bad);
+ if (k)
+ p = k;
+ else {
+ /*
+ * No more keys to check in root node,
+ * current checking threads are enough,
+ * stop creating more.
+ */
+ atomic_set(&check_state->enough, 1);
+ /* Update check_state->enough earlier */
+ smp_mb__after_atomic();
+ goto out;
+ }
+ skip_nr--;
+ cond_resched();
+ }
+
+ if (p) {
+ struct btree_op op;
+
+ btree_node_prefetch(c->root, p);
+ c->gc_stats.nodes++;
+ bch_btree_op_init(&op, 0);
+ ret = bcache_btree(check_recurse, p, c->root, &op);
+ if (ret)
+ goto out;
+ }
+ p = NULL;
+ prev_idx = cur_idx;
+ cond_resched();
+ }
+
+out:
+ info->result = ret;
+ /* update check_state->started among all CPUs */
+ smp_mb__before_atomic();
+ if (atomic_dec_and_test(&check_state->started))
+ wake_up(&check_state->wait);
+
+ return ret;
+}
+
+
+
+static int bch_btree_chkthread_nr(void)
+{
+ int n = num_online_cpus()/2;
+
+ if (n == 0)
+ n = 1;
+ else if (n > BCH_BTR_CHKTHREAD_MAX)
+ n = BCH_BTR_CHKTHREAD_MAX;
+
+ return n;
+}
+
int bch_btree_check(struct cache_set *c)
{
- struct btree_op op;
+ int ret = 0;
+ int i;
+ struct bkey *k = NULL;
+ struct btree_iter iter;
+ struct btree_check_state *check_state;
+ char name[32];
- bch_btree_op_init(&op, SHRT_MAX);
+ /* check and mark root node keys */
+ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
+ bch_initial_mark_key(c, c->root->level, k);
- return btree_root(check_recurse, c, &op);
+ bch_initial_mark_key(c, c->root->level + 1, &c->root->key);
+
+ if (c->root->level == 0)
+ return 0;
+
+ check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL);
+ if (!check_state)
+ return -ENOMEM;
+
+ check_state->c = c;
+ check_state->total_threads = bch_btree_chkthread_nr();
+ check_state->key_idx = 0;
+ spin_lock_init(&check_state->idx_lock);
+ atomic_set(&check_state->started, 0);
+ atomic_set(&check_state->enough, 0);
+ init_waitqueue_head(&check_state->wait);
+
+ /*
+ * Run multiple threads to check btree nodes in parallel,
+ * if check_state->enough is non-zero, the currently running
+ * check threads are enough and it is unnecessary to create
+ * more.
+ */
+ for (i = 0; i < check_state->total_threads; i++) {
+ /* fetch latest check_state->enough earlier */
+ smp_mb__before_atomic();
+ if (atomic_read(&check_state->enough))
+ break;
+
+ check_state->infos[i].result = 0;
+ check_state->infos[i].state = check_state;
+ snprintf(name, sizeof(name), "bch_btrchk[%u]", i);
+ atomic_inc(&check_state->started);
+
+ check_state->infos[i].thread =
+ kthread_run(bch_btree_check_thread,
+ &check_state->infos[i],
+ name);
+ if (IS_ERR(check_state->infos[i].thread)) {
+ pr_err("fails to run thread bch_btrchk[%d]\n", i);
+ for (--i; i >= 0; i--)
+ kthread_stop(check_state->infos[i].thread);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ wait_event_interruptible(check_state->wait,
+ atomic_read(&check_state->started) == 0 ||
+ test_bit(CACHE_SET_IO_DISABLE, &c->flags));
+
+ for (i = 0; i < check_state->total_threads; i++) {
+ if (check_state->infos[i].result) {
+ ret = check_state->infos[i].result;
+ goto out;
+ }
+ }
+
+out:
+ kfree(check_state);
+ return ret;
}
void bch_initial_gc_finish(struct cache_set *c)
{
- struct cache *ca;
+ struct cache *ca = c->cache;
struct bucket *b;
- unsigned int i;
bch_btree_gc_finish(c);
@@ -1988,20 +2094,18 @@ void bch_initial_gc_finish(struct cache_set *c)
* This is only safe for buckets that have no live data in them, which
* there should always be some of.
*/
- for_each_cache(ca, c, i) {
- for_each_bucket(b, ca) {
- if (fifo_full(&ca->free[RESERVE_PRIO]) &&
- fifo_full(&ca->free[RESERVE_BTREE]))
- break;
+ for_each_bucket(b, ca) {
+ if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+ fifo_full(&ca->free[RESERVE_BTREE]))
+ break;
- if (bch_can_invalidate_bucket(ca, b) &&
- !GC_MARK(b)) {
- __bch_invalidate_one_bucket(ca, b);
- if (!fifo_push(&ca->free[RESERVE_PRIO],
- b - ca->buckets))
- fifo_push(&ca->free[RESERVE_BTREE],
- b - ca->buckets);
- }
+ if (bch_can_invalidate_bucket(ca, b) &&
+ !GC_MARK(b)) {
+ __bch_invalidate_one_bucket(ca, b);
+ if (!fifo_push(&ca->free[RESERVE_PRIO],
+ b - ca->buckets))
+ fifo_push(&ca->free[RESERVE_BTREE],
+ b - ca->buckets);
}
}
@@ -2109,7 +2213,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
goto err;
split = set_blocks(btree_bset_first(n1),
- block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
+ block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5;
if (split) {
unsigned int keys = 0;
@@ -2356,7 +2460,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys,
if (ret) {
struct bkey *k;
- pr_err("error %i", ret);
+ pr_err("error %i\n", ret);
while ((k = bch_keylist_pop(keys)))
bkey_put(c, k);
@@ -2406,7 +2510,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
bch_ptr_bad))) {
- ret = btree(map_nodes_recurse, k, b,
+ ret = bcache_btree(map_nodes_recurse, k, b,
op, from, fn, flags);
from = NULL;
@@ -2424,10 +2528,10 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
struct bkey *from, btree_map_nodes_fn *fn, int flags)
{
- return btree_root(map_nodes_recurse, c, op, from, fn, flags);
+ return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags);
}
-static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
struct bkey *from, btree_map_keys_fn *fn,
int flags)
{
@@ -2440,7 +2544,8 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
ret = !b->level
? fn(op, b, k)
- : btree(map_keys_recurse, k, b, op, from, fn, flags);
+ : bcache_btree(map_keys_recurse, k,
+ b, op, from, fn, flags);
from = NULL;
if (ret != MAP_CONTINUE)
@@ -2457,7 +2562,7 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
struct bkey *from, btree_map_keys_fn *fn, int flags)
{
- return btree_root(map_keys_recurse, c, op, from, fn, flags);
+ return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
}
/* Keybuf code */
@@ -2643,7 +2748,7 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
break;
if (bkey_cmp(&buf->last_scanned, end) >= 0) {
- pr_debug("scan finished");
+ pr_debug("scan finished\n");
break;
}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 76cfd12..5048210 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -121,8 +121,6 @@ struct btree {
/* Key/pointer for this btree node */
BKEY_PADDED(key);
- /* Single bit - set when accessed, cleared by shrinker */
- unsigned long accessed;
unsigned long seq;
struct rw_semaphore lock;
struct cache_set *c;
@@ -147,6 +145,9 @@ struct btree {
struct bio *bio;
};
+
+
+
#define BTREE_FLAG(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
@@ -193,7 +194,7 @@ static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
static inline void set_gc_sectors(struct cache_set *c)
{
- atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
+ atomic_set(&c->sectors_to_gc, c->cache->sb.bucket_size * c->nbuckets / 16);
}
void bkey_put(struct cache_set *c, struct bkey *k);
@@ -218,6 +219,25 @@ struct btree_op {
unsigned int insert_collision:1;
};
+struct btree_check_state;
+struct btree_check_info {
+ struct btree_check_state *state;
+ struct task_struct *thread;
+ int result;
+};
+
+#define BCH_BTR_CHKTHREAD_MAX 64
+struct btree_check_state {
+ struct cache_set *c;
+ int total_threads;
+ int key_idx;
+ spinlock_t idx_lock;
+ atomic_t started;
+ atomic_t enough;
+ wait_queue_head_t wait;
+ struct btree_check_info infos[BCH_BTR_CHKTHREAD_MAX];
+};
+
static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
{
memset(op, 0, sizeof(struct btree_op));
@@ -286,6 +306,65 @@ static inline void force_wake_up_gc(struct cache_set *c)
wake_up_gc(c);
}
+/*
+ * These macros are for recursing down the btree - they handle the details of
+ * locking and looking up nodes in the cache for you. They're best treated as
+ * mere syntax when reading code that uses them.
+ *
+ * op->lock determines whether we take a read or a write lock at a given depth.
+ * If you've got a read lock and find that you need a write lock (i.e. you're
+ * going to have to split), set op->lock and return -EINTR; btree_root() will
+ * call you again and you'll have the correct lock.
+ */
+
+/**
+ * btree - recurse down the btree on a specified key
+ * @fn: function to call, which will be passed the child node
+ * @key: key to recurse on
+ * @b: parent btree node
+ * @op: pointer to struct btree_op
+ */
+#define bcache_btree(fn, key, b, op, ...) \
+({ \
+ int _r, l = (b)->level - 1; \
+ bool _w = l <= (op)->lock; \
+ struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \
+ _w, b); \
+ if (!IS_ERR(_child)) { \
+ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
+ rw_unlock(_w, _child); \
+ } else \
+ _r = PTR_ERR(_child); \
+ _r; \
+})
+
+/**
+ * btree_root - call a function on the root of the btree
+ * @fn: function to call, which will be passed the child node
+ * @c: cache set
+ * @op: pointer to struct btree_op
+ */
+#define bcache_btree_root(fn, c, op, ...) \
+({ \
+ int _r = -EINTR; \
+ do { \
+ struct btree *_b = (c)->root; \
+ bool _w = insert_lock(op, _b); \
+ rw_lock(_w, _b, _b->level); \
+ if (_b == (c)->root && \
+ _w == insert_lock(op, _b)) { \
+ _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
+ } \
+ rw_unlock(_w, _b); \
+ bch_cannibalize_unlock(c); \
+ if (_r == -EINTR) \
+ schedule(); \
+ } while (_r == -EINTR); \
+ \
+ finish_wait(&(c)->btree_cache_wait, &(op)->wait); \
+ _r; \
+})
+
#define MAP_DONE 0
#define MAP_CONTINUE 1
@@ -316,6 +395,9 @@ typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b,
struct bkey *k);
int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
struct bkey *from, btree_map_keys_fn *fn, int flags);
+int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+ struct bkey *from, btree_map_keys_fn *fn,
+ int flags);
typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k);
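As an illustrative aside: the two macros above reach their callee purely by token pasting, which is why bch_btree_map_keys_recurse() gains an extern prototype here so other files can recurse through them. A minimal sketch (example_map_keys() is a made-up name, not part of the patch) of a caller going through bcache_btree_root(), mirroring bch_btree_map_keys() in btree.c:

/*
 * Illustrative sketch only. bcache_btree_root(map_keys_recurse, ...)
 * pastes its first argument onto "bch_btree_", locks the current root
 * node, and effectively calls
 *	bch_btree_map_keys_recurse(c->root, op, from, fn, flags);
 * retrying (with schedule()) for as long as the callee returns -EINTR.
 */
static int example_map_keys(struct cache_set *c, struct btree_op *op,
			    struct bkey *from, btree_map_keys_fn *fn,
			    int flags)
{
	return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
}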
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index c12cd80..d8d9394 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -45,7 +45,6 @@ void closure_sub(struct closure *cl, int v)
{
closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
}
-EXPORT_SYMBOL(closure_sub);
/*
* closure_put - decrement a closure's refcount
@@ -54,7 +53,6 @@ void closure_put(struct closure *cl)
{
closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
}
-EXPORT_SYMBOL(closure_put);
/*
* closure_wake_up - wake up all closures on a wait list, without memory barrier
@@ -76,7 +74,6 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
closure_sub(cl, CLOSURE_WAITING + 1);
}
}
-EXPORT_SYMBOL(__closure_wake_up);
/**
* closure_wait - add a closure to a waitlist
@@ -96,7 +93,6 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
return true;
}
-EXPORT_SYMBOL(closure_wait);
struct closure_syncer {
struct task_struct *task;
@@ -131,7 +127,6 @@ void __sched __closure_sync(struct closure *cl)
__set_current_state(TASK_RUNNING);
}
-EXPORT_SYMBOL(__closure_sync);
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -149,7 +144,6 @@ void closure_debug_create(struct closure *cl)
list_add(&cl->all, &closure_list);
spin_unlock_irqrestore(&closure_list_lock, flags);
}
-EXPORT_SYMBOL(closure_debug_create);
void closure_debug_destroy(struct closure *cl)
{
@@ -162,11 +156,10 @@ void closure_debug_destroy(struct closure *cl)
list_del(&cl->all);
spin_unlock_irqrestore(&closure_list_lock, flags);
}
-EXPORT_SYMBOL(closure_debug_destroy);
static struct dentry *closure_debug;
-static int debug_seq_show(struct seq_file *f, void *data)
+static int debug_show(struct seq_file *f, void *data)
{
struct closure *cl;
@@ -195,17 +188,7 @@ static int debug_seq_show(struct seq_file *f, void *data)
return 0;
}
-static int debug_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, debug_seq_show, NULL);
-}
-
-static const struct file_operations debug_ops = {
- .owner = THIS_MODULE,
- .open = debug_seq_open,
- .read = seq_read,
- .release = single_release
-};
+DEFINE_SHOW_ATTRIBUTE(debug);
void __init closure_debug_init(void)
{
@@ -216,7 +199,7 @@ void __init closure_debug_init(void)
* about this.
*/
closure_debug = debugfs_create_file(
- "closures", 0400, bcache_debug, NULL, &debug_ops);
+ "closures", 0400, bcache_debug, NULL, &debug_fops);
}
#endif
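As an illustrative aside: DEFINE_SHOW_ATTRIBUTE(debug) is what makes the &debug_fops reference above resolve, and it is why the show routine is renamed to debug_show(). A simplified sketch of what the <linux/seq_file.h> macro generates:

/* Simplified expansion, for reference only. */
static int debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, debug_show, inode->i_private);
}

static const struct file_operations debug_fops = {
	.owner		= THIS_MODULE,
	.open		= debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

Compared with the removed hand-rolled debug_ops, the generated table also wires up .llseek, which the old one left unset.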
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 336f439..b00fd08 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -25,8 +25,8 @@ struct dentry *bcache_debug;
for (i = (start); \
(void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
i->seq == (start)->seq; \
- i = (void *) i + set_blocks(i, block_bytes(b->c)) * \
- block_bytes(b->c))
+ i = (void *) i + set_blocks(i, block_bytes(b->c->cache)) * \
+ block_bytes(b->c->cache))
void bch_btree_verify(struct btree *b)
{
@@ -82,14 +82,14 @@ void bch_btree_verify(struct btree *b)
for_each_written_bset(b, ondisk, i) {
unsigned int block = ((void *) i - (void *) ondisk) /
- block_bytes(b->c);
+ block_bytes(b->c->cache);
pr_err("*** on disk block %u:\n", block);
bch_dump_bset(&b->keys, i, block);
}
pr_err("*** block %zu not written\n",
- ((void *) i - (void *) ondisk) / block_bytes(b->c));
+ ((void *) i - (void *) ondisk) / block_bytes(b->c->cache));
for (j = 0; j < inmemory->keys; j++)
if (inmemory->d[j] != sorted->d[j])
@@ -238,7 +238,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
if (!IS_ERR_OR_NULL(bcache_debug)) {
char name[50];
- snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
+ snprintf(name, 50, "bcache-%pU", c->set_uuid);
c->debug = debugfs_create_file(name, 0400, bcache_debug, c,
&cache_set_debug_ops);
}
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 8867100..f4658a1 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -54,7 +54,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
- if (KEY_SIZE(k) + r > c->sb.bucket_size ||
+ if (KEY_SIZE(k) + r > c->cache->sb.bucket_size ||
bucket < ca->sb.first_bucket ||
bucket >= ca->sb.nbuckets)
return true;
@@ -75,7 +75,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
size_t bucket = PTR_BUCKET_NR(c, k, i);
size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
- if (KEY_SIZE(k) + r > c->sb.bucket_size)
+ if (KEY_SIZE(k) + r > c->cache->sb.bucket_size)
return "bad, length too big";
if (bucket < ca->sb.first_bucket)
return "bad, short offset";
@@ -130,18 +130,18 @@ static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
char buf[80];
bch_extent_to_text(buf, sizeof(buf), k);
- pr_err(" %s", buf);
+ pr_cont(" %s", buf);
for (j = 0; j < KEY_PTRS(k); j++) {
size_t n = PTR_BUCKET_NR(b->c, k, j);
- pr_err(" bucket %zu", n);
- if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
- pr_err(" prio %i",
- PTR_BUCKET(b->c, k, j)->prio);
+ pr_cont(" bucket %zu", n);
+ if (n >= b->c->cache->sb.first_bucket && n < b->c->cache->sb.nbuckets)
+ pr_cont(" prio %i",
+ PTR_BUCKET(b->c, k, j)->prio);
}
- pr_err(" %s\n", bch_ptr_status(b->c, k));
+ pr_cont(" %s\n", bch_ptr_status(b->c, k));
}
/* Btree ptrs */
@@ -553,7 +553,7 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
if (stale && KEY_DIRTY(k)) {
bch_extent_to_text(buf, sizeof(buf), k);
- pr_info("stale dirty pointer, stale %u, key: %s",
+ pr_info("stale dirty pointer, stale %u, key: %s\n",
stale, buf);
}
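As an illustrative aside: the pr_err() to pr_cont() switch matters because pr_cont() appends to the message started by the preceding printk instead of opening a new log line, so bch_bkey_dump() now emits one line per key. A trivial sketch of the difference (example_dump_one_key() is a made-up helper, not part of the patch):

/* Illustrative only. */
static void example_dump_one_key(void)
{
	pr_err("0:16 gen 3");		/* starts the line at KERN_ERR */
	pr_cont(" bucket 1234");	/* appended to the same line */
	pr_cont(" prio 42");
	pr_cont(" %s\n", "ok");		/* trailing newline closes it */
}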
diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c
new file mode 100644
index 0000000..d636b7b
--- /dev/null
+++ b/drivers/md/bcache/features.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Feature set bits and string conversion.
+ * Inspired by ext4's features compat/incompat/ro_compat related code.
+ *
+ * Copyright 2020 Coly Li <colyli@suse.de>
+ *
+ */
+#include <linux/bcache.h>
+#include "bcache.h"
+#include "features.h"
+
+struct feature {
+ int compat;
+ unsigned int mask;
+ const char *string;
+};
+
+static struct feature feature_list[] = {
+ {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE,
+ "large_bucket"},
+ {0, 0, 0 },
+};
+
+#define compose_feature_string(type) \
+({ \
+ struct feature *f; \
+ bool first = true; \
+ \
+ for (f = &feature_list[0]; f->compat != 0; f++) { \
+ if (f->compat != BCH_FEATURE_ ## type) \
+ continue; \
+ if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) { \
+ if (first) { \
+ out += snprintf(out, buf + size - out, \
+ "["); \
+ } else { \
+ out += snprintf(out, buf + size - out, \
+ " ["); \
+ } \
+ } else if (!first) { \
+ out += snprintf(out, buf + size - out, " "); \
+ } \
+ \
+ out += snprintf(out, buf + size - out, "%s", f->string);\
+ \
+ if (BCH_HAS_ ## type ## _FEATURE(&c->cache->sb, f->mask)) \
+ out += snprintf(out, buf + size - out, "]"); \
+ \
+ first = false; \
+ } \
+ if (!first) \
+ out += snprintf(out, buf + size - out, "\n"); \
+})
+
+int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size)
+{
+ char *out = buf;
+ compose_feature_string(COMPAT);
+ return out - buf;
+}
+
+int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size)
+{
+ char *out = buf;
+ compose_feature_string(RO_COMPAT);
+ return out - buf;
+}
+
+int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size)
+{
+ char *out = buf;
+ compose_feature_string(INCOMPAT);
+ return out - buf;
+}
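As an illustrative aside: compose_feature_string() brackets only the feature names that are actually set in the superblock, so a cache set carrying the large_bucket incompat bit renders roughly as "[large_bucket]\n" while one without it renders "large_bucket\n". A hedged sketch of how a sysfs show handler might consume the exported helper (the function name and buffer size here are assumptions, not part of the patch):

/* Illustrative consumer only. */
static ssize_t example_show_incompat_features(struct cache_set *c, char *page)
{
	return bch_print_cache_set_feature_incompat(c, page, PAGE_SIZE);
}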
diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h
new file mode 100644
index 0000000..d1c8fd3
--- /dev/null
+++ b/drivers/md/bcache/features.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _BCACHE_FEATURES_H
+#define _BCACHE_FEATURES_H
+
+#include <linux/bcache.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#define BCH_FEATURE_COMPAT 0
+#define BCH_FEATURE_RO_COMPAT 1
+#define BCH_FEATURE_INCOMPAT 2
+#define BCH_FEATURE_TYPE_MASK 0x03
+
+/* Feature set definition */
+/* Incompat feature set */
+/* 32bit bucket size, obsoleted */
+#define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001
+/* real bucket size is (1 << bucket_size) */
+#define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002
+
+#define BCH_FEATURE_COMPAT_SUPP 0
+#define BCH_FEATURE_RO_COMPAT_SUPP 0
+#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \
+ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE)
+
+#define BCH_HAS_COMPAT_FEATURE(sb, mask) \
+ ((sb)->feature_compat & (mask))
+#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \
+ ((sb)->feature_ro_compat & (mask))
+#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \
+ ((sb)->feature_incompat & (mask))
+
+#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \
+static inline int bch_has_feature_##name(struct cache_sb *sb) \
+{ \
+ if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \
+ return 0; \
+ return (((sb)->feature_compat & \
+ BCH##_FEATURE_COMPAT_##flagname) != 0); \
+} \
+static inline void bch_set_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_compat |= \
+ BCH##_FEATURE_COMPAT_##flagname; \
+} \
+static inline void bch_clear_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_compat &= \
+ ~BCH##_FEATURE_COMPAT_##flagname; \
+}
+
+#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
+static inline int bch_has_feature_##name(struct cache_sb *sb) \
+{ \
+ if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \
+ return 0; \
+ return (((sb)->feature_ro_compat & \
+ BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \
+} \
+static inline void bch_set_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_ro_compat |= \
+ BCH##_FEATURE_RO_COMPAT_##flagname; \
+} \
+static inline void bch_clear_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_ro_compat &= \
+ ~BCH##_FEATURE_RO_COMPAT_##flagname; \
+}
+
+#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \
+static inline int bch_has_feature_##name(struct cache_sb *sb) \
+{ \
+ if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \
+ return 0; \
+ return (((sb)->feature_incompat & \
+ BCH##_FEATURE_INCOMPAT_##flagname) != 0); \
+} \
+static inline void bch_set_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_incompat |= \
+ BCH##_FEATURE_INCOMPAT_##flagname; \
+} \
+static inline void bch_clear_feature_##name(struct cache_sb *sb) \
+{ \
+ (sb)->feature_incompat &= \
+ ~BCH##_FEATURE_INCOMPAT_##flagname; \
+}
+
+BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET);
+BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE);
+
+static inline bool bch_has_unknown_compat_features(struct cache_sb *sb)
+{
+ return ((sb->feature_compat & ~BCH_FEATURE_COMPAT_SUPP) != 0);
+}
+
+static inline bool bch_has_unknown_ro_compat_features(struct cache_sb *sb)
+{
+ return ((sb->feature_ro_compat & ~BCH_FEATURE_RO_COMPAT_SUPP) != 0);
+}
+
+static inline bool bch_has_unknown_incompat_features(struct cache_sb *sb)
+{
+ return ((sb->feature_incompat & ~BCH_FEATURE_INCOMPAT_SUPP) != 0);
+}
+
+int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size);
+int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size);
+int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size);
+
+#endif
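As an illustrative aside: each BCH_FEATURE_*_FUNCS() invocation stamps out a has/set/clear triplet for one feature bit. A rough sketch of what BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE) expands to, since that is the helper the superblock code keys on when decoding bucket sizes:

/* Rough expansion, for reference only. */
static inline int bch_has_feature_large_bucket(struct cache_sb *sb)
{
	if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES)
		return 0;
	return ((sb->feature_incompat &
		 BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) != 0);
}

static inline void bch_set_feature_large_bucket(struct cache_sb *sb)
{
	sb->feature_incompat |= BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE;
}

static inline void bch_clear_feature_large_bucket(struct cache_sb *sb)
{
	sb->feature_incompat &= ~BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE;
}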
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 4d93f07..dad71a6 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -26,7 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
- bio_init(bio, bio->bi_inline_vecs, bucket_pages(c));
+ bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->cache->sb));
return bio;
}
@@ -65,14 +65,14 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
* we shouldn't count failed REQ_RAHEAD bio to dc->io_errors.
*/
if (bio->bi_opf & REQ_RAHEAD) {
- pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore",
+ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore\n",
dc->backing_dev_name);
return;
}
errors = atomic_add_return(1, &dc->io_errors);
if (errors < dc->error_limit)
- pr_err("%s: IO error on backing device, unrecoverable",
+ pr_err("%s: IO error on backing device, unrecoverable\n",
dc->backing_dev_name);
else
bch_cached_dev_error(dc);
@@ -123,12 +123,12 @@ void bch_count_io_errors(struct cache *ca,
errors >>= IO_ERROR_SHIFT;
if (errors < ca->set->error_limit)
- pr_err("%s: IO error on %s%s",
+ pr_err("%s: IO error on %s%s\n",
ca->cache_dev_name, m,
is_read ? ", recovering." : ".");
else
bch_cache_set_error(ca->set,
- "%s: too many IO errors %s",
+ "%s: too many IO errors %s\n",
ca->cache_dev_name, m);
}
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index b4fd923..c6613e8 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -47,7 +47,7 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
closure_init_stack(&cl);
- pr_debug("reading %u", bucket_index);
+ pr_debug("reading %u\n", bucket_index);
while (offset < ca->sb.bucket_size) {
reread: left = ca->sb.bucket_size - offset;
@@ -78,13 +78,13 @@ reread: left = ca->sb.bucket_size - offset;
size_t blocks, bytes = set_bytes(j);
if (j->magic != jset_magic(&ca->sb)) {
- pr_debug("%u: bad magic", bucket_index);
+ pr_debug("%u: bad magic\n", bucket_index);
return ret;
}
if (bytes > left << 9 ||
bytes > PAGE_SIZE << JSET_BITS) {
- pr_info("%u: too big, %zu bytes, offset %u",
+ pr_info("%u: too big, %zu bytes, offset %u\n",
bucket_index, bytes, offset);
return ret;
}
@@ -93,12 +93,12 @@ reread: left = ca->sb.bucket_size - offset;
goto reread;
if (j->csum != csum_set(j)) {
- pr_info("%u: bad csum, %zu bytes, offset %u",
+ pr_info("%u: bad csum, %zu bytes, offset %u\n",
bucket_index, bytes, offset);
return ret;
}
- blocks = set_blocks(j, block_bytes(ca->set));
+ blocks = set_blocks(j, block_bytes(ca));
/*
* Nodes in 'list' are in linear increasing order of
@@ -179,115 +179,109 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
ret; \
})
- struct cache *ca;
- unsigned int iter;
+ struct cache *ca = c->cache;
int ret = 0;
+ struct journal_device *ja = &ca->journal;
+ DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
+ unsigned int i, l, r, m;
+ uint64_t seq;
- for_each_cache(ca, c, iter) {
- struct journal_device *ja = &ca->journal;
- DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
- unsigned int i, l, r, m;
- uint64_t seq;
+ bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+ pr_debug("%u journal buckets\n", ca->sb.njournal_buckets);
- bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
- pr_debug("%u journal buckets", ca->sb.njournal_buckets);
-
+ /*
+ * Read journal buckets ordered by golden ratio hash to quickly
+ * find a sequence of buckets with valid journal entries
+ */
+ for (i = 0; i < ca->sb.njournal_buckets; i++) {
/*
- * Read journal buckets ordered by golden ratio hash to quickly
- * find a sequence of buckets with valid journal entries
+ * We must try the index l with ZERO first for
+ * correctness due to the scenario that the journal
+ * bucket is circular buffer which might have wrapped
*/
- for (i = 0; i < ca->sb.njournal_buckets; i++) {
- /*
- * We must try the index l with ZERO first for
- * correctness due to the scenario that the journal
- * bucket is circular buffer which might have wrapped
- */
- l = (i * 2654435769U) % ca->sb.njournal_buckets;
+ l = (i * 2654435769U) % ca->sb.njournal_buckets;
- if (test_bit(l, bitmap))
- break;
+ if (test_bit(l, bitmap))
+ break;
- if (read_bucket(l))
- goto bsearch;
- }
-
- /*
- * If that fails, check all the buckets we haven't checked
- * already
- */
- pr_debug("falling back to linear search");
-
- for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
- l < ca->sb.njournal_buckets;
- l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets,
- l + 1))
- if (read_bucket(l))
- goto bsearch;
-
- /* no journal entries on this device? */
- if (l == ca->sb.njournal_buckets)
- continue;
-bsearch:
- BUG_ON(list_empty(list));
-
- /* Binary search */
- m = l;
- r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
- pr_debug("starting binary search, l %u r %u", l, r);
-
- while (l + 1 < r) {
- seq = list_entry(list->prev, struct journal_replay,
- list)->j.seq;
-
- m = (l + r) >> 1;
- read_bucket(m);
-
- if (seq != list_entry(list->prev, struct journal_replay,
- list)->j.seq)
- l = m;
- else
- r = m;
- }
-
- /*
- * Read buckets in reverse order until we stop finding more
- * journal entries
- */
- pr_debug("finishing up: m %u njournal_buckets %u",
- m, ca->sb.njournal_buckets);
- l = m;
-
- while (1) {
- if (!l--)
- l = ca->sb.njournal_buckets - 1;
-
- if (l == m)
- break;
-
- if (test_bit(l, bitmap))
- continue;
-
- if (!read_bucket(l))
- break;
- }
-
- seq = 0;
-
- for (i = 0; i < ca->sb.njournal_buckets; i++)
- if (ja->seq[i] > seq) {
- seq = ja->seq[i];
- /*
- * When journal_reclaim() goes to allocate for
- * the first time, it'll use the bucket after
- * ja->cur_idx
- */
- ja->cur_idx = i;
- ja->last_idx = ja->discard_idx = (i + 1) %
- ca->sb.njournal_buckets;
-
- }
+ if (read_bucket(l))
+ goto bsearch;
}
+ /*
+ * If that fails, check all the buckets we haven't checked
+ * already
+ */
+ pr_debug("falling back to linear search\n");
+
+ for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets)
+ if (read_bucket(l))
+ goto bsearch;
+
+ /* no journal entries on this device? */
+ if (l == ca->sb.njournal_buckets)
+ goto out;
+bsearch:
+ BUG_ON(list_empty(list));
+
+ /* Binary search */
+ m = l;
+ r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+ pr_debug("starting binary search, l %u r %u\n", l, r);
+
+ while (l + 1 < r) {
+ seq = list_entry(list->prev, struct journal_replay,
+ list)->j.seq;
+
+ m = (l + r) >> 1;
+ read_bucket(m);
+
+ if (seq != list_entry(list->prev, struct journal_replay,
+ list)->j.seq)
+ l = m;
+ else
+ r = m;
+ }
+
+ /*
+ * Read buckets in reverse order until we stop finding more
+ * journal entries
+ */
+ pr_debug("finishing up: m %u njournal_buckets %u\n",
+ m, ca->sb.njournal_buckets);
+ l = m;
+
+ while (1) {
+ if (!l--)
+ l = ca->sb.njournal_buckets - 1;
+
+ if (l == m)
+ break;
+
+ if (test_bit(l, bitmap))
+ continue;
+
+ if (!read_bucket(l))
+ break;
+ }
+
+ seq = 0;
+
+ for (i = 0; i < ca->sb.njournal_buckets; i++)
+ if (ja->seq[i] > seq) {
+ seq = ja->seq[i];
+ /*
+ * When journal_reclaim() goes to allocate for
+ * the first time, it'll use the bucket after
+ * ja->cur_idx
+ */
+ ja->cur_idx = i;
+ ja->last_idx = ja->discard_idx = (i + 1) %
+ ca->sb.njournal_buckets;
+
+ }
+
+out:
if (!list_empty(list))
c->journal.seq = list_entry(list->prev,
struct journal_replay,
@@ -345,12 +339,10 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
static bool is_discard_enabled(struct cache_set *s)
{
- struct cache *ca;
- unsigned int i;
+ struct cache *ca = s->cache;
- for_each_cache(ca, s, i)
- if (ca->discard)
- return true;
+ if (ca->discard)
+ return true;
return false;
}
@@ -370,10 +362,10 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
if (n != i->j.seq) {
if (n == start && is_discard_enabled(s))
- pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)",
+ pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n",
n, i->j.seq - 1, start, end);
else {
- pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
n, i->j.seq - 1, start, end);
ret = -EIO;
goto err;
@@ -403,7 +395,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
entries++;
}
- pr_info("journal replay done, %i keys in %i entries, seq %llu",
+ pr_info("journal replay done, %i keys in %i entries, seq %llu\n",
keys, entries, end);
err:
while (!list_empty(list)) {
@@ -417,8 +409,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
/* Journalling */
-#define nr_to_fifo_front(p, front_p, mask) (((p) - (front_p)) & (mask))
-
static void btree_flush_write(struct cache_set *c)
{
struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
@@ -483,7 +473,7 @@ static void btree_flush_write(struct cache_set *c)
break;
if (btree_node_journal_flush(b))
- pr_err("BUG: flush_write bit should not be set here!");
+ pr_err("BUG: flush_write bit should not be set here!\n");
mutex_lock(&b->write_lock);
@@ -510,9 +500,8 @@ static void btree_flush_write(struct cache_set *c)
* journal entry can be reclaimed). These selected nodes
* will be ignored and skipped in the following for-loop.
*/
- if (nr_to_fifo_front(btree_current_write(b)->journal,
- fifo_front_p,
- mask) != 0) {
+ if (((btree_current_write(b)->journal - fifo_front_p) &
+ mask) != 0) {
mutex_unlock(&b->write_lock);
continue;
}
@@ -537,13 +526,13 @@ static void btree_flush_write(struct cache_set *c)
for (i = 0; i < nr; i++) {
b = btree_nodes[i];
if (!b) {
- pr_err("BUG: btree_nodes[%d] is NULL", i);
+ pr_err("BUG: btree_nodes[%d] is NULL\n", i);
continue;
}
/* safe to check without holding b->write_lock */
if (!btree_node_journal_flush(b)) {
- pr_err("BUG: bnode %p: journal_flush bit cleaned", b);
+ pr_err("BUG: bnode %p: journal_flush bit cleaned\n", b);
continue;
}
@@ -551,14 +540,14 @@ static void btree_flush_write(struct cache_set *c)
if (!btree_current_write(b)->journal) {
clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
- pr_debug("bnode %p: written by others", b);
+ pr_debug("bnode %p: written by others\n", b);
continue;
}
if (!btree_node_dirty(b)) {
clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
- pr_debug("bnode %p: dirty bit cleaned by others", b);
+ pr_debug("bnode %p: dirty bit cleaned by others\n", b);
continue;
}
@@ -614,7 +603,7 @@ static void do_journal_discard(struct cache *ca)
ca->sb.njournal_buckets;
atomic_set(&ja->discard_in_flight, DISCARD_READY);
- /* fallthrough */
+ fallthrough;
case DISCARD_READY:
if (ja->discard_idx == ja->last_idx)
@@ -639,9 +628,10 @@ static void do_journal_discard(struct cache *ca)
static void journal_reclaim(struct cache_set *c)
{
struct bkey *k = &c->journal.key;
- struct cache *ca;
+ struct cache *ca = c->cache;
uint64_t last_seq;
- unsigned int iter, n = 0;
+ unsigned int next;
+ struct journal_device *ja = &ca->journal;
atomic_t p __maybe_unused;
atomic_long_inc(&c->reclaim);
@@ -653,46 +643,31 @@ static void journal_reclaim(struct cache_set *c)
/* Update last_idx */
- for_each_cache(ca, c, iter) {
- struct journal_device *ja = &ca->journal;
+ while (ja->last_idx != ja->cur_idx &&
+ ja->seq[ja->last_idx] < last_seq)
+ ja->last_idx = (ja->last_idx + 1) %
+ ca->sb.njournal_buckets;
- while (ja->last_idx != ja->cur_idx &&
- ja->seq[ja->last_idx] < last_seq)
- ja->last_idx = (ja->last_idx + 1) %
- ca->sb.njournal_buckets;
- }
-
- for_each_cache(ca, c, iter)
- do_journal_discard(ca);
+ do_journal_discard(ca);
if (c->journal.blocks_free)
goto out;
- /*
- * Allocate:
- * XXX: Sort by free journal space
- */
+ next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+ /* No space available on this device */
+ if (next == ja->discard_idx)
+ goto out;
- for_each_cache(ca, c, iter) {
- struct journal_device *ja = &ca->journal;
- unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+ ja->cur_idx = next;
+ k->ptr[0] = MAKE_PTR(0,
+ bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+ ca->sb.nr_this_dev);
+ atomic_long_inc(&c->reclaimed_journal_buckets);
- /* No space available on this device */
- if (next == ja->discard_idx)
- continue;
+ bkey_init(k);
+ SET_KEY_PTRS(k, 1);
+ c->journal.blocks_free = ca->sb.bucket_size >> c->block_bits;
- ja->cur_idx = next;
- k->ptr[n++] = MAKE_PTR(0,
- bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
- ca->sb.nr_this_dev);
- atomic_long_inc(&c->reclaimed_journal_buckets);
- }
-
- if (n) {
- bkey_init(k);
- SET_KEY_PTRS(k, n);
- c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
- }
out:
if (!journal_full(&c->journal))
__closure_wake_up(&c->journal.wait);
@@ -719,7 +694,7 @@ void bch_journal_next(struct journal *j)
j->cur->data->keys = 0;
if (fifo_full(&j->pin))
- pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
+ pr_debug("journal_pin full (%zu)\n", fifo_used(&j->pin));
}
static void journal_write_endio(struct bio *bio)
@@ -756,11 +731,11 @@ static void journal_write_unlocked(struct closure *cl)
__releases(c->journal.lock)
{
struct cache_set *c = container_of(cl, struct cache_set, journal.io);
- struct cache *ca;
+ struct cache *ca = c->cache;
struct journal_write *w = c->journal.cur;
struct bkey *k = &c->journal.key;
- unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) *
- c->sb.block_size;
+ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) *
+ ca->sb.block_size;
struct bio *bio;
struct bio_list list;
@@ -779,17 +754,15 @@ static void journal_write_unlocked(struct closure *cl)
return;
}
- c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
+ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca));
w->data->btree_level = c->root->level;
bkey_copy(&w->data->btree_root, &c->root->key);
bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
- for_each_cache(ca, c, i)
- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
-
- w->data->magic = jset_magic(&c->sb);
+ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
+ w->data->magic = jset_magic(&ca->sb);
w->data->version = BCACHE_JSET_VERSION;
w->data->last_seq = last_seq(&c->journal);
w->data->csum = csum_set(w->data);
@@ -865,6 +838,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
size_t sectors;
struct closure cl;
bool wait = false;
+ struct cache *ca = c->cache;
closure_init_stack(&cl);
@@ -874,10 +848,10 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
struct journal_write *w = c->journal.cur;
sectors = __set_blocks(w->data, w->data->keys + nkeys,
- block_bytes(c)) * c->sb.block_size;
+ block_bytes(ca)) * ca->sb.block_size;
if (sectors <= min_t(size_t,
- c->journal.blocks_free * c->sb.block_size,
+ c->journal.blocks_free * ca->sb.block_size,
PAGE_SECTORS << JSET_BITS))
return w;
@@ -942,7 +916,7 @@ atomic_t *bch_journal(struct cache_set *c,
if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
return NULL;
- if (!CACHE_SYNC(&c->sb))
+ if (!CACHE_SYNC(&c->cache->sb))
return NULL;
w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
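As an illustrative aside: the probe order kept by the rewritten single-cache bch_journal_read() is the same golden-ratio hash as before. A small sketch of just that ordering (example_journal_probe() is a made-up name, not part of the patch):

/*
 * Illustrative only. 2654435769 is floor(2^32 / golden ratio), so as i
 * increments the probes land roughly evenly spread across the journal
 * buckets, which tends to hit a bucket holding valid journal entries
 * after a few reads instead of a full linear scan.
 */
static unsigned int example_journal_probe(unsigned int i,
					  unsigned int njournal_buckets)
{
	return (i * 2654435769U) % njournal_buckets;
}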
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 7891fb5..b9c3d27 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -145,8 +145,8 @@ static void read_moving(struct cache_set *c)
continue;
}
- io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
- * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
GFP_KERNEL);
if (!io)
goto err;
@@ -196,50 +196,48 @@ static unsigned int bucket_heap_top(struct cache *ca)
void bch_moving_gc(struct cache_set *c)
{
- struct cache *ca;
+ struct cache *ca = c->cache;
struct bucket *b;
- unsigned int i;
+ unsigned long sectors_to_move, reserve_sectors;
if (!c->copy_gc_enabled)
return;
mutex_lock(&c->bucket_lock);
- for_each_cache(ca, c, i) {
- unsigned int sectors_to_move = 0;
- unsigned int reserve_sectors = ca->sb.bucket_size *
+ sectors_to_move = 0;
+ reserve_sectors = ca->sb.bucket_size *
fifo_used(&ca->free[RESERVE_MOVINGGC]);
- ca->heap.used = 0;
+ ca->heap.used = 0;
- for_each_bucket(b, ca) {
- if (GC_MARK(b) == GC_MARK_METADATA ||
- !GC_SECTORS_USED(b) ||
- GC_SECTORS_USED(b) == ca->sb.bucket_size ||
- atomic_read(&b->pin))
- continue;
+ for_each_bucket(b, ca) {
+ if (GC_MARK(b) == GC_MARK_METADATA ||
+ !GC_SECTORS_USED(b) ||
+ GC_SECTORS_USED(b) == ca->sb.bucket_size ||
+ atomic_read(&b->pin))
+ continue;
- if (!heap_full(&ca->heap)) {
- sectors_to_move += GC_SECTORS_USED(b);
- heap_add(&ca->heap, b, bucket_cmp);
- } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
- sectors_to_move -= bucket_heap_top(ca);
- sectors_to_move += GC_SECTORS_USED(b);
+ if (!heap_full(&ca->heap)) {
+ sectors_to_move += GC_SECTORS_USED(b);
+ heap_add(&ca->heap, b, bucket_cmp);
+ } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
+ sectors_to_move -= bucket_heap_top(ca);
+ sectors_to_move += GC_SECTORS_USED(b);
- ca->heap.data[0] = b;
- heap_sift(&ca->heap, 0, bucket_cmp);
- }
+ ca->heap.data[0] = b;
+ heap_sift(&ca->heap, 0, bucket_cmp);
}
-
- while (sectors_to_move > reserve_sectors) {
- heap_pop(&ca->heap, b, bucket_cmp);
- sectors_to_move -= GC_SECTORS_USED(b);
- }
-
- while (heap_pop(&ca->heap, b, bucket_cmp))
- SET_GC_MOVE(b, 1);
}
+ while (sectors_to_move > reserve_sectors) {
+ heap_pop(&ca->heap, b, bucket_cmp);
+ sectors_to_move -= GC_SECTORS_USED(b);
+ }
+
+ while (heap_pop(&ca->heap, b, bucket_cmp))
+ SET_GC_MOVE(b, 1);
+
mutex_unlock(&c->bucket_lock);
c->moving_gc_keys.last_scanned = ZERO_KEY;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 4045ae7..2143263 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl)
struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
int ret;
- /*
- * If we're looping, might already be waiting on
- * another journal write - can't wait on more than one journal write at
- * a time
- *
- * XXX: this looks wrong
- */
-#if 0
- while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
- closure_sync(&s->cl);
-#endif
-
if (!op->replace)
journal_ref = bch_journal(op->c, &op->insert_keys,
op->flush_journal ? cl : NULL);
@@ -111,7 +99,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned int u64s,
* bch_data_insert_keys() will insert the keys created so far
* and finish the rest when the keylist is empty.
*/
- if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+ if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset))
return -ENOMEM;
return __bch_keylist_realloc(l, u64s);
@@ -122,7 +110,7 @@ static void bch_data_invalidate(struct closure *cl)
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
struct bio *bio = op->bio;
- pr_debug("invalidating %i sectors from %llu",
+ pr_debug("invalidating %i sectors from %llu\n",
bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
while (bio_sectors(bio)) {
@@ -406,9 +394,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
goto skip;
}
- if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
- bio_sectors(bio) & (c->sb.block_size - 1)) {
- pr_debug("skipping unaligned io");
+ if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) ||
+ bio_sectors(bio) & (c->cache->sb.block_size - 1)) {
+ pr_debug("skipping unaligned io\n");
goto skip;
}
@@ -487,6 +475,7 @@ struct search {
unsigned int read_dirty_data:1;
unsigned int cache_missed:1;
+ struct hd_struct *part;
unsigned long start_time;
struct btree_op op;
@@ -662,7 +651,7 @@ static void backing_request_endio(struct bio *bio)
*/
if (unlikely(s->iop.writeback &&
bio->bi_opf & REQ_PREFLUSH)) {
- pr_err("Can't flush %s: returned bi_status %i",
+ pr_err("Can't flush %s: returned bi_status %i\n",
dc->backing_dev_name, bio->bi_status);
} else {
/* set to orig_bio->bi_status in bio_complete() */
@@ -680,8 +669,8 @@ static void backing_request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
- &s->d->disk->part0, s->start_time);
+ /* Count on bcache device */
+ part_end_io_acct(s->part, s->orig_bio, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
s->orig_bio->bi_status = s->iop.status;
@@ -742,8 +731,8 @@ static inline struct search *search_alloc(struct bio *bio,
s->recoverable = 1;
s->write = op_is_write(bio_op(bio));
s->read_dirty_data = 0;
- s->start_time = jiffies;
-
+ /* Count on the bcache device */
+ s->start_time = part_start_io_acct(d->disk, &s->part, bio);
s->iop.c = d->c;
s->iop.bio = NULL;
s->iop.inode = d->id;
@@ -1084,6 +1073,7 @@ struct detached_dev_io_private {
unsigned long start_time;
bio_end_io_t *bi_end_io;
void *bi_private;
+ struct hd_struct *part;
};
static void detached_dev_end_io(struct bio *bio)
@@ -1094,8 +1084,8 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_end_io = ddip->bi_end_io;
bio->bi_private = ddip->bi_private;
- generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
- &ddip->d->disk->part0, ddip->start_time);
+ /* Count on the bcache device */
+ part_end_io_acct(ddip->part, bio, ddip->start_time);
if (bio->bi_status) {
struct cached_dev *dc = container_of(ddip->d,
@@ -1120,7 +1110,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
*/
ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
ddip->d = d;
- ddip->start_time = jiffies;
+ /* Count on the bcache device */
+ ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio);
ddip->bi_end_io = bio->bi_end_io;
ddip->bi_private = bio->bi_private;
bio->bi_end_io = detached_dev_end_io;
@@ -1130,7 +1121,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
!blk_queue_discard(bdev_get_queue(dc->bdev)))
bio->bi_end_io(bio);
else
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
static void quit_max_writeback_rate(struct cache_set *c,
@@ -1173,8 +1164,7 @@ static void quit_max_writeback_rate(struct cache_set *c,
/* Cached devices - read & write stuff */
-static blk_qc_t cached_dev_make_request(struct request_queue *q,
- struct bio *bio)
+blk_qc_t cached_dev_submit_bio(struct bio *bio)
{
struct search *s;
struct bcache_device *d = bio->bi_disk->private_data;
@@ -1203,11 +1193,6 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
}
}
- generic_start_io_acct(q,
- bio_op(bio),
- bio_sectors(bio),
- &d->disk->part0);
-
bio_set_dev(bio, dc->bdev);
bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1218,7 +1203,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
if (!bio->bi_iter.bi_size) {
/*
* can't call bch_journal_meta from under
- * generic_make_request
+ * submit_bio_noacct
*/
continue_at_nobarrier(&s->cl,
cached_dev_nodata,
@@ -1249,37 +1234,8 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
}
-static int cached_dev_congested(void *data, int bits)
-{
- struct bcache_device *d = data;
- struct cached_dev *dc = container_of(d, struct cached_dev, disk);
- struct request_queue *q = bdev_get_queue(dc->bdev);
- int ret = 0;
-
- if (bdi_congested(q->backing_dev_info, bits))
- return 1;
-
- if (cached_dev_get(dc)) {
- unsigned int i;
- struct cache *ca;
-
- for_each_cache(ca, d->c, i) {
- q = bdev_get_queue(ca->bdev);
- ret |= bdi_congested(q->backing_dev_info, bits);
- }
-
- cached_dev_put(dc);
- }
-
- return ret;
-}
-
void bch_cached_dev_request_init(struct cached_dev *dc)
{
- struct gendisk *g = dc->disk.disk;
-
- g->queue->make_request_fn = cached_dev_make_request;
- g->queue->backing_dev_info->congested_fn = cached_dev_congested;
dc->disk.cache_miss = cached_dev_cache_miss;
dc->disk.ioctl = cached_dev_ioctl;
}
@@ -1313,8 +1269,7 @@ static void flash_dev_nodata(struct closure *cl)
continue_at(cl, search_free, NULL);
}
-static blk_qc_t flash_dev_make_request(struct request_queue *q,
- struct bio *bio)
+blk_qc_t flash_dev_submit_bio(struct bio *bio)
{
struct search *s;
struct closure *cl;
@@ -1326,8 +1281,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
return BLK_QC_T_NONE;
}
- generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
-
s = search_alloc(bio, d);
cl = &s->cl;
bio = &s->bio.bio;
@@ -1336,8 +1289,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
if (!bio->bi_iter.bi_size) {
/*
- * can't call bch_journal_meta from under
- * generic_make_request
+ * can't call bch_journal_meta from under submit_bio_noacct
*/
continue_at_nobarrier(&s->cl,
flash_dev_nodata,
@@ -1367,28 +1319,8 @@ static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
return -ENOTTY;
}
-static int flash_dev_congested(void *data, int bits)
-{
- struct bcache_device *d = data;
- struct request_queue *q;
- struct cache *ca;
- unsigned int i;
- int ret = 0;
-
- for_each_cache(ca, d->c, i) {
- q = bdev_get_queue(ca->bdev);
- ret |= bdi_congested(q->backing_dev_info, bits);
- }
-
- return ret;
-}
-
void bch_flash_dev_request_init(struct bcache_device *d)
{
- struct gendisk *g = d->disk;
-
- g->queue->make_request_fn = flash_dev_make_request;
- g->queue->backing_dev_info->congested_fn = flash_dev_congested;
d->cache_miss = flash_dev_cache_miss;
d->ioctl = flash_dev_ioctl;
}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index c64dbd7..82b3836 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -37,7 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c);
void bch_data_insert(struct closure *cl);
void bch_cached_dev_request_init(struct cached_dev *dc);
+blk_qc_t cached_dev_submit_bio(struct bio *bio);
+
void bch_flash_dev_request_init(struct bcache_device *d);
+blk_qc_t flash_dev_submit_bio(struct bio *bio);
extern struct kmem_cache *bch_search_cache;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index efdf6ce..81f1cc5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -13,13 +13,14 @@
#include "extents.h"
#include "request.h"
#include "writeback.h"
+#include "features.h"
#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
+#include <linux/workqueue.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
@@ -60,18 +61,121 @@ struct workqueue_struct *bch_journal_wq;
/* Superblock */
-static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
- struct page **res)
+static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
+{
+ unsigned int bucket_size = le16_to_cpu(s->bucket_size);
+
+ if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
+ if (bch_has_feature_large_bucket(sb)) {
+ unsigned int max, order;
+
+ max = sizeof(unsigned int) * BITS_PER_BYTE - 1;
+ order = le16_to_cpu(s->bucket_size);
+ /*
+ * bcache tool will make sure the overflow won't
+ * happen, an error message here is enough.
+ */
+ if (order > max)
+ pr_err("Bucket size (1 << %u) overflows\n",
+ order);
+ bucket_size = 1 << order;
+ } else if (bch_has_feature_obso_large_bucket(sb)) {
+ bucket_size +=
+ le16_to_cpu(s->obso_bucket_size_hi) << 16;
+ }
+ }
+
+ return bucket_size;
+}
+
+static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev,
+ struct cache_sb_disk *s)
{
const char *err;
- struct cache_sb *s;
- struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
unsigned int i;
- if (!bh)
- return "IO error";
+ sb->first_bucket= le16_to_cpu(s->first_bucket);
+ sb->nbuckets = le64_to_cpu(s->nbuckets);
+ sb->bucket_size = get_bucket_size(sb, s);
- s = (struct cache_sb *) bh->b_data;
+ sb->nr_in_set = le16_to_cpu(s->nr_in_set);
+ sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
+
+ err = "Too many journal buckets";
+ if (sb->keys > SB_JOURNAL_BUCKETS)
+ goto err;
+
+ err = "Too many buckets";
+ if (sb->nbuckets > LONG_MAX)
+ goto err;
+
+ err = "Not enough buckets";
+ if (sb->nbuckets < 1 << 7)
+ goto err;
+
+ err = "Bad block size (not power of 2)";
+ if (!is_power_of_2(sb->block_size))
+ goto err;
+
+ err = "Bad block size (larger than page size)";
+ if (sb->block_size > PAGE_SECTORS)
+ goto err;
+
+ err = "Bad bucket size (not power of 2)";
+ if (!is_power_of_2(sb->bucket_size))
+ goto err;
+
+ err = "Bad bucket size (smaller than page size)";
+ if (sb->bucket_size < PAGE_SECTORS)
+ goto err;
+
+ err = "Invalid superblock: device too small";
+ if (get_capacity(bdev->bd_disk) <
+ sb->bucket_size * sb->nbuckets)
+ goto err;
+
+ err = "Bad UUID";
+ if (bch_is_zero(sb->set_uuid, 16))
+ goto err;
+
+ err = "Bad cache device number in set";
+ if (!sb->nr_in_set ||
+ sb->nr_in_set <= sb->nr_this_dev ||
+ sb->nr_in_set > MAX_CACHES_PER_SET)
+ goto err;
+
+ err = "Journal buckets not sequential";
+ for (i = 0; i < sb->keys; i++)
+ if (sb->d[i] != sb->first_bucket + i)
+ goto err;
+
+ err = "Too many journal buckets";
+ if (sb->first_bucket + sb->keys > sb->nbuckets)
+ goto err;
+
+ err = "Invalid superblock: first bucket comes before end of super";
+ if (sb->first_bucket * sb->bucket_size < 16)
+ goto err;
+
+ err = NULL;
+err:
+ return err;
+}
+
+
+static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
+ struct cache_sb_disk **res)
+{
+ const char *err;
+ struct cache_sb_disk *s;
+ struct page *page;
+ unsigned int i;
+
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
+ if (IS_ERR(page))
+ return "IO error";
+ s = page_address(page) + offset_in_page(SB_OFFSET);
sb->offset = le64_to_cpu(s->offset);
sb->version = le64_to_cpu(s->version);
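As an illustrative aside before the next hunk: get_bucket_size() above is where the large_bucket feature is decoded. For BCACHE_SB_VERSION_CDEV_WITH_FEATURES superblocks, the 16-bit on-disk bucket_size field either stores log2 of the real size (log_large_bucket) or is extended by obso_bucket_size_hi for the obsoleted 32-bit encoding. A hedged sketch of the two paths, with the version check omitted and plain parameters in place of struct cache_sb_disk fields:

/* Illustrative only; mirrors the decode paths of get_bucket_size(). */
static unsigned int example_decode_bucket_size(bool log_large_bucket,
					       u16 on_disk_bucket_size,
					       u16 obso_bucket_size_hi)
{
	if (log_large_bucket)
		/* e.g. a stored value of 13 means 1 << 13 = 8192 sectors */
		return 1U << on_disk_bucket_size;

	/* obsoleted 32-bit encoding: splice in the high 16 bits */
	return on_disk_bucket_size +
	       ((unsigned int)obso_bucket_size_hi << 16);
}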
@@ -84,26 +188,22 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
sb->flags = le64_to_cpu(s->flags);
sb->seq = le64_to_cpu(s->seq);
sb->last_mount = le32_to_cpu(s->last_mount);
- sb->first_bucket = le16_to_cpu(s->first_bucket);
sb->keys = le16_to_cpu(s->keys);
for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
sb->d[i] = le64_to_cpu(s->d[i]);
- pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+ pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u\n",
sb->version, sb->flags, sb->seq, sb->keys);
- err = "Not a bcache superblock";
+ err = "Not a bcache superblock (bad offset)";
if (sb->offset != SB_SECTOR)
goto err;
+ err = "Not a bcache superblock (bad magic)";
if (memcmp(sb->magic, bcache_magic, 16))
goto err;
- err = "Too many journal buckets";
- if (sb->keys > SB_JOURNAL_BUCKETS)
- goto err;
-
err = "Bad checksum";
if (s->csum != csum_set(s))
goto err;
@@ -123,6 +223,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
sb->data_offset = BDEV_DATA_START_DEFAULT;
break;
case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
+ case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
sb->data_offset = le64_to_cpu(s->data_offset);
err = "Bad data offset";
@@ -132,55 +233,35 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
break;
case BCACHE_SB_VERSION_CDEV:
case BCACHE_SB_VERSION_CDEV_WITH_UUID:
- sb->nbuckets = le64_to_cpu(s->nbuckets);
- sb->bucket_size = le16_to_cpu(s->bucket_size);
+ err = read_super_common(sb, bdev, s);
+ if (err)
+ goto err;
+ break;
+ case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
+ /*
+ * Feature bits are needed in read_super_common(),
+ * convert them firstly.
+ */
+ sb->feature_compat = le64_to_cpu(s->feature_compat);
+ sb->feature_incompat = le64_to_cpu(s->feature_incompat);
+ sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
- sb->nr_in_set = le16_to_cpu(s->nr_in_set);
- sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
-
- err = "Too many buckets";
- if (sb->nbuckets > LONG_MAX)
+ /* Check incompatible features */
+ err = "Unsupported compatible feature found";
+ if (bch_has_unknown_compat_features(sb))
goto err;
- err = "Not enough buckets";
- if (sb->nbuckets < 1 << 7)
+ err = "Unsupported read-only compatible feature found";
+ if (bch_has_unknown_ro_compat_features(sb))
goto err;
- err = "Bad block/bucket size";
- if (!is_power_of_2(sb->block_size) ||
- sb->block_size > PAGE_SECTORS ||
- !is_power_of_2(sb->bucket_size) ||
- sb->bucket_size < PAGE_SECTORS)
+ err = "Unsupported incompatible feature found";
+ if (bch_has_unknown_incompat_features(sb))
goto err;
- err = "Invalid superblock: device too small";
- if (get_capacity(bdev->bd_disk) <
- sb->bucket_size * sb->nbuckets)
+ err = read_super_common(sb, bdev, s);
+ if (err)
goto err;
-
- err = "Bad UUID";
- if (bch_is_zero(sb->set_uuid, 16))
- goto err;
-
- err = "Bad cache device number in set";
- if (!sb->nr_in_set ||
- sb->nr_in_set <= sb->nr_this_dev ||
- sb->nr_in_set > MAX_CACHES_PER_SET)
- goto err;
-
- err = "Journal buckets not sequential";
- for (i = 0; i < sb->keys; i++)
- if (sb->d[i] != sb->first_bucket + i)
- goto err;
-
- err = "Too many journal buckets";
- if (sb->first_bucket + sb->keys > sb->nbuckets)
- goto err;
-
- err = "Invalid superblock: first bucket comes before end of super";
- if (sb->first_bucket * sb->bucket_size < 16)
- goto err;
-
break;
default:
err = "Unsupported superblock version";
@@ -188,12 +269,10 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
}
sb->last_mount = (u32)ktime_get_real_seconds();
- err = NULL;
-
- get_page(bh->b_page);
- *res = bh->b_page;
+ *res = s;
+ return NULL;
err:
- put_bh(bh);
+ put_page(page);
return err;
}
@@ -207,18 +286,17 @@ static void write_bdev_super_endio(struct bio *bio)
closure_put(&dc->sb_write);
}
-static void __write_super(struct cache_sb *sb, struct bio *bio)
+static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
+ struct bio *bio)
{
- struct cache_sb *out = page_address(bio_first_page_all(bio));
unsigned int i;
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
bio->bi_iter.bi_sector = SB_SECTOR;
- bio->bi_iter.bi_size = SB_SIZE;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
- bch_bio_map(bio, NULL);
+ __bio_add_page(bio, virt_to_page(out), SB_SIZE,
+ offset_in_page(out));
out->offset = cpu_to_le64(sb->offset);
- out->version = cpu_to_le64(sb->version);
memcpy(out->uuid, sb->uuid, 16);
memcpy(out->set_uuid, sb->set_uuid, 16);
@@ -234,9 +312,16 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
for (i = 0; i < sb->keys; i++)
out->d[i] = cpu_to_le64(sb->d[i]);
+ if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
+ out->feature_compat = cpu_to_le64(sb->feature_compat);
+ out->feature_incompat = cpu_to_le64(sb->feature_incompat);
+ out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
+ }
+
+ out->version = cpu_to_le64(sb->version);
out->csum = csum_set(out);
- pr_debug("ver %llu, flags %llu, seq %llu",
+ pr_debug("ver %llu, flags %llu, seq %llu\n",
sb->version, sb->flags, sb->seq);
submit_bio(bio);
@@ -257,14 +342,14 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
down(&dc->sb_write_mutex);
closure_init(cl, parent);
- bio_reset(bio);
+ bio_init(bio, dc->sb_bv, 1);
bio_set_dev(bio, dc->bdev);
bio->bi_end_io = write_bdev_super_endio;
bio->bi_private = dc;
closure_get(cl);
/* I/O request sent to backing device */
- __write_super(&dc->sb, bio);
+ __write_super(&dc->sb, dc->sb_disk, bio);
closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}
@@ -289,31 +374,25 @@ static void bcache_write_super_unlock(struct closure *cl)
void bcache_write_super(struct cache_set *c)
{
struct closure *cl = &c->sb_write;
- struct cache *ca;
- unsigned int i;
+ struct cache *ca = c->cache;
+ struct bio *bio = &ca->sb_bio;
+ unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
down(&c->sb_write_mutex);
closure_init(cl, &c->cl);
- c->sb.seq++;
+ ca->sb.seq++;
- for_each_cache(ca, c, i) {
- struct bio *bio = &ca->sb_bio;
+ if (ca->sb.version < version)
+ ca->sb.version = version;
- ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
- ca->sb.seq = c->sb.seq;
- ca->sb.last_mount = c->sb.last_mount;
+ bio_init(bio, ca->sb_bv, 1);
+ bio_set_dev(bio, ca->bdev);
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
- SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
-
- bio_reset(bio);
- bio_set_dev(bio, ca->bdev);
- bio->bi_end_io = write_super_endio;
- bio->bi_private = ca;
-
- closure_get(cl);
- __write_super(&ca->sb, bio);
- }
+ closure_get(cl);
+ __write_super(&ca->sb, ca->sb_disk, bio);
closure_return_with_destructor(cl, bcache_write_super_unlock);
}
@@ -367,11 +446,11 @@ static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
}
bch_extent_to_text(buf, sizeof(buf), k);
- pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);
+ pr_debug("%s UUIDs at %s\n", op == REQ_OP_WRITE ? "wrote" : "read", buf);
for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
if (!bch_is_zero(u->uuid, 16))
- pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
+ pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u\n",
u - c->uuids, u->uuid, u->label,
u->first_reg, u->last_reg, u->invalidated);
@@ -423,20 +502,21 @@ static int __uuid_write(struct cache_set *c)
{
BKEY_PADDED(key) k;
struct closure cl;
- struct cache *ca;
+ struct cache *ca = c->cache;
+ unsigned int size;
closure_init_stack(&cl);
lockdep_assert_held(&bch_register_lock);
- if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
+ if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
return 1;
- SET_KEY_SIZE(&k.key, c->sb.bucket_size);
+ size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS;
+ SET_KEY_SIZE(&k.key, size);
uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
closure_sync(&cl);
/* Only one bucket used for uuid write */
- ca = PTR_CACHE(c, &k.key, 0);
atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
bkey_copy(&c->uuid_bucket, &k.key);
@@ -519,7 +599,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
bio_set_dev(bio, ca->bdev);
- bio->bi_iter.bi_size = bucket_bytes(ca);
+ bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb);
bio->bi_end_io = prio_endio;
bio->bi_private = ca;
@@ -536,7 +616,7 @@ int bch_prio_write(struct cache *ca, bool wait)
struct bucket *b;
struct closure cl;
- pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
+ pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu\n",
fifo_used(&ca->free[RESERVE_PRIO]),
fifo_used(&ca->free[RESERVE_NONE]),
fifo_used(&ca->free_inc));
@@ -577,7 +657,7 @@ int bch_prio_write(struct cache *ca, bool wait)
p->next_bucket = ca->prio_buckets[i + 1];
p->magic = pset_magic(&ca->sb);
- p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
+ p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
BUG_ON(bucket == -1);
@@ -611,12 +691,13 @@ int bch_prio_write(struct cache *ca, bool wait)
return 0;
}
-static void prio_read(struct cache *ca, uint64_t bucket)
+static int prio_read(struct cache *ca, uint64_t bucket)
{
struct prio_set *p = ca->disk_buckets;
struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
struct bucket *b;
unsigned int bucket_nr = 0;
+ int ret = -EIO;
for (b = ca->buckets;
b < ca->buckets + ca->sb.nbuckets;
@@ -629,11 +710,15 @@ static void prio_read(struct cache *ca, uint64_t bucket)
prio_io(ca, bucket, REQ_OP_READ, 0);
if (p->csum !=
- bch_crc64(&p->magic, bucket_bytes(ca) - 8))
- pr_warn("bad csum reading priorities");
+ bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
+ pr_warn("bad csum reading priorities\n");
+ goto out;
+ }
- if (p->magic != pset_magic(&ca->sb))
- pr_warn("bad magic reading priorities");
+ if (p->magic != pset_magic(&ca->sb)) {
+ pr_warn("bad magic reading priorities\n");
+ goto out;
+ }
bucket = p->next_bucket;
d = p->data;
@@ -642,6 +727,10 @@ static void prio_read(struct cache *ca, uint64_t bucket)
b->prio = le16_to_cpu(d->prio);
b->gen = b->last_gc = d->gen;
}
+
+ ret = 0;
+out:
+ return ret;
}
/* Bcache device */
@@ -672,7 +761,16 @@ static int ioctl_dev(struct block_device *b, fmode_t mode,
return d->ioctl(d, mode, cmd, arg);
}
-static const struct block_device_operations bcache_ops = {
+static const struct block_device_operations bcache_cached_ops = {
+ .submit_bio = cached_dev_submit_bio,
+ .open = open_dev,
+ .release = release_dev,
+ .ioctl = ioctl_dev,
+ .owner = THIS_MODULE,
+};
+
+static const struct block_device_operations bcache_flash_ops = {
+ .submit_bio = flash_dev_submit_bio,
.open = open_dev,
.release = release_dev,
.ioctl = ioctl_dev,
@@ -695,37 +793,33 @@ static void bcache_device_unlink(struct bcache_device *d)
lockdep_assert_held(&bch_register_lock);
if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
- unsigned int i;
- struct cache *ca;
+ struct cache *ca = d->c->cache;
sysfs_remove_link(&d->c->kobj, d->name);
sysfs_remove_link(&d->kobj, "cache");
- for_each_cache(ca, d->c, i)
- bd_unlink_disk_holder(ca->bdev, d->disk);
+ bd_unlink_disk_holder(ca->bdev, d->disk);
}
}
static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
const char *name)
{
- unsigned int i;
- struct cache *ca;
+ struct cache *ca = c->cache;
int ret;
- for_each_cache(ca, d->c, i)
- bd_link_disk_holder(ca->bdev, d->disk);
+ bd_link_disk_holder(ca->bdev, d->disk);
snprintf(d->name, BCACHEDEVNAME_SIZE,
"%s%u", name, d->id);
ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
if (ret < 0)
- pr_err("Couldn't create device -> cache set symlink");
+ pr_err("Couldn't create device -> cache set symlink\n");
ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
if (ret < 0)
- pr_err("Couldn't create cache set -> device symlink");
+ pr_err("Couldn't create cache set -> device symlink\n");
clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}
@@ -782,9 +876,9 @@ static void bcache_device_free(struct bcache_device *d)
lockdep_assert_held(&bch_register_lock);
if (disk)
- pr_info("%s stopped", disk->disk_name);
+ pr_info("%s stopped\n", disk->disk_name);
else
- pr_err("bcache device (NULL gendisk) stopped");
+ pr_err("bcache device (NULL gendisk) stopped\n");
if (d->c)
bcache_device_detach(d);
@@ -812,7 +906,8 @@ static void bcache_device_free(struct bcache_device *d)
}
static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
- sector_t sectors)
+ sector_t sectors, struct block_device *cached_bdev,
+ const struct block_device_operations *ops)
{
struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
@@ -859,17 +954,14 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
d->disk->major = bcache_major;
d->disk->first_minor = idx_to_first_minor(idx);
- d->disk->fops = &bcache_ops;
+ d->disk->fops = ops;
d->disk->private_data = d;
- q = blk_alloc_queue(GFP_KERNEL);
+ q = blk_alloc_queue(NUMA_NO_NODE);
if (!q)
return -ENOMEM;
- blk_queue_make_request(q, NULL);
d->disk->queue = q;
- q->queuedata = d;
- q->backing_dev_info->congested_data = d;
q->limits.max_hw_sectors = UINT_MAX;
q->limits.max_sectors = UINT_MAX;
q->limits.max_segment_size = UINT_MAX;
@@ -879,6 +971,20 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
q->limits.io_min = block_size;
q->limits.logical_block_size = block_size;
q->limits.physical_block_size = block_size;
+
+ if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
+ /*
+ * This should only happen with BCACHE_SB_VERSION_BDEV.
+ * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
+ */
+ pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
+ d->disk->disk_name, q->limits.logical_block_size,
+ PAGE_SIZE, bdev_logical_block_size(cached_bdev));
+
+ /* This also adjusts physical block size/min io size if needed */
+ blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
+ }
+
blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
@@ -931,11 +1037,11 @@ static int cached_dev_status_update(void *arg)
dc->offline_seconds = 0;
if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
- pr_err("%s: device offline for %d seconds",
+ pr_err("%s: device offline for %d seconds\n",
dc->backing_dev_name,
BACKING_DEV_OFFLINE_TIMEOUT);
- pr_err("%s: disable I/O request due to backing "
- "device offline", dc->disk.name);
+ pr_err("%s: disable I/O request due to backing device offline\n",
+ dc->disk.name);
dc->io_disable = true;
/* let others know earlier that io_disable is true */
smp_mb();
@@ -962,7 +1068,7 @@ int bch_cached_dev_run(struct cached_dev *dc)
};
if (dc->io_disable) {
- pr_err("I/O disabled on cached dev %s",
+ pr_err("I/O disabled on cached dev %s\n",
dc->backing_dev_name);
kfree(env[1]);
kfree(env[2]);
@@ -974,7 +1080,7 @@ int bch_cached_dev_run(struct cached_dev *dc)
kfree(env[1]);
kfree(env[2]);
kfree(buf);
- pr_info("cached dev %s is running already",
+ pr_info("cached dev %s is running already\n",
dc->backing_dev_name);
return -EBUSY;
}
@@ -1004,16 +1110,14 @@ int bch_cached_dev_run(struct cached_dev *dc)
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
sysfs_create_link(&disk_to_dev(d->disk)->kobj,
&d->kobj, "bcache")) {
- pr_err("Couldn't create bcache dev <-> disk sysfs symlinks");
+ pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
return -ENOMEM;
}
dc->status_update_thread = kthread_run(cached_dev_status_update,
dc, "bcache_status_update");
if (IS_ERR(dc->status_update_thread)) {
- pr_warn("failed to create bcache_status_update kthread, "
- "continue to run without monitoring backing "
- "device status");
+ pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
}
return 0;
@@ -1039,7 +1143,7 @@ static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
} while (time_out > 0);
if (time_out == 0)
- pr_warn("give up waiting for dc->writeback_write_update to quit");
+ pr_warn("give up waiting for dc->writeback_write_update to quit\n");
cancel_delayed_work_sync(&dc->writeback_rate_update);
}
@@ -1080,7 +1184,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
mutex_unlock(&bch_register_lock);
- pr_info("Caching disabled for %s", dc->backing_dev_name);
+ pr_info("Caching disabled for %s\n", dc->backing_dev_name);
/* Drop ref we took in cached_dev_detach() */
closure_put(&dc->disk.cl);
@@ -1115,25 +1219,25 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
struct cached_dev *exist_dc, *t;
int ret = 0;
- if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
- (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
+ if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) ||
+ (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16)))
return -ENOENT;
if (dc->disk.c) {
- pr_err("Can't attach %s: already attached",
+ pr_err("Can't attach %s: already attached\n",
dc->backing_dev_name);
return -EINVAL;
}
if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
- pr_err("Can't attach %s: shutting down",
+ pr_err("Can't attach %s: shutting down\n",
dc->backing_dev_name);
return -EINVAL;
}
- if (dc->sb.block_size < c->sb.block_size) {
+ if (dc->sb.block_size < c->cache->sb.block_size) {
/* Will die */
- pr_err("Couldn't attach %s: block size less than set's block size",
+ pr_err("Couldn't attach %s: block size less than set's block size\n",
dc->backing_dev_name);
return -EINVAL;
}
@@ -1141,7 +1245,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
/* Check whether already attached */
list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
- pr_err("Tried to attach %s but duplicate UUID already attached",
+ pr_err("Tried to attach %s but duplicate UUID already attached\n",
dc->backing_dev_name);
return -EINVAL;
@@ -1160,14 +1264,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
if (!u) {
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
- pr_err("Couldn't find uuid for %s in set",
+ pr_err("Couldn't find uuid for %s in set\n",
dc->backing_dev_name);
return -ENOENT;
}
u = uuid_find_empty(c);
if (!u) {
- pr_err("Not caching %s, no room for UUID",
+ pr_err("Not caching %s, no room for UUID\n",
dc->backing_dev_name);
return -EINVAL;
}
@@ -1188,7 +1292,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
u->first_reg = u->last_reg = rtime;
bch_uuid_write(c);
- memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
+ memcpy(dc->sb.set_uuid, c->set_uuid, 16);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
bch_write_bdev_super(dc, &cl);
@@ -1213,7 +1317,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
down_write(&dc->writeback_lock);
if (bch_cached_dev_writeback_start(dc)) {
up_write(&dc->writeback_lock);
- pr_err("Couldn't start writeback facilities for %s",
+ pr_err("Couldn't start writeback facilities for %s\n",
dc->disk.disk->disk_name);
return -ENOMEM;
}
@@ -1236,7 +1340,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
*/
kthread_stop(dc->writeback_thread);
cancel_writeback_rate_update_dwork(dc);
- pr_err("Couldn't run cached device %s",
+ pr_err("Couldn't run cached device %s\n",
dc->backing_dev_name);
return ret;
}
@@ -1244,13 +1348,19 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
bcache_device_link(&dc->disk, c, "bdev");
atomic_inc(&c->attached_dev_nr);
+ if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) {
+ pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
+ pr_err("Please update to the latest bcache-tools to create the cache device\n");
+ set_disk_ro(dc->disk.disk, 1);
+ }
+
/* Allow the writeback thread to proceed */
up_write(&dc->writeback_lock);
- pr_info("Caching %s as %s on set %pU",
+ pr_info("Caching %s as %s on set %pU\n",
dc->backing_dev_name,
dc->disk.disk->disk_name,
- dc->disk.c->sb.set_uuid);
+ dc->disk.c->set_uuid);
return 0;
}
@@ -1284,8 +1394,8 @@ static void cached_dev_free(struct closure *cl)
mutex_unlock(&bch_register_lock);
- if (dc->sb_bio.bi_inline_vecs[0].bv_page)
- put_page(bio_first_page_all(&dc->sb_bio));
+ if (dc->sb_disk)
+ put_page(virt_to_page(dc->sb_disk));
if (!IS_ERR_OR_NULL(dc->bdev))
blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1341,13 +1451,13 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
q->limits.raid_partial_stripes_expensive;
ret = bcache_device_init(&dc->disk, block_size,
- dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
+ dc->bdev->bd_part->nr_sects - dc->sb.data_offset,
+ dc->bdev, &bcache_cached_ops);
if (ret)
return ret;
- dc->disk.disk->queue->backing_dev_info->ra_pages =
- max(dc->disk.disk->queue->backing_dev_info->ra_pages,
- q->backing_dev_info->ra_pages);
+ blk_queue_io_opt(dc->disk.disk->queue,
+ max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
atomic_set(&dc->io_errors, 0);
dc->io_disable = false;
@@ -1362,7 +1472,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
/* Cached device - bcache superblock */
-static int register_bdev(struct cache_sb *sb, struct page *sb_page,
+static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
struct block_device *bdev,
struct cached_dev *dc)
{
@@ -1374,11 +1484,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page,
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
dc->bdev = bdev;
dc->bdev->bd_holder = dc;
-
- bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
- bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
- get_page(sb_page);
-
+ dc->sb_disk = sb_disk;
if (cached_dev_init(dc, sb->block_size << 9))
goto err;
@@ -1390,7 +1496,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page,
if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
goto err;
- pr_info("registered backing device %s", dc->backing_dev_name);
+ pr_info("registered backing device %s\n", dc->backing_dev_name);
list_add(&dc->list, &uncached_devices);
/* attach to a matched cache set if it exists */
@@ -1407,7 +1513,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page,
return 0;
err:
- pr_notice("error %s: %s", dc->backing_dev_name, err);
+ pr_notice("error %s: %s\n", dc->backing_dev_name, err);
bcache_device_stop(&dc->disk);
return ret;
}
@@ -1457,7 +1563,8 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
kobject_init(&d->kobj, &bch_flash_dev_ktype);
- if (bcache_device_init(d, block_bytes(c), u->sectors))
+ if (bcache_device_init(d, block_bytes(c->cache), u->sectors,
+ NULL, &bcache_flash_ops))
goto err;
bcache_device_attach(d, c, u - c->uuids);
@@ -1470,6 +1577,12 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
bcache_device_link(d, c, "volume");
+ if (bch_has_feature_obso_large_bucket(&c->cache->sb)) {
+ pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
+ pr_err("Please update to the latest bcache-tools to create the cache device\n");
+ set_disk_ro(d->disk, 1);
+ }
+
return 0;
err:
kobject_put(&d->kobj);
@@ -1502,7 +1615,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
u = uuid_find_empty(c);
if (!u) {
- pr_err("Can't create volume, no room for UUID");
+ pr_err("Can't create volume, no room for UUID\n");
return -EINVAL;
}
@@ -1528,7 +1641,7 @@ bool bch_cached_dev_error(struct cached_dev *dc)
smp_mb();
pr_err("stop %s: too many IO errors on backing device %s\n",
- dc->disk.disk->disk_name, dc->backing_dev_name);
+ dc->disk.disk->disk_name, dc->backing_dev_name);
bcache_device_stop(&dc->disk);
return true;
@@ -1539,6 +1652,7 @@ bool bch_cached_dev_error(struct cached_dev *dc)
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
if (c->on_error != ON_ERROR_PANIC &&
@@ -1546,20 +1660,22 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
return false;
if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
- pr_info("CACHE_SET_IO_DISABLE already set");
+ pr_info("CACHE_SET_IO_DISABLE already set\n");
/*
* XXX: we can be called from atomic context
* acquire_console_sem();
*/
- pr_err("bcache: error on %pU: ", c->sb.set_uuid);
-
va_start(args, fmt);
- vprintk(fmt, args);
- va_end(args);
- pr_err(", disabling caching\n");
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("error on %pU: %pV, disabling caching\n",
+ c->set_uuid, &vaf);
+
+ va_end(args);
if (c->on_error == ON_ERROR_PANIC)
panic("panic forced after error\n");
@@ -1581,7 +1697,6 @@ static void cache_set_free(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, cl);
struct cache *ca;
- unsigned int i;
debugfs_remove(c->debug);
@@ -1590,15 +1705,16 @@ static void cache_set_free(struct closure *cl)
bch_journal_free(c);
mutex_lock(&bch_register_lock);
- for_each_cache(ca, c, i)
- if (ca) {
- ca->set = NULL;
- c->cache[ca->sb.nr_this_dev] = NULL;
- kobject_put(&ca->kobj);
- }
-
bch_bset_sort_state_free(&c->sort);
- free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
+ free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));
+
+ ca = c->cache;
+ if (ca) {
+ ca->set = NULL;
+ c->cache = NULL;
+ kobject_put(&ca->kobj);
+ }
+
if (c->moving_gc_wq)
destroy_workqueue(c->moving_gc_wq);
@@ -1611,7 +1727,7 @@ static void cache_set_free(struct closure *cl)
list_del(&c->list);
mutex_unlock(&bch_register_lock);
- pr_info("Cache set %pU unregistered", c->sb.set_uuid);
+ pr_info("Cache set %pU unregistered\n", c->set_uuid);
wake_up(&unregister_wait);
closure_debug_destroy(&c->cl);
@@ -1621,9 +1737,8 @@ static void cache_set_free(struct closure *cl)
static void cache_set_flush(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
- struct cache *ca;
+ struct cache *ca = c->cache;
struct btree *b;
- unsigned int i;
bch_cache_accounting_destroy(&c->accounting);
@@ -1648,9 +1763,8 @@ static void cache_set_flush(struct closure *cl)
mutex_unlock(&b->write_lock);
}
- for_each_cache(ca, c, i)
- if (ca->alloc_thread)
- kthread_stop(ca->alloc_thread);
+ if (ca->alloc_thread)
+ kthread_stop(ca->alloc_thread);
if (c->journal.cur) {
cancel_delayed_work_sync(&c->journal.work);
@@ -1682,15 +1796,15 @@ static void conditional_stop_bcache_device(struct cache_set *c,
struct cached_dev *dc)
{
if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
- pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
- d->disk->disk_name, c->sb.set_uuid);
+ pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n",
+ d->disk->disk_name, c->set_uuid);
bcache_device_stop(d);
} else if (atomic_read(&dc->has_dirty)) {
/*
* dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
* and dc->has_dirty == 1
*/
- pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
+ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n",
d->disk->disk_name);
/*
* There might be a small time gap that cache set is
@@ -1712,7 +1826,7 @@ static void conditional_stop_bcache_device(struct cache_set *c,
* dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
* and dc->has_dirty == 0
*/
- pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
+ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n",
d->disk->disk_name);
}
}
@@ -1760,12 +1874,13 @@ void bch_cache_set_unregister(struct cache_set *c)
bch_cache_set_stop(c);
}
-#define alloc_bucket_pages(gfp, c) \
- ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c))))
+#define alloc_meta_bucket_pages(gfp, sb) \
+ ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
int iter_size;
+ struct cache *ca = container_of(sb, struct cache, sb);
struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
if (!c)
@@ -1787,17 +1902,16 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
bch_cache_accounting_init(&c->accounting, &c->cl);
- memcpy(c->sb.set_uuid, sb->set_uuid, 16);
- c->sb.block_size = sb->block_size;
- c->sb.bucket_size = sb->bucket_size;
- c->sb.nr_in_set = sb->nr_in_set;
- c->sb.last_mount = sb->last_mount;
+ memcpy(c->set_uuid, sb->set_uuid, 16);
+
+ c->cache = ca;
+ c->cache->set = c;
c->bucket_bits = ilog2(sb->bucket_size);
c->block_bits = ilog2(sb->block_size);
- c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
+ c->nr_uuids = meta_bucket_bytes(sb) / sizeof(struct uuid_entry);
c->devices_max_used = 0;
atomic_set(&c->attached_dev_nr, 0);
- c->btree_pages = bucket_pages(c);
+ c->btree_pages = meta_bucket_pages(sb);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
BTREE_MAX_PAGES);
@@ -1823,29 +1937,52 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
- iter_size = (sb->bucket_size / sb->block_size + 1) *
+ iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
sizeof(struct btree_iter_set);
- if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
- mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
- mempool_init_kmalloc_pool(&c->bio_meta, 2,
- sizeof(struct bbio) + sizeof(struct bio_vec) *
- bucket_pages(c)) ||
- mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
- !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
- !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
- WQ_MEM_RECLAIM, 0)) ||
- bch_journal_alloc(c) ||
- bch_btree_cache_alloc(c) ||
- bch_open_buckets_alloc(c) ||
- bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
+ c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
+ if (!c->devices)
+ goto err;
+
+ if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
+ goto err;
+
+ if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
+ sizeof(struct bbio) +
+ sizeof(struct bio_vec) * meta_bucket_pages(sb)))
+ goto err;
+
+ if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
+ goto err;
+
+ if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
+ BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
+ goto err;
+
+ c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
+ if (!c->uuids)
+ goto err;
+
+ c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
+ if (!c->moving_gc_wq)
+ goto err;
+
+ if (bch_journal_alloc(c))
+ goto err;
+
+ if (bch_btree_cache_alloc(c))
+ goto err;
+
+ if (bch_open_buckets_alloc(c))
+ goto err;
+
+ if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
goto err;
c->congested_read_threshold_us = 2000;
c->congested_write_threshold_us = 20000;
c->error_limit = DEFAULT_IO_ERROR_LIMIT;
+ c->idle_max_writeback_rate_enabled = 1;
WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
return c;
@@ -1858,19 +1995,17 @@ static int run_cache_set(struct cache_set *c)
{
const char *err = "cannot allocate memory";
struct cached_dev *dc, *t;
- struct cache *ca;
+ struct cache *ca = c->cache;
struct closure cl;
- unsigned int i;
LIST_HEAD(journal);
struct journal_replay *l;
closure_init_stack(&cl);
- for_each_cache(ca, c, i)
- c->nbuckets += ca->sb.nbuckets;
+ c->nbuckets = ca->sb.nbuckets;
set_gc_sectors(c);
- if (CACHE_SYNC(&c->sb)) {
+ if (CACHE_SYNC(&c->cache->sb)) {
struct bkey *k;
struct jset *j;
@@ -1878,7 +2013,7 @@ static int run_cache_set(struct cache_set *c)
if (bch_journal_read(c, &journal))
goto err;
- pr_debug("btree_journal_read() done");
+ pr_debug("btree_journal_read() done\n");
err = "no journal entries found";
if (list_empty(&journal))
@@ -1887,8 +2022,8 @@ static int run_cache_set(struct cache_set *c)
j = &list_entry(journal.prev, struct journal_replay, list)->j;
err = "IO error reading priorities";
- for_each_cache(ca, c, i)
- prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
+ if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
+ goto err;
/*
* If prio_read() fails it'll call cache_set_error and we'll
@@ -1920,26 +2055,9 @@ static int run_cache_set(struct cache_set *c)
if (bch_btree_check(c))
goto err;
- /*
- * bch_btree_check() may occupy too much system memory which
- * has negative effects to user space application (e.g. data
- * base) performance. Shrink the mca cache memory proactively
- * here to avoid competing memory with user space workloads..
- */
- if (!c->shrinker_disabled) {
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = c->btree_cache_used * c->btree_pages;
- /* first run to clear b->accessed tag */
- c->shrink.scan_objects(&c->shrink, &sc);
- /* second run to reap non-accessed nodes */
- c->shrink.scan_objects(&c->shrink, &sc);
- }
-
bch_journal_mark(c, &journal);
bch_initial_gc_finish(c);
- pr_debug("btree_check() done");
+ pr_debug("btree_check() done\n");
/*
* bcache_journal_next() can't happen sooner, or
@@ -1949,9 +2067,8 @@ static int run_cache_set(struct cache_set *c)
bch_journal_next(&c->journal);
err = "error starting allocator thread";
- for_each_cache(ca, c, i)
- if (bch_cache_allocator_start(ca))
- goto err;
+ if (bch_cache_allocator_start(ca))
+ goto err;
/*
* First place it's safe to allocate: btree_check() and
@@ -1970,28 +2087,23 @@ static int run_cache_set(struct cache_set *c)
if (bch_journal_replay(c, &journal))
goto err;
} else {
- pr_notice("invalidating existing data");
+ unsigned int j;
- for_each_cache(ca, c, i) {
- unsigned int j;
+ pr_notice("invalidating existing data\n");
+ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+ 2, SB_JOURNAL_BUCKETS);
- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
- 2, SB_JOURNAL_BUCKETS);
-
- for (j = 0; j < ca->sb.keys; j++)
- ca->sb.d[j] = ca->sb.first_bucket + j;
- }
+ for (j = 0; j < ca->sb.keys; j++)
+ ca->sb.d[j] = ca->sb.first_bucket + j;
bch_initial_gc_finish(c);
err = "error starting allocator thread";
- for_each_cache(ca, c, i)
- if (bch_cache_allocator_start(ca))
- goto err;
+ if (bch_cache_allocator_start(ca))
+ goto err;
mutex_lock(&c->bucket_lock);
- for_each_cache(ca, c, i)
- bch_prio_write(ca, true);
+ bch_prio_write(ca, true);
mutex_unlock(&c->bucket_lock);
err = "cannot allocate new UUID bucket";
@@ -2016,7 +2128,7 @@ static int run_cache_set(struct cache_set *c)
* everything is set up - fortunately journal entries won't be
* written until the SET_CACHE_SYNC() here:
*/
- SET_CACHE_SYNC(&c->sb, true);
+ SET_CACHE_SYNC(&c->cache->sb, true);
bch_journal_next(&c->journal);
bch_journal_meta(c, &cl);
@@ -2027,9 +2139,12 @@ static int run_cache_set(struct cache_set *c)
goto err;
closure_sync(&cl);
- c->sb.last_mount = (u32)ktime_get_real_seconds();
+ c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
bcache_write_super(c);
+ if (bch_has_feature_obso_large_bucket(&c->cache->sb))
+ pr_err("Detect obsoleted large bucket layout, all attached bcache device will be read-only\n");
+
list_for_each_entry_safe(dc, t, &uncached_devices, list)
bch_cached_dev_attach(dc, c, NULL);
@@ -2051,13 +2166,6 @@ static int run_cache_set(struct cache_set *c)
return -EIO;
}
-static bool can_attach_cache(struct cache *ca, struct cache_set *c)
-{
- return ca->sb.block_size == c->sb.block_size &&
- ca->sb.bucket_size == c->sb.bucket_size &&
- ca->sb.nr_in_set == c->sb.nr_in_set;
-}
-
static const char *register_cache_set(struct cache *ca)
{
char buf[12];
@@ -2065,16 +2173,10 @@ static const char *register_cache_set(struct cache *ca)
struct cache_set *c;
list_for_each_entry(c, &bch_cache_sets, list)
- if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
- if (c->cache[ca->sb.nr_this_dev])
+ if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) {
+ if (c->cache)
return "duplicate cache set member";
- if (!can_attach_cache(ca, c))
- return "cache sb does not match set";
-
- if (!CACHE_SYNC(&ca->sb))
- SET_CACHE_SYNC(&c->sb, false);
-
goto found;
}
@@ -2083,7 +2185,7 @@ static const char *register_cache_set(struct cache *ca)
return err;
err = "error creating kobject";
- if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
+ if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) ||
kobject_add(&c->internal, &c->kobj, "internal"))
goto err;
@@ -2099,31 +2201,13 @@ static const char *register_cache_set(struct cache *ca)
sysfs_create_link(&c->kobj, &ca->kobj, buf))
goto err;
- /*
- * A special case is both ca->sb.seq and c->sb.seq are 0,
- * such condition happens on a new created cache device whose
- * super block is never flushed yet. In this case c->sb.version
- * and other members should be updated too, otherwise we will
- * have a mistaken super block version in cache set.
- */
- if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) {
- c->sb.version = ca->sb.version;
- memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
- c->sb.flags = ca->sb.flags;
- c->sb.seq = ca->sb.seq;
- pr_debug("set version = %llu", c->sb.version);
- }
-
kobject_get(&ca->kobj);
ca->set = c;
- ca->set->cache[ca->sb.nr_this_dev] = ca;
- c->cache_by_alloc[c->caches_loaded++] = ca;
+ ca->set->cache = ca;
- if (c->caches_loaded == c->sb.nr_in_set) {
- err = "failed to run cache set";
- if (run_cache_set(c) < 0)
- goto err;
- }
+ err = "failed to run cache set";
+ if (run_cache_set(c) < 0)
+ goto err;
return NULL;
err:
@@ -2140,11 +2224,11 @@ void bch_cache_release(struct kobject *kobj)
unsigned int i;
if (ca->set) {
- BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
- ca->set->cache[ca->sb.nr_this_dev] = NULL;
+ BUG_ON(ca->set->cache != ca);
+ ca->set->cache = NULL;
}
- free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
+ free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
kfree(ca->prio_buckets);
vfree(ca->buckets);
@@ -2154,8 +2238,8 @@ void bch_cache_release(struct kobject *kobj)
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
- if (ca->sb_bio.bi_inline_vecs[0].bv_page)
- put_page(bio_first_page_all(&ca->sb_bio));
+ if (ca->sb_disk)
+ put_page(virt_to_page(ca->sb_disk));
if (!IS_ERR_OR_NULL(ca->bdev))
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -2241,7 +2325,7 @@ static int cache_alloc(struct cache *ca)
goto err_prio_buckets_alloc;
}
- ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
+ ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
if (!ca->disk_buckets) {
err = "ca->disk_buckets alloc failed";
goto err_disk_buckets_alloc;
@@ -2273,11 +2357,11 @@ static int cache_alloc(struct cache *ca)
err_free:
module_put(THIS_MODULE);
if (err)
- pr_notice("error %s: %s", ca->cache_dev_name, err);
+ pr_notice("error %s: %s\n", ca->cache_dev_name, err);
return ret;
}
-static int register_cache(struct cache_sb *sb, struct page *sb_page,
+static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
struct block_device *bdev, struct cache *ca)
{
const char *err = NULL; /* must be set for any error case */
@@ -2287,10 +2371,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
memcpy(&ca->sb, sb, sizeof(struct cache_sb));
ca->bdev = bdev;
ca->bdev->bd_holder = ca;
-
- bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
- bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
- get_page(sb_page);
+ ca->sb_disk = sb_disk;
if (blk_queue_discard(bdev_get_queue(bdev)))
ca->discard = CACHE_DISCARD(&ca->sb);
@@ -2330,14 +2411,14 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
goto out;
}
- pr_info("registered cache device %s", ca->cache_dev_name);
+ pr_info("registered cache device %s\n", ca->cache_dev_name);
out:
kobject_put(&ca->kobj);
err:
if (err)
- pr_notice("error %s: %s", ca->cache_dev_name, err);
+ pr_notice("error %s: %s\n", ca->cache_dev_name, err);
return ret;
}
@@ -2372,13 +2453,14 @@ static bool bch_is_open_backing(struct block_device *bdev)
static bool bch_is_open_cache(struct block_device *bdev)
{
struct cache_set *c, *tc;
- struct cache *ca;
- unsigned int i;
- list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
- for_each_cache(ca, c, i)
- if (ca->bdev == bdev)
- return true;
+ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
+ struct cache *ca = c->cache;
+
+ if (ca->bdev == bdev)
+ return true;
+ }
+
return false;
}
@@ -2387,15 +2469,98 @@ static bool bch_is_open(struct block_device *bdev)
return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}
+struct async_reg_args {
+ struct delayed_work reg_work;
+ char *path;
+ struct cache_sb *sb;
+ struct cache_sb_disk *sb_disk;
+ struct block_device *bdev;
+};
+
+static void register_bdev_worker(struct work_struct *work)
+{
+ int fail = false;
+ struct async_reg_args *args =
+ container_of(work, struct async_reg_args, reg_work.work);
+ struct cached_dev *dc;
+
+ dc = kzalloc(sizeof(*dc), GFP_KERNEL);
+ if (!dc) {
+ fail = true;
+ put_page(virt_to_page(args->sb_disk));
+ blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ goto out;
+ }
+
+ mutex_lock(&bch_register_lock);
+ if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
+ fail = true;
+ mutex_unlock(&bch_register_lock);
+
+out:
+ if (fail)
+ pr_info("error %s: fail to register backing device\n",
+ args->path);
+ kfree(args->sb);
+ kfree(args->path);
+ kfree(args);
+ module_put(THIS_MODULE);
+}
+
+static void register_cache_worker(struct work_struct *work)
+{
+ int fail = false;
+ struct async_reg_args *args =
+ container_of(work, struct async_reg_args, reg_work.work);
+ struct cache *ca;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca) {
+ fail = true;
+ put_page(virt_to_page(args->sb_disk));
+ blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ goto out;
+ }
+
+ /* blkdev_put() will be called in bch_cache_release() */
+ if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
+ fail = true;
+
+out:
+ if (fail)
+ pr_info("error %s: fail to register cache device\n",
+ args->path);
+ kfree(args->sb);
+ kfree(args->path);
+ kfree(args);
+ module_put(THIS_MODULE);
+}
+
+static void register_device_aync(struct async_reg_args *args)
+{
+ if (SB_IS_BDEV(args->sb))
+ INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
+ else
+ INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
+
+ /* 10 jiffies is enough for a delay */
+ queue_delayed_work(system_wq, &args->reg_work, 10);
+}
+
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
const char *err;
char *path = NULL;
struct cache_sb *sb;
- struct block_device *bdev = NULL;
- struct page *sb_page;
+ struct cache_sb_disk *sb_disk;
+ struct block_device *bdev;
ssize_t ret;
+ bool async_registration = false;
+
+#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
+ async_registration = true;
+#endif
ret = -EBUSY;
err = "failed to reference bcache module";
@@ -2444,11 +2609,32 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (set_blocksize(bdev, 4096))
goto out_blkdev_put;
- err = read_super(sb, bdev, &sb_page);
+ err = read_super(sb, bdev, &sb_disk);
if (err)
goto out_blkdev_put;
err = "failed to register device";
+
+ if (async_registration) {
+ /* register in asynchronous way */
+ struct async_reg_args *args =
+ kzalloc(sizeof(struct async_reg_args), GFP_KERNEL);
+
+ if (!args) {
+ ret = -ENOMEM;
+ err = "cannot allocate memory";
+ goto out_put_sb_page;
+ }
+
+ args->path = path;
+ args->sb = sb;
+ args->sb_disk = sb_disk;
+ args->bdev = bdev;
+ register_device_aync(args);
+ /* No wait and returns to user space */
+ goto async_done;
+ }
+
if (SB_IS_BDEV(sb)) {
struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
@@ -2456,13 +2642,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
goto out_put_sb_page;
mutex_lock(&bch_register_lock);
- ret = register_bdev(sb, sb_page, bdev, dc);
+ ret = register_bdev(sb, sb_disk, bdev, dc);
mutex_unlock(&bch_register_lock);
/* blkdev_put() will be called in cached_dev_free() */
- if (ret < 0) {
- bdev = NULL;
- goto out_put_sb_page;
- }
+ if (ret < 0)
+ goto out_free_sb;
} else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -2470,24 +2654,21 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
goto out_put_sb_page;
/* blkdev_put() will be called in bch_cache_release() */
- if (register_cache(sb, sb_page, bdev, ca) != 0) {
- bdev = NULL;
- goto out_put_sb_page;
- }
+ if (register_cache(sb, sb_disk, bdev, ca) != 0)
+ goto out_free_sb;
}
- put_page(sb_page);
done:
kfree(sb);
kfree(path);
module_put(THIS_MODULE);
+async_done:
return size;
out_put_sb_page:
- put_page(sb_page);
+ put_page(virt_to_page(sb_disk));
out_blkdev_put:
- if (bdev)
- blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
out_free_sb:
kfree(sb);
out_free_path:
@@ -2496,7 +2677,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
out_module_put:
module_put(THIS_MODULE);
out:
- pr_info("error %s: %s", path?path:"", err);
+ pr_info("error %s: %s\n", path?path:"", err);
return ret;
}
@@ -2529,7 +2710,7 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
char *pdev_set_uuid = pdev->dc->sb.set_uuid;
- char *set_uuid = c->sb.uuid;
+ char *set_uuid = c->set_uuid;
if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
list_del(&pdev->list);
@@ -2541,7 +2722,7 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
mutex_unlock(&bch_register_lock);
list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
- pr_info("delete pdev %p", pdev);
+ pr_info("delete pdev %p\n", pdev);
list_del(&pdev->list);
bcache_device_stop(&pdev->dc->disk);
kfree(pdev);
@@ -2584,7 +2765,7 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
mutex_unlock(&bch_register_lock);
- pr_info("Stopping all devices:");
+ pr_info("Stopping all devices:\n");
/*
* The reason bch_register_lock is not held to call
@@ -2634,9 +2815,9 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
finish_wait(&unregister_wait, &wait);
if (stopped)
- pr_info("All devices stopped");
+ pr_info("All devices stopped\n");
else
- pr_notice("Timeout waiting for devices to be closed");
+ pr_notice("Timeout waiting for devices to be closed\n");
out:
mutex_unlock(&bch_register_lock);
}
@@ -2675,7 +2856,7 @@ static void check_module_parameters(void)
if (bch_cutoff_writeback_sync == 0)
bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
- pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
+ pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u\n",
bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
}
@@ -2683,13 +2864,13 @@ static void check_module_parameters(void)
if (bch_cutoff_writeback == 0)
bch_cutoff_writeback = CUTOFF_WRITEBACK;
else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
- pr_warn("set bch_cutoff_writeback (%u) to max value %u",
+ pr_warn("set bch_cutoff_writeback (%u) to max value %u\n",
bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
}
if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
- pr_warn("set bch_cutoff_writeback (%u) to %u",
+ pr_warn("set bch_cutoff_writeback (%u) to %u\n",
bch_cutoff_writeback, bch_cutoff_writeback_sync);
bch_cutoff_writeback = bch_cutoff_writeback_sync;
}
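
Note on the super.c hunks above: most of them share one pattern. A cache set now holds exactly one struct cache (c->cache), so every for_each_cache() walk collapses into a direct dereference. A minimal before/after sketch, reusing names from the hunks purely as an illustration (not code from the patch):

/* Before: iterate every cache attached to the set. */
static void write_prios_old(struct cache_set *c)
{
        struct cache *ca;
        unsigned int i;

        for_each_cache(ca, c, i)
                bch_prio_write(ca, true);
}

/* After: the set owns a single cache, so no loop is needed. */
static void write_prios_new(struct cache_set *c)
{
        struct cache *ca = c->cache;

        bch_prio_write(ca, true);
}
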
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 7f0fb4b..554e3af 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -11,6 +11,7 @@
#include "btree.h"
#include "request.h"
#include "writeback.h"
+#include "features.h"
#include <linux/blkdev.h>
#include <linux/sort.h>
@@ -88,6 +89,9 @@ read_attribute(btree_used_percent);
read_attribute(average_key_size);
read_attribute(dirty_data);
read_attribute(bset_tree_stats);
+read_attribute(feature_compat);
+read_attribute(feature_ro_compat);
+read_attribute(feature_incompat);
read_attribute(state);
read_attribute(cache_read_races);
@@ -141,6 +145,7 @@ rw_attribute(expensive_debug_checks);
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
+rw_attribute(idle_max_writeback_rate);
rw_attribute(gc_after_writeback);
rw_attribute(size);
@@ -153,7 +158,7 @@ static ssize_t bch_snprint_string_list(char *buf,
size_t i;
for (i = 0; list[i]; i++)
- out += snprintf(out, buf + size - out,
+ out += scnprintf(out, buf + size - out,
i == selected ? "[%s] " : "%s ", list[i]);
out[-1] = '\n';
@@ -420,7 +425,7 @@ STORE(__cached_dev)
return size;
}
if (v == -ENOENT)
- pr_err("Can't attach %s: cache set not found", buf);
+ pr_err("Can't attach %s: cache set not found\n", buf);
return v;
}
@@ -454,7 +459,7 @@ STORE(bch_cached_dev)
*/
if (dc->writeback_running) {
dc->writeback_running = false;
- pr_err("%s: failed to run non-existent writeback thread",
+ pr_err("%s: failed to run non-existent writeback thread\n",
dc->disk.disk->disk_name);
}
} else
@@ -706,10 +711,10 @@ SHOW(__bch_cache_set)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
- sysfs_print(synchronous, CACHE_SYNC(&c->sb));
+ sysfs_print(synchronous, CACHE_SYNC(&c->cache->sb));
sysfs_print(journal_delay_ms, c->journal_delay_ms);
- sysfs_hprint(bucket_size, bucket_bytes(c));
- sysfs_hprint(block_size, block_bytes(c));
+ sysfs_hprint(bucket_size, bucket_bytes(c->cache));
+ sysfs_hprint(block_size, block_bytes(c->cache));
sysfs_print(tree_depth, c->root->level);
sysfs_print(root_usage_percent, bch_root_usage(c));
@@ -769,6 +774,8 @@ SHOW(__bch_cache_set)
sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_printf(idle_max_writeback_rate, "%i",
+ c->idle_max_writeback_rate_enabled);
sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
sysfs_printf(io_disable, "%i",
test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -776,6 +783,13 @@ SHOW(__bch_cache_set)
if (attr == &sysfs_bset_tree_stats)
return bch_bset_print_stats(c, buf);
+ if (attr == &sysfs_feature_compat)
+ return bch_print_cache_set_feature_compat(c, buf, PAGE_SIZE);
+ if (attr == &sysfs_feature_ro_compat)
+ return bch_print_cache_set_feature_ro_compat(c, buf, PAGE_SIZE);
+ if (attr == &sysfs_feature_incompat)
+ return bch_print_cache_set_feature_incompat(c, buf, PAGE_SIZE);
+
return 0;
}
SHOW_LOCKED(bch_cache_set)
@@ -798,8 +812,8 @@ STORE(__bch_cache_set)
if (attr == &sysfs_synchronous) {
bool sync = strtoul_or_return(buf);
- if (sync != CACHE_SYNC(&c->sb)) {
- SET_CACHE_SYNC(&c->sb, sync);
+ if (sync != CACHE_SYNC(&c->cache->sb)) {
+ SET_CACHE_SYNC(&c->cache->sb, sync);
bcache_write_super(c);
}
}
@@ -869,11 +883,11 @@ STORE(__bch_cache_set)
if (v) {
if (test_and_set_bit(CACHE_SET_IO_DISABLE,
&c->flags))
- pr_warn("CACHE_SET_IO_DISABLE already set");
+ pr_warn("CACHE_SET_IO_DISABLE already set\n");
} else {
if (!test_and_clear_bit(CACHE_SET_IO_DISABLE,
&c->flags))
- pr_warn("CACHE_SET_IO_DISABLE already cleared");
+ pr_warn("CACHE_SET_IO_DISABLE already cleared\n");
}
}
@@ -886,6 +900,9 @@ STORE(__bch_cache_set)
sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite);
sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled);
+ sysfs_strtoul_bool(idle_max_writeback_rate,
+ c->idle_max_writeback_rate_enabled);
+
/*
* write gc_after_writeback here may overwrite an already set
* BCH_DO_AUTO_GC, it doesn't matter because this flag will be
@@ -976,10 +993,14 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_gc_always_rewrite,
&sysfs_btree_shrinker_disabled,
&sysfs_copy_gc_enabled,
+ &sysfs_idle_max_writeback_rate,
&sysfs_gc_after_writeback,
&sysfs_io_disable,
&sysfs_cutoff_writeback,
&sysfs_cutoff_writeback_sync,
+ &sysfs_feature_compat,
+ &sysfs_feature_ro_compat,
+ &sysfs_feature_incompat,
NULL
};
KTYPE(bch_cache_set_internal);
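
The snprintf() -> scnprintf() change in bch_snprint_string_list() above fixes a subtle overflow: snprintf() returns the length that *would* have been written, so accumulating its return value can push the output pointer past the end of the buffer once truncation starts, whereas scnprintf() returns the number of characters actually written. A minimal sketch of the safe accumulation pattern (illustration only; assumes a non-empty, NULL-terminated list):

#include <linux/kernel.h>

static ssize_t print_string_list(char *buf, size_t size, const char * const list[])
{
        char *out = buf;
        size_t i;

        for (i = 0; list[i]; i++)
                out += scnprintf(out, buf + size - out, "%s ", list[i]);

        out[-1] = '\n';         /* replace the trailing space */
        return out - buf;
}
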
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 62fb917..ae380bc 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -33,27 +33,27 @@ int bch_ ## name ## _h(const char *cp, type *res) \
case 'y': \
case 'z': \
u++; \
- /* fall through */ \
+ fallthrough; \
case 'e': \
u++; \
- /* fall through */ \
+ fallthrough; \
case 'p': \
u++; \
- /* fall through */ \
+ fallthrough; \
case 't': \
u++; \
- /* fall through */ \
+ fallthrough; \
case 'g': \
u++; \
- /* fall through */ \
+ fallthrough; \
case 'm': \
u++; \
- /* fall through */ \
+ fallthrough; \
case 'k': \
u++; \
if (e++ == cp) \
return -EINVAL; \
- /* fall through */ \
+ fallthrough; \
case '\n': \
case '\0': \
if (*e == '\n') \
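
The util.c hunk above swaps /* fall through */ comments for the fallthrough pseudo-keyword, which lets -Wimplicit-fallthrough verify that each case deliberately falls into the next one. A small sketch of the construct (illustration only, not from the patch):

#include <linux/compiler_attributes.h>

static int suffix_shift(char suffix)
{
        int shift = 0;

        switch (suffix) {
        case 'g':
                shift += 10;
                fallthrough;
        case 'm':
                shift += 10;
                fallthrough;
        case 'k':
                shift += 10;
                break;
        default:
                break;
        }

        return shift;           /* 'k' -> 10, 'm' -> 20, 'g' -> 30 */
}
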
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 0b02210..3c74996 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -35,7 +35,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
* This is the size of the cache, minus the amount used for
* flash-only devices
*/
- uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
+ uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
atomic_long_read(&c->flash_dev_dirty_sectors);
/*
@@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc)
static bool set_at_max_writeback_rate(struct cache_set *c,
struct cached_dev *dc)
{
+ /* Don't set max writeback rate if it is disabled */
+ if (!c->idle_max_writeback_rate_enabled)
+ return false;
+
/* Don't set max writeback rate if gc is running */
if (!c->gc_mark_valid)
return false;
@@ -179,7 +183,7 @@ static void update_writeback_rate(struct work_struct *work)
*/
set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
- smp_mb();
+ smp_mb__after_atomic();
/*
* CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -189,7 +193,7 @@ static void update_writeback_rate(struct work_struct *work)
test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
- smp_mb();
+ smp_mb__after_atomic();
return;
}
@@ -225,7 +229,7 @@ static void update_writeback_rate(struct work_struct *work)
*/
clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
- smp_mb();
+ smp_mb__after_atomic();
}
static unsigned int writeback_delay(struct cached_dev *dc,
@@ -455,10 +459,8 @@ static void read_dirty(struct cached_dev *dc)
for (i = 0; i < nk; i++) {
w = keys[i];
- io = kzalloc(sizeof(struct dirty_io) +
- sizeof(struct bio_vec) *
- DIV_ROUND_UP(KEY_SIZE(&w->key),
- PAGE_SECTORS),
+ io = kzalloc(struct_size(io, bio.bi_inline_vecs,
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
GFP_KERNEL);
if (!io)
goto err;
@@ -785,7 +787,9 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
return MAP_CONTINUE;
}
-void bch_sectors_dirty_init(struct bcache_device *d)
+static int bch_root_node_dirty_init(struct cache_set *c,
+ struct bcache_device *d,
+ struct bkey *k)
{
struct sectors_dirty_init op;
int ret;
@@ -796,16 +800,164 @@ void bch_sectors_dirty_init(struct bcache_device *d)
op.start = KEY(op.inode, 0, 0);
do {
- ret = bch_btree_map_keys(&op.op, d->c, &op.start,
- sectors_dirty_init_fn, 0);
+ ret = bcache_btree(map_keys_recurse,
+ k,
+ c->root,
+ &op.op,
+ &op.start,
+ sectors_dirty_init_fn,
+ 0);
if (ret == -EAGAIN)
schedule_timeout_interruptible(
msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
else if (ret < 0) {
- pr_warn("sectors dirty init failed, ret=%d!", ret);
+ pr_warn("sectors dirty init failed, ret=%d!\n", ret);
break;
}
} while (ret == -EAGAIN);
+
+ return ret;
+}
+
+static int bch_dirty_init_thread(void *arg)
+{
+ struct dirty_init_thrd_info *info = arg;
+ struct bch_dirty_init_state *state = info->state;
+ struct cache_set *c = state->c;
+ struct btree_iter iter;
+ struct bkey *k, *p;
+ int cur_idx, prev_idx, skip_nr;
+
+ k = p = NULL;
+ cur_idx = prev_idx = 0;
+
+ bch_btree_iter_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ BUG_ON(!k);
+
+ p = k;
+
+ while (k) {
+ spin_lock(&state->idx_lock);
+ cur_idx = state->key_idx;
+ state->key_idx++;
+ spin_unlock(&state->idx_lock);
+
+ skip_nr = cur_idx - prev_idx;
+
+ while (skip_nr) {
+ k = bch_btree_iter_next_filter(&iter,
+ &c->root->keys,
+ bch_ptr_bad);
+ if (k)
+ p = k;
+ else {
+ atomic_set(&state->enough, 1);
+ /* Update state->enough earlier */
+ smp_mb__after_atomic();
+ goto out;
+ }
+ skip_nr--;
+ cond_resched();
+ }
+
+ if (p) {
+ if (bch_root_node_dirty_init(c, state->d, p) < 0)
+ goto out;
+ }
+
+ p = NULL;
+ prev_idx = cur_idx;
+ cond_resched();
+ }
+
+out:
+ /* In order to wake up state->wait in time */
+ smp_mb__before_atomic();
+ if (atomic_dec_and_test(&state->started))
+ wake_up(&state->wait);
+
+ return 0;
+}
+
+static int bch_btre_dirty_init_thread_nr(void)
+{
+ int n = num_online_cpus()/2;
+
+ if (n == 0)
+ n = 1;
+ else if (n > BCH_DIRTY_INIT_THRD_MAX)
+ n = BCH_DIRTY_INIT_THRD_MAX;
+
+ return n;
+}
+
+void bch_sectors_dirty_init(struct bcache_device *d)
+{
+ int i;
+ struct bkey *k = NULL;
+ struct btree_iter iter;
+ struct sectors_dirty_init op;
+ struct cache_set *c = d->c;
+ struct bch_dirty_init_state *state;
+ char name[32];
+
+ /* Just count root keys if no leaf node */
+ if (c->root->level == 0) {
+ bch_btree_op_init(&op.op, -1);
+ op.inode = d->id;
+ op.count = 0;
+ op.start = KEY(op.inode, 0, 0);
+
+ for_each_key_filter(&c->root->keys,
+ k, &iter, bch_ptr_invalid)
+ sectors_dirty_init_fn(&op.op, c->root, k);
+ return;
+ }
+
+ state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL);
+ if (!state) {
+ pr_warn("sectors dirty init failed: cannot allocate memory\n");
+ return;
+ }
+
+ state->c = c;
+ state->d = d;
+ state->total_threads = bch_btre_dirty_init_thread_nr();
+ state->key_idx = 0;
+ spin_lock_init(&state->idx_lock);
+ atomic_set(&state->started, 0);
+ atomic_set(&state->enough, 0);
+ init_waitqueue_head(&state->wait);
+
+ for (i = 0; i < state->total_threads; i++) {
+ /* Fetch latest state->enough earlier */
+ smp_mb__before_atomic();
+ if (atomic_read(&state->enough))
+ break;
+
+ state->infos[i].state = state;
+ atomic_inc(&state->started);
+ snprintf(name, sizeof(name), "bch_dirty_init[%d]", i);
+
+ state->infos[i].thread =
+ kthread_run(bch_dirty_init_thread,
+ &state->infos[i],
+ name);
+ if (IS_ERR(state->infos[i].thread)) {
+ pr_err("fails to run thread bch_dirty_init[%d]\n", i);
+ for (--i; i >= 0; i--)
+ kthread_stop(state->infos[i].thread);
+ goto out;
+ }
+ }
+
+ wait_event_interruptible(state->wait,
+ atomic_read(&state->started) == 0 ||
+ test_bit(CACHE_SET_IO_DISABLE, &c->flags));
+
+out:
+ kfree(state);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
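
The bch_sectors_dirty_init() rework above parallelises the initial dirty-sector scan: each worker claims the next unclaimed top-level key index under state->idx_lock, skips the btree iterator forward to that key, and then walks the corresponding subtree via bch_root_node_dirty_init(). A stripped-down sketch of the claiming step only, with generic names (the real logic is in bch_dirty_init_thread() above):

#include <linux/spinlock.h>

/*
 * Shared between all workers; mirrors the key_idx/idx_lock pair in
 * struct bch_dirty_init_state.
 */
struct claim_state {
        spinlock_t idx_lock;
        int key_idx;            /* next unclaimed top-level key index */
};

/* Each worker takes one index at a time; no two workers scan the same subtree. */
static int claim_next_key_idx(struct claim_state *s)
{
        int idx;

        spin_lock(&s->idx_lock);
        idx = s->key_idx++;
        spin_unlock(&s->idx_lock);

        return idx;
}
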
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index c4ff760..3f1230e 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -16,6 +16,7 @@
#define BCH_AUTO_GC_DIRTY_THRESHOLD 50
+#define BCH_DIRTY_INIT_THRD_MAX 64
/*
* 14 (16384ths) is chosen here as something that each backing device
* should be a reasonable fraction of the share, and not to blow up
@@ -23,6 +24,24 @@
*/
#define WRITEBACK_SHARE_SHIFT 14
+struct bch_dirty_init_state;
+struct dirty_init_thrd_info {
+ struct bch_dirty_init_state *state;
+ struct task_struct *thread;
+};
+
+struct bch_dirty_init_state {
+ struct cache_set *c;
+ struct bcache_device *d;
+ int total_threads;
+ int key_idx;
+ spinlock_t idx_lock;
+ atomic_t started;
+ atomic_t enough;
+ wait_queue_head_t wait;
+ struct dirty_init_thrd_info infos[BCH_DIRTY_INIT_THRD_MAX];
+};
+
static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
{
uint64_t i, ret = 0;
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index b538989..1f8f98e 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -150,11 +150,10 @@ static int bio_detain(struct dm_bio_prison *prison,
struct dm_bio_prison_cell **cell_result)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -198,11 +197,9 @@ void dm_cell_release(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell,
struct bio_list *bios)
{
- unsigned long flags;
-
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
__cell_release(prison, cell, bios);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
}
EXPORT_SYMBOL_GPL(dm_cell_release);
@@ -250,12 +247,10 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
void *context,
struct dm_bio_prison_cell *cell)
{
- unsigned long flags;
-
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
visit_fn(context, cell);
rb_erase(&cell->node, &prison->cells);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
}
EXPORT_SYMBOL_GPL(dm_cell_visit_release);
@@ -275,11 +270,10 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __promote_or_release(prison, cell);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -379,10 +373,9 @@ EXPORT_SYMBOL_GPL(dm_deferred_entry_dec);
int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
{
int r = 1;
- unsigned long flags;
unsigned next_entry;
- spin_lock_irqsave(&ds->lock, flags);
+ spin_lock_irq(&ds->lock);
if ((ds->sweeper == ds->current_entry) &&
!ds->entries[ds->current_entry].count)
r = 0;
@@ -392,7 +385,7 @@ int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
if (!ds->entries[next_entry].count)
ds->current_entry = next_entry;
}
- spin_unlock_irqrestore(&ds->lock, flags);
+ spin_unlock_irq(&ds->lock);
return r;
}
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index b092cdc..9dec3b6 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -177,11 +177,10 @@ bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 **cell_result)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -261,11 +260,10 @@ int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 **cell_result)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __lock(prison, key, lock_level, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -285,11 +283,9 @@ void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
struct work_struct *continuation)
{
- unsigned long flags;
-
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
__quiesce(prison, cell, continuation);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
}
EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2);
@@ -309,11 +305,10 @@ int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison,
unsigned new_lock_level)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __promote(prison, cell, new_lock_level);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -329,7 +324,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison,
bio_list_init(&cell->bios);
if (cell->shared_count) {
- cell->exclusive_lock = 0;
+ cell->exclusive_lock = false;
return false;
}
@@ -342,11 +337,10 @@ bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison,
struct bio_list *bios)
{
bool r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __unlock(prison, cell, bios);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
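
The dm-bio-prison-v1.c and dm-bio-prison-v2.c hunks are a mechanical conversion: these prison operations are only entered from process context, so the flag-saving spin_lock_irqsave()/spin_unlock_irqrestore() pair can be replaced by spin_lock_irq()/spin_unlock_irq(), which simply disable and re-enable interrupts. A minimal sketch of the two forms (illustration only; the _irq form assumes the lock is never taken with interrupts already disabled):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);

static void take_lock_any_context(void)
{
        unsigned long flags;

        spin_lock_irqsave(&example_lock, flags);        /* preserves caller's IRQ state */
        /* ... critical section ... */
        spin_unlock_irqrestore(&example_lock, flags);
}

static void take_lock_process_context(void)
{
        spin_lock_irq(&example_lock);   /* unconditionally re-enables IRQs on unlock */
        /* ... critical section ... */
        spin_unlock_irq(&example_lock);
}
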
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index e8c37d9..50f3e67 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -108,7 +108,10 @@ struct dm_bufio_client {
int async_write_error;
struct list_head client_list;
+
struct shrinker shrinker;
+ struct work_struct shrink_work;
+ atomic_long_t need_shrink;
};
/*
@@ -256,12 +259,35 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
if (b->block == block)
return b;
- n = (b->block < block) ? n->rb_left : n->rb_right;
+ n = block < b->block ? n->rb_left : n->rb_right;
}
return NULL;
}
+static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
+{
+ struct rb_node *n = c->buffer_tree.rb_node;
+ struct dm_buffer *b;
+ struct dm_buffer *best = NULL;
+
+ while (n) {
+ b = container_of(n, struct dm_buffer, node);
+
+ if (b->block == block)
+ return b;
+
+ if (block <= b->block) {
+ n = n->rb_left;
+ best = b;
+ } else {
+ n = n->rb_right;
+ }
+ }
+
+ return best;
+}
+
static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
{
struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
@@ -276,8 +302,8 @@ static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
}
parent = *new;
- new = (found->block < b->block) ?
- &((*new)->rb_left) : &((*new)->rb_right);
+ new = b->block < found->block ?
+ &found->node.rb_left : &found->node.rb_right;
}
rb_link_node(&b->node, parent, new);
@@ -400,13 +426,13 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
*/
if (gfp_mask & __GFP_NORETRY) {
unsigned noio_flag = memalloc_noio_save();
- void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+ void *ptr = __vmalloc(c->block_size, gfp_mask);
memalloc_noio_restore(noio_flag);
return ptr;
}
- return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+ return __vmalloc(c->block_size, gfp_mask);
}
/*
@@ -631,6 +657,19 @@ static void use_bio(struct dm_buffer *b, int rw, sector_t sector,
submit_bio(bio);
}
+static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
+{
+ sector_t sector;
+
+ if (likely(c->sectors_per_block_bits >= 0))
+ sector = block << c->sectors_per_block_bits;
+ else
+ sector = block * (c->block_size >> SECTOR_SHIFT);
+ sector += c->start;
+
+ return sector;
+}
+
static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t))
{
unsigned n_sectors;
@@ -639,11 +678,7 @@ static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buff
b->end_io = end_io;
- if (likely(b->c->sectors_per_block_bits >= 0))
- sector = b->block << b->c->sectors_per_block_bits;
- else
- sector = b->block * (b->c->block_size >> SECTOR_SHIFT);
- sector += b->c->start;
+ sector = block_to_sector(b->c, b->block);
if (rw != REQ_OP_WRITE) {
n_sectors = b->c->block_size >> SECTOR_SHIFT;
@@ -1326,6 +1361,30 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c)
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
/*
+ * Use dm-io to send a discard request to the device.
+ */
+int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
+{
+ struct dm_io_request io_req = {
+ .bi_op = REQ_OP_DISCARD,
+ .bi_op_flags = REQ_SYNC,
+ .mem.type = DM_IO_KMEM,
+ .mem.ptr.addr = NULL,
+ .client = c->dm_io,
+ };
+ struct dm_io_region io_reg = {
+ .bdev = c->bdev,
+ .sector = block_to_sector(c, block),
+ .count = block_to_sector(c, count),
+ };
+
+ BUG_ON(dm_bufio_in_request());
+
+ return dm_io(&io_req, 1, &io_reg, NULL);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
+
+/*
* We first delete any other buffer that may be at that new location.
*
* Then, we write the buffer to the original location if it was dirty.
@@ -1401,6 +1460,14 @@ void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);
+static void forget_buffer_locked(struct dm_buffer *b)
+{
+ if (likely(!b->hold_count) && likely(!b->state)) {
+ __unlink_buffer(b);
+ __free_buffer_wake(b);
+ }
+}
+
/*
* Free the given buffer.
*
@@ -1414,15 +1481,36 @@ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
dm_bufio_lock(c);
b = __find(c, block);
- if (b && likely(!b->hold_count) && likely(!b->state)) {
- __unlink_buffer(b);
- __free_buffer_wake(b);
- }
+ if (b)
+ forget_buffer_locked(b);
dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_forget);
+void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
+{
+ struct dm_buffer *b;
+ sector_t end_block = block + n_blocks;
+
+ while (block < end_block) {
+ dm_bufio_lock(c);
+
+ b = __find_next(c, block);
+ if (b) {
+ block = b->block + 1;
+ forget_buffer_locked(b);
+ }
+
+ dm_bufio_unlock(c);
+
+ if (!b)
+ break;
+ }
+
+}
+EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
+
void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
{
c->minimum_buffers = n;
@@ -1559,8 +1647,7 @@ static unsigned long get_retain_buffers(struct dm_bufio_client *c)
return retain_bytes;
}
-static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
- gfp_t gfp_mask)
+static void __scan(struct dm_bufio_client *c)
{
int l;
struct dm_buffer *b, *tmp;
@@ -1571,42 +1658,58 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
for (l = 0; l < LIST_SIZE; l++) {
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
- if (__try_evict_buffer(b, gfp_mask))
+ if (count - freed <= retain_target)
+ atomic_long_set(&c->need_shrink, 0);
+ if (!atomic_long_read(&c->need_shrink))
+ return;
+ if (__try_evict_buffer(b, GFP_KERNEL)) {
+ atomic_long_dec(&c->need_shrink);
freed++;
- if (!--nr_to_scan || ((count - freed) <= retain_target))
- return freed;
+ }
cond_resched();
}
}
- return freed;
}
-static unsigned long
-dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+static void shrink_work(struct work_struct *w)
+{
+ struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
+
+ dm_bufio_lock(c);
+ __scan(c);
+ dm_bufio_unlock(c);
+}
+
+static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct dm_bufio_client *c;
- unsigned long freed;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_FS)
- dm_bufio_lock(c);
- else if (!dm_bufio_trylock(c))
- return SHRINK_STOP;
+ atomic_long_add(sc->nr_to_scan, &c->need_shrink);
+ queue_work(dm_bufio_wq, &c->shrink_work);
- freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
- dm_bufio_unlock(c);
- return freed;
+ return sc->nr_to_scan;
}
-static unsigned long
-dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
READ_ONCE(c->n_buffers[LIST_DIRTY]);
unsigned long retain_target = get_retain_buffers(c);
+ unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
- return (count < retain_target) ? 0 : (count - retain_target);
+ if (unlikely(count < retain_target))
+ count = 0;
+ else
+ count -= retain_target;
+
+ if (unlikely(count < queued_for_cleanup))
+ count = 0;
+ else
+ count -= queued_for_cleanup;
+
+ return count;
}
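As a worked illustration of the new accounting (the numbers are invented): with 10,000 reclaimable buffers, a retain target of 3,000 and 2,000 evictions already queued in need_shrink, dm_bufio_shrink_count() now reports 5,000, so the shrinker core does not re-request work that the deferred shrink_work will already perform.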
/*
@@ -1697,6 +1800,9 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
__free_buffer_wake(b);
}
+ INIT_WORK(&c->shrink_work, shrink_work);
+ atomic_long_set(&c->need_shrink, 0);
+
c->shrinker.count_objects = dm_bufio_shrink_count;
c->shrinker.scan_objects = dm_bufio_shrink_scan;
c->shrinker.seeks = 1;
@@ -1742,6 +1848,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
drop_buffers(c);
unregister_shrinker(&c->shrinker);
+ flush_work(&c->shrink_work);
mutex_lock(&dm_bufio_clients_lock);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index f595e98..4bc453f 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -74,22 +74,19 @@ static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
bool r;
- unsigned long flags;
- spin_lock_irqsave(&iot->lock, flags);
+ spin_lock_irq(&iot->lock);
r = __iot_idle_for(iot, jifs);
- spin_unlock_irqrestore(&iot->lock, flags);
+ spin_unlock_irq(&iot->lock);
return r;
}
static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
- unsigned long flags;
-
- spin_lock_irqsave(&iot->lock, flags);
+ spin_lock_irq(&iot->lock);
iot->in_flight += len;
- spin_unlock_irqrestore(&iot->lock, flags);
+ spin_unlock_irq(&iot->lock);
}
static void __iot_io_end(struct io_tracker *iot, sector_t len)
@@ -172,7 +169,6 @@ static void __commit(struct work_struct *_ws)
{
struct batcher *b = container_of(_ws, struct batcher, commit_work);
blk_status_t r;
- unsigned long flags;
struct list_head work_items;
struct work_struct *ws, *tmp;
struct continuation *k;
@@ -186,12 +182,12 @@ static void __commit(struct work_struct *_ws)
* We have to grab these before the commit_op to avoid a race
* condition.
*/
- spin_lock_irqsave(&b->lock, flags);
+ spin_lock_irq(&b->lock);
list_splice_init(&b->work_items, &work_items);
bio_list_merge(&bios, &b->bios);
bio_list_init(&b->bios);
b->commit_scheduled = false;
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_unlock_irq(&b->lock);
r = b->commit_op(b->commit_context);
@@ -238,13 +234,12 @@ static void async_commit(struct batcher *b)
static void continue_after_commit(struct batcher *b, struct continuation *k)
{
- unsigned long flags;
bool commit_scheduled;
- spin_lock_irqsave(&b->lock, flags);
+ spin_lock_irq(&b->lock);
commit_scheduled = b->commit_scheduled;
list_add_tail(&k->ws.entry, &b->work_items);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_unlock_irq(&b->lock);
if (commit_scheduled)
async_commit(b);
@@ -255,13 +250,12 @@ static void continue_after_commit(struct batcher *b, struct continuation *k)
*/
static void issue_after_commit(struct batcher *b, struct bio *bio)
{
- unsigned long flags;
bool commit_scheduled;
- spin_lock_irqsave(&b->lock, flags);
+ spin_lock_irq(&b->lock);
commit_scheduled = b->commit_scheduled;
bio_list_add(&b->bios, bio);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_unlock_irq(&b->lock);
if (commit_scheduled)
async_commit(b);
@@ -273,12 +267,11 @@ static void issue_after_commit(struct batcher *b, struct bio *bio)
static void schedule_commit(struct batcher *b)
{
bool immediate;
- unsigned long flags;
- spin_lock_irqsave(&b->lock, flags);
+ spin_lock_irq(&b->lock);
immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
b->commit_scheduled = true;
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_unlock_irq(&b->lock);
if (immediate)
async_commit(b);
@@ -428,8 +421,6 @@ struct cache {
struct rw_semaphore quiesce_lock;
- struct dm_target_callbacks callbacks;
-
/*
* origin_blocks entries, discarded if set.
*/
@@ -630,23 +621,19 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio)
static void defer_bio(struct cache *cache, struct bio *bio)
{
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
bio_list_add(&cache->deferred_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
wake_deferred_bio_worker(cache);
}
static void defer_bios(struct cache *cache, struct bio_list *bios)
{
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
bio_list_merge(&cache->deferred_bios, bios);
bio_list_init(bios);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
wake_deferred_bio_worker(cache);
}
@@ -725,10 +712,6 @@ static bool block_size_is_power_of_two(struct cache *cache)
return cache->sectors_per_block_shift >= 0;
}
-/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
-#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
-__always_inline
-#endif
static dm_block_t block_div(dm_block_t b, uint32_t n)
{
do_div(b, n);
@@ -756,33 +739,27 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
static void set_discard(struct cache *cache, dm_dblock_t b)
{
- unsigned long flags;
-
BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
atomic_inc(&cache->stats.discard_count);
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
set_bit(from_dblock(b), cache->discard_bitset);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
}
static void clear_discard(struct cache *cache, dm_dblock_t b)
{
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
clear_bit(from_dblock(b), cache->discard_bitset);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
}
static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
int r;
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
r = test_bit(from_dblock(b), cache->discard_bitset);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
return r;
}
@@ -790,12 +767,10 @@ static bool is_discarded(struct cache *cache, dm_dblock_t b)
static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
int r;
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
cache->discard_bitset);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
return r;
}
@@ -827,17 +802,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
- unsigned long flags;
struct per_bio_data *pb;
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
bio_op(bio) != REQ_OP_DISCARD) {
pb = get_per_bio_data(bio);
pb->tick = true;
cache->need_tick_bio = false;
}
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
}
static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
@@ -906,7 +880,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio)
static void accounted_request(struct cache *cache, struct bio *bio)
{
accounted_begin(cache, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
static void issue_op(struct bio *bio, void *context)
@@ -947,7 +921,7 @@ static enum cache_metadata_mode get_cache_mode(struct cache *cache)
static const char *cache_device_name(struct cache *cache)
{
- return dm_device_name(dm_table_get_md(cache->ti->table));
+ return dm_table_device_name(cache->ti->table);
}
static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
@@ -1812,7 +1786,7 @@ static bool process_bio(struct cache *cache, struct bio *bio)
bool commit_needed;
if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
- generic_make_request(bio);
+ submit_bio_noacct(bio);
return commit_needed;
}
@@ -1878,7 +1852,7 @@ static bool process_discard_bio(struct cache *cache, struct bio *bio)
if (cache->features.discard_passdown) {
remap_to_origin(cache, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
} else
bio_endio(bio);
@@ -1889,17 +1863,16 @@ static void process_deferred_bios(struct work_struct *ws)
{
struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
- unsigned long flags;
bool commit_needed = false;
struct bio_list bios;
struct bio *bio;
bio_list_init(&bios);
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
bio_list_merge(&bios, &cache->deferred_bios);
bio_list_init(&cache->deferred_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
while ((bio = bio_list_pop(&bios))) {
if (bio->bi_opf & REQ_PREFLUSH)
@@ -2444,20 +2417,6 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size)
cache->cache_size = size;
}
-static int is_congested(struct dm_dev *dev, int bdi_bits)
-{
- struct request_queue *q = bdev_get_queue(dev->bdev);
- return bdi_congested(q->backing_dev_info, bdi_bits);
-}
-
-static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
-{
- struct cache *cache = container_of(cb, struct cache, callbacks);
-
- return is_congested(cache->origin_dev, bdi_bits) ||
- is_congested(cache->cache_dev, bdi_bits);
-}
-
#define DEFAULT_MIGRATION_THRESHOLD 2048
static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2492,9 +2451,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
goto bad;
}
- cache->callbacks.congested_fn = cache_is_congested;
- dm_table_add_target_callbacks(ti->table, &cache->callbacks);
-
cache->metadata_dev = ca->metadata_dev;
cache->origin_dev = ca->origin_dev;
cache->cache_dev = ca->cache_dev;
@@ -3513,7 +3469,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {2, 1, 0},
+ .version = {2, 2, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index eb7a5d3..bdb255e 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -68,7 +68,6 @@ struct hash_table_bucket;
struct clone {
struct dm_target *ti;
- struct dm_target_callbacks callbacks;
struct dm_dev *metadata_dev;
struct dm_dev *dest_dev;
@@ -330,7 +329,7 @@ static void submit_bios(struct bio_list *bios)
blk_start_plug(&plug);
while ((bio = bio_list_pop(bios)))
- generic_make_request(bio);
+ submit_bio_noacct(bio);
blk_finish_plug(&plug);
}
@@ -346,7 +345,7 @@ static void submit_bios(struct bio_list *bios)
static void issue_bio(struct clone *clone, struct bio *bio)
{
if (!bio_triggers_commit(clone, bio)) {
- generic_make_request(bio);
+ submit_bio_noacct(bio);
return;
}
@@ -473,7 +472,7 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ
bio_region_range(clone, bio, &rs, &nr_regions);
trim_bio(bio, region_to_sector(clone, rs),
nr_regions << clone->region_shift);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
} else
bio_endio(bio);
}
@@ -574,6 +573,12 @@ struct hash_table_bucket {
#define bucket_unlock_irqrestore(bucket, flags) \
spin_unlock_irqrestore(&(bucket)->lock, flags)
+#define bucket_lock_irq(bucket) \
+ spin_lock_irq(&(bucket)->lock)
+
+#define bucket_unlock_irq(bucket) \
+ spin_unlock_irq(&(bucket)->lock)
+
static int hash_table_init(struct clone *clone)
{
unsigned int i, sz;
@@ -859,7 +864,7 @@ static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio
bio->bi_private = hd;
atomic_inc(&hd->clone->hydrations_in_flight);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
/*
@@ -874,7 +879,6 @@ static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio
*/
static void hydrate_bio_region(struct clone *clone, struct bio *bio)
{
- unsigned long flags;
unsigned long region_nr;
struct hash_table_bucket *bucket;
struct dm_clone_region_hydration *hd, *hd2;
@@ -882,19 +886,19 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
region_nr = bio_to_region(clone, bio);
bucket = get_hash_table_bucket(clone, region_nr);
- bucket_lock_irqsave(bucket, flags);
+ bucket_lock_irq(bucket);
hd = __hash_find(bucket, region_nr);
if (hd) {
/* Someone else is hydrating the region */
bio_list_add(&hd->deferred_bios, bio);
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
return;
}
if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
/* The region has been hydrated */
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
issue_bio(clone, bio);
return;
}
@@ -903,16 +907,16 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
* We must allocate a hydration descriptor and start the hydration of
* the corresponding region.
*/
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
hd = alloc_hydration(clone);
hydration_init(hd, region_nr);
- bucket_lock_irqsave(bucket, flags);
+ bucket_lock_irq(bucket);
/* Check if the region has been hydrated in the meantime. */
if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
free_hydration(hd);
issue_bio(clone, bio);
return;
@@ -922,7 +926,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
if (hd2 != hd) {
/* Someone else started the region's hydration. */
bio_list_add(&hd2->deferred_bios, bio);
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
free_hydration(hd);
return;
}
@@ -934,7 +938,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
*/
if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
hlist_del(&hd->h);
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
free_hydration(hd);
bio_io_error(bio);
return;
@@ -948,11 +952,11 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
* to the destination device.
*/
if (is_overwrite_bio(clone, bio)) {
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
hydration_overwrite(hd, bio);
} else {
bio_list_add(&hd->deferred_bios, bio);
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
hydration_copy(hd, 1);
}
}
@@ -1019,7 +1023,6 @@ static unsigned long __start_next_hydration(struct clone *clone,
unsigned long offset,
struct batch_info *batch)
{
- unsigned long flags;
struct hash_table_bucket *bucket;
struct dm_clone_region_hydration *hd;
unsigned long nr_regions = clone->nr_regions;
@@ -1033,13 +1036,13 @@ static unsigned long __start_next_hydration(struct clone *clone,
break;
bucket = get_hash_table_bucket(clone, offset);
- bucket_lock_irqsave(bucket, flags);
+ bucket_lock_irq(bucket);
if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
!__hash_find(bucket, offset)) {
hydration_init(hd, offset);
__insert_region_hydration(bucket, hd);
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
/* Batch hydration */
__batch_hydration(batch, hd);
@@ -1047,7 +1050,7 @@ static unsigned long __start_next_hydration(struct clone *clone,
return (offset + 1);
}
- bucket_unlock_irqrestore(bucket, flags);
+ bucket_unlock_irq(bucket);
} while (++offset < nr_regions);
@@ -1277,7 +1280,7 @@ static void process_deferred_flush_bios(struct clone *clone)
*/
bio_endio(bio);
} else {
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
}
@@ -1514,18 +1517,6 @@ static void clone_status(struct dm_target *ti, status_type_t type,
DMEMIT("Error");
}
-static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
-{
- struct request_queue *dest_q, *source_q;
- struct clone *clone = container_of(cb, struct clone, callbacks);
-
- source_q = bdev_get_queue(clone->source_dev->bdev);
- dest_q = bdev_get_queue(clone->dest_dev->bdev);
-
- return (bdi_congested(dest_q->backing_dev_info, bdi_bits) |
- bdi_congested(source_q->backing_dev_info, bdi_bits));
-}
-
static sector_t get_dev_size(struct dm_dev *dev)
{
return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
@@ -1926,8 +1917,6 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto out_with_mempool;
mutex_init(&clone->commit_lock);
- clone->callbacks.congested_fn = clone_is_congested;
- dm_table_add_target_callbacks(ti->table, &clone->callbacks);
/* Enable flushes */
ti->num_flush_bios = 1;
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 3fea121..3db92d9 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -11,6 +11,7 @@
#include <linux/kthread.h>
#include <linux/ktime.h>
+#include <linux/genhd.h>
#include <linux/blk-mq.h>
#include <trace/events/block.h>
@@ -25,9 +26,11 @@ struct dm_kobject_holder {
};
/*
- * DM core internal structure that used directly by dm.c and dm-rq.c
- * DM targets must _not_ deference a mapped_device to directly access its members!
+ * DM core internal structures used directly by dm.c, dm-rq.c and dm-table.c.
+ * DM targets must _not_ dereference a mapped_device or dm_table to directly
+ * access their members!
*/
+
struct mapped_device {
struct mutex suspend_lock;
@@ -123,6 +126,55 @@ void disable_discard(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);
+static inline sector_t dm_get_size(struct mapped_device *md)
+{
+ return get_capacity(md->disk);
+}
+
+static inline struct dm_stats *dm_get_stats(struct mapped_device *md)
+{
+ return &md->stats;
+}
+
+#define DM_TABLE_MAX_DEPTH 16
+
+struct dm_table {
+ struct mapped_device *md;
+ enum dm_queue_mode type;
+
+ /* btree table */
+ unsigned int depth;
+ unsigned int counts[DM_TABLE_MAX_DEPTH]; /* in nodes */
+ sector_t *index[DM_TABLE_MAX_DEPTH];
+
+ unsigned int num_targets;
+ unsigned int num_allocated;
+ sector_t *highs;
+ struct dm_target *targets;
+
+ struct target_type *immutable_target_type;
+
+ bool integrity_supported:1;
+ bool singleton:1;
+ unsigned integrity_added:1;
+
+ /*
+ * Indicates the rw permissions for the new logical
+ * device. This should be a combination of FMODE_READ
+ * and FMODE_WRITE.
+ */
+ fmode_t mode;
+
+ /* a list of devices used by this table */
+ struct list_head devices;
+
+ /* events get handed up using this callback */
+ void (*event_fn)(void *);
+ void *event_context;
+
+ struct dm_md_mempools *mempools;
+};
+
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 571c04e..2aa4acd 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,8 +1,8 @@
/*
* Copyright (C) 2003 Jana Saout <jana@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved.
- * Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com>
+ * Copyright (C) 2006-2020 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2013-2020 Milan Broz <gmazyland@gmail.com>
*
* This file is released under the GPL.
*/
@@ -34,7 +34,9 @@
#include <crypto/aead.h>
#include <crypto/authenc.h>
#include <linux/rtnetlink.h> /* for struct rtattr and RTA macros only */
+#include <linux/key-type.h>
#include <keys/user-type.h>
+#include <keys/encrypted-type.h>
#include <linux/device-mapper.h>
@@ -67,6 +69,7 @@ struct dm_crypt_io {
u8 *integrity_metadata;
bool integrity_metadata_from_pool;
struct work_struct work;
+ struct tasklet_struct tasklet;
struct convert_context ctx;
@@ -115,16 +118,24 @@ struct iv_tcw_private {
u8 *whitening;
};
+#define ELEPHANT_MAX_KEY_SIZE 32
+struct iv_elephant_private {
+ struct crypto_skcipher *tfm;
+};
+
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
*/
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
- DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
+ DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD,
+ DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE,
+ DM_CRYPT_WRITE_INLINE };
enum cipher_flags {
CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */
CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
+ CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */
};
/*
@@ -152,6 +163,7 @@ struct crypt_config {
struct iv_benbi_private benbi;
struct iv_lmk_private lmk;
struct iv_tcw_private tcw;
+ struct iv_elephant_private elephant;
} iv_gen_private;
u64 iv_offset;
unsigned int iv_size;
@@ -205,7 +217,7 @@ struct crypt_config {
struct mutex bio_alloc_lock;
u8 *authenc_key; /* space for keys in authenc() format (if used) */
- u8 key[0];
+ u8 key[];
};
#define MIN_IOS 64
@@ -223,6 +235,8 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
struct scatterlist *sg);
+static bool crypt_integrity_aead(struct crypt_config *cc);
+
/*
* Use this to access cipher attributes that are independent of the key.
*/
@@ -285,6 +299,11 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
* eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode)
* The IV is encrypted little-endian byte-offset (with the same key
* and cipher as the volume).
+ *
+ * elephant: The extended version of eboiv with additional Elephant diffuser
+ * used with Bitlocker CBC mode.
+ * This mode was used in older Windows systems
+ * https://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf
*/
static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
@@ -334,7 +353,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
unsigned bs;
int log;
- if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
+ if (crypt_integrity_aead(cc))
bs = crypto_aead_blocksize(any_tfm_aead(cc));
else
bs = crypto_skcipher_blocksize(any_tfm(cc));
@@ -391,7 +410,7 @@ static void crypt_iv_lmk_dtr(struct crypt_config *cc)
crypto_free_shash(lmk->hash_tfm);
lmk->hash_tfm = NULL;
- kzfree(lmk->seed);
+ kfree_sensitive(lmk->seed);
lmk->seed = NULL;
}
@@ -405,7 +424,8 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
return -EINVAL;
}
- lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
+ lmk->hash_tfm = crypto_alloc_shash("md5", 0,
+ CRYPTO_ALG_ALLOCATES_MEMORY);
if (IS_ERR(lmk->hash_tfm)) {
ti->error = "Error initializing LMK hash";
return PTR_ERR(lmk->hash_tfm);
@@ -542,9 +562,9 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc)
{
struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
- kzfree(tcw->iv_seed);
+ kfree_sensitive(tcw->iv_seed);
tcw->iv_seed = NULL;
- kzfree(tcw->whitening);
+ kfree_sensitive(tcw->whitening);
tcw->whitening = NULL;
if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
@@ -567,7 +587,8 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
return -EINVAL;
}
- tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0);
+ tcw->crc32_tfm = crypto_alloc_shash("crc32", 0,
+ CRYPTO_ALG_ALLOCATES_MEMORY);
if (IS_ERR(tcw->crc32_tfm)) {
ti->error = "Error initializing CRC32 in TCW";
return PTR_ERR(tcw->crc32_tfm);
@@ -700,7 +721,7 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
- if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) {
+ if (crypt_integrity_aead(cc)) {
ti->error = "AEAD transforms not supported for EBOIV";
return -EINVAL;
}
@@ -740,6 +761,291 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
return err;
}
+static void crypt_iv_elephant_dtr(struct crypt_config *cc)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+
+ crypto_free_skcipher(elephant->tfm);
+ elephant->tfm = NULL;
+}
+
+static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ int r;
+
+ elephant->tfm = crypto_alloc_skcipher("ecb(aes)", 0,
+ CRYPTO_ALG_ALLOCATES_MEMORY);
+ if (IS_ERR(elephant->tfm)) {
+ r = PTR_ERR(elephant->tfm);
+ elephant->tfm = NULL;
+ return r;
+ }
+
+ r = crypt_iv_eboiv_ctr(cc, ti, NULL);
+ if (r)
+ crypt_iv_elephant_dtr(cc);
+ return r;
+}
+
+static void diffuser_disk_to_cpu(u32 *d, size_t n)
+{
+#ifndef __LITTLE_ENDIAN
+ int i;
+
+ for (i = 0; i < n; i++)
+ d[i] = le32_to_cpu((__le32)d[i]);
+#endif
+}
+
+static void diffuser_cpu_to_disk(__le32 *d, size_t n)
+{
+#ifndef __LITTLE_ENDIAN
+ int i;
+
+ for (i = 0; i < n; i++)
+ d[i] = cpu_to_le32((u32)d[i]);
+#endif
+}
+
+static void diffuser_a_decrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 5; i++) {
+ i1 = 0;
+ i2 = n - 2;
+ i3 = n - 5;
+
+ while (i1 < (n - 1)) {
+ d[i1] += d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
+ i1++; i2++; i3++;
+
+ if (i3 >= n)
+ i3 -= n;
+
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+
+ if (i2 >= n)
+ i2 -= n;
+
+ d[i1] += d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
+ i1++; i2++; i3++;
+
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+ }
+ }
+}
+
+static void diffuser_a_encrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 5; i++) {
+ i1 = n - 1;
+ i2 = n - 2 - 1;
+ i3 = n - 5 - 1;
+
+ while (i1 > 0) {
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+
+ d[i1] -= d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
+ i1--; i2--; i3--;
+
+ if (i2 < 0)
+ i2 += n;
+
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+
+ if (i3 < 0)
+ i3 += n;
+
+ d[i1] -= d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
+ i1--; i2--; i3--;
+ }
+ }
+}
+
+static void diffuser_b_decrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 3; i++) {
+ i1 = 0;
+ i2 = 2;
+ i3 = 5;
+
+ while (i1 < (n - 1)) {
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+
+ d[i1] += d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
+ i1++; i2++; i3++;
+
+ if (i2 >= n)
+ i2 -= n;
+
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+
+ if (i3 >= n)
+ i3 -= n;
+
+ d[i1] += d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
+ i1++; i2++; i3++;
+ }
+ }
+}
+
+static void diffuser_b_encrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 3; i++) {
+ i1 = n - 1;
+ i2 = 2 - 1;
+ i3 = 5 - 1;
+
+ while (i1 > 0) {
+ d[i1] -= d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
+ i1--; i2--; i3--;
+
+ if (i3 < 0)
+ i3 += n;
+
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+
+ if (i2 < 0)
+ i2 += n;
+
+ d[i1] -= d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
+ i1--; i2--; i3--;
+
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+ }
+ }
+}
+
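A side note on the shift pairs used throughout the diffusers above: every expression of the form x << s | x >> (32 - s) is an open-coded 32-bit left rotation (equivalent to the kernel's rol32() from <linux/bitops.h>); the encrypt variants simply walk the same index schedule in reverse and subtract what the decrypt variants add. A minimal sketch, purely for illustration:

	/* Illustration only: the rotation the diffusers open-code. */
	static inline u32 example_rol32(u32 word, unsigned int shift)
	{
		return (word << shift) | (word >> (32 - shift));
	}

	/* e.g. d[i3] << 9 | d[i3] >> 23 above equals example_rol32(d[i3], 9). */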
+static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ u8 *es, *ks, *data, *data2, *data_offset;
+ struct skcipher_request *req;
+ struct scatterlist *sg, *sg2, src, dst;
+ DECLARE_CRYPTO_WAIT(wait);
+ int i, r;
+
+ req = skcipher_request_alloc(elephant->tfm, GFP_NOIO);
+ es = kzalloc(16, GFP_NOIO); /* Key for AES */
+ ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */
+
+ if (!req || !es || !ks) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ *(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
+
+ /* E(Ks, e(s)) */
+ sg_init_one(&src, es, 16);
+ sg_init_one(&dst, ks, 16);
+ skcipher_request_set_crypt(req, &src, &dst, 16, NULL);
+ skcipher_request_set_callback(req, 0, crypto_req_done, &wait);
+ r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+ if (r)
+ goto out;
+
+ /* E(Ks, e'(s)) */
+ es[15] = 0x80;
+ sg_init_one(&dst, &ks[16], 16);
+ r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+ if (r)
+ goto out;
+
+ sg = crypt_get_sg_data(cc, dmreq->sg_out);
+ data = kmap_atomic(sg_page(sg));
+ data_offset = data + sg->offset;
+
+ /* Cannot modify original bio, copy to sg_out and apply Elephant to it */
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ sg2 = crypt_get_sg_data(cc, dmreq->sg_in);
+ data2 = kmap_atomic(sg_page(sg2));
+ memcpy(data_offset, data2 + sg2->offset, cc->sector_size);
+ kunmap_atomic(data2);
+ }
+
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
+ diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_b_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_a_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
+ }
+
+ for (i = 0; i < (cc->sector_size / 32); i++)
+ crypto_xor(data_offset + i * 32, ks, 32);
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_a_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_b_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
+ }
+
+ kunmap_atomic(data);
+out:
+ kfree_sensitive(ks);
+ kfree_sensitive(es);
+ skcipher_request_free(req);
+ return r;
+}
+
+static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ int r;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ r = crypt_iv_elephant(cc, dmreq);
+ if (r)
+ return r;
+ }
+
+ return crypt_iv_eboiv_gen(cc, iv, dmreq);
+}
+
+static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
+ return crypt_iv_elephant(cc, dmreq);
+
+ return 0;
+}
+
+static int crypt_iv_elephant_init(struct crypt_config *cc)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ int key_offset = cc->key_size - cc->key_extra_size;
+
+ return crypto_skcipher_setkey(elephant->tfm, &cc->key[key_offset], cc->key_extra_size);
+}
+
+static int crypt_iv_elephant_wipe(struct crypt_config *cc)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ u8 key[ELEPHANT_MAX_KEY_SIZE];
+
+ memset(key, 0, cc->key_extra_size);
+ return crypto_skcipher_setkey(elephant->tfm, key, cc->key_extra_size);
+}
+
static const struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -793,6 +1099,15 @@ static struct crypt_iv_operations crypt_iv_eboiv_ops = {
.generator = crypt_iv_eboiv_gen
};
+static struct crypt_iv_operations crypt_iv_elephant_ops = {
+ .ctr = crypt_iv_elephant_ctr,
+ .dtr = crypt_iv_elephant_dtr,
+ .init = crypt_iv_elephant_init,
+ .wipe = crypt_iv_elephant_wipe,
+ .generator = crypt_iv_elephant_gen,
+ .post = crypt_iv_elephant_post
+};
+
/*
* Integrity extensions
*/
@@ -1109,6 +1424,9 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
if (r < 0)
return r;
+ /* Data can be already preprocessed in generator */
+ if (test_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags))
+ sg_in = sg_out;
/* Store generated IV in integrity metadata */
if (cc->integrity_iv_size)
memcpy(tag_iv, org_iv, cc->integrity_iv_size);
@@ -1136,13 +1454,16 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
static void kcryptd_async_done(struct crypto_async_request *async_req,
int error);
-static void crypt_alloc_req_skcipher(struct crypt_config *cc,
+static int crypt_alloc_req_skcipher(struct crypt_config *cc,
struct convert_context *ctx)
{
unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
- if (!ctx->r.req)
- ctx->r.req = mempool_alloc(&cc->req_pool, GFP_NOIO);
+ if (!ctx->r.req) {
+ ctx->r.req = mempool_alloc(&cc->req_pool, in_interrupt() ? GFP_ATOMIC : GFP_NOIO);
+ if (!ctx->r.req)
+ return -ENOMEM;
+ }
skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]);
@@ -1153,13 +1474,18 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc,
skcipher_request_set_callback(ctx->r.req,
CRYPTO_TFM_REQ_MAY_BACKLOG,
kcryptd_async_done, dmreq_of_req(cc, ctx->r.req));
+
+ return 0;
}
-static void crypt_alloc_req_aead(struct crypt_config *cc,
+static int crypt_alloc_req_aead(struct crypt_config *cc,
struct convert_context *ctx)
{
- if (!ctx->r.req_aead)
- ctx->r.req_aead = mempool_alloc(&cc->req_pool, GFP_NOIO);
+ if (!ctx->r.req_aead) {
+ ctx->r.req_aead = mempool_alloc(&cc->req_pool, in_interrupt() ? GFP_ATOMIC : GFP_NOIO);
+ if (!ctx->r.req_aead)
+ return -ENOMEM;
+ }
aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]);
@@ -1170,15 +1496,17 @@ static void crypt_alloc_req_aead(struct crypt_config *cc,
aead_request_set_callback(ctx->r.req_aead,
CRYPTO_TFM_REQ_MAY_BACKLOG,
kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead));
+
+ return 0;
}
-static void crypt_alloc_req(struct crypt_config *cc,
+static int crypt_alloc_req(struct crypt_config *cc,
struct convert_context *ctx)
{
if (crypt_integrity_aead(cc))
- crypt_alloc_req_aead(cc, ctx);
+ return crypt_alloc_req_aead(cc, ctx);
else
- crypt_alloc_req_skcipher(cc, ctx);
+ return crypt_alloc_req_skcipher(cc, ctx);
}
static void crypt_free_req_skcipher(struct crypt_config *cc,
@@ -1211,17 +1539,28 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_
* Encrypt / decrypt data from one bio to another one (can be the same one)
*/
static blk_status_t crypt_convert(struct crypt_config *cc,
- struct convert_context *ctx)
+ struct convert_context *ctx, bool atomic, bool reset_pending)
{
unsigned int tag_offset = 0;
unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
int r;
- atomic_set(&ctx->cc_pending, 1);
+ /*
+ * if reset_pending is set we are dealing with the bio for the first time,
+ * else we're continuing to work on the previous bio, so don't mess with
+ * the cc_pending counter
+ */
+ if (reset_pending)
+ atomic_set(&ctx->cc_pending, 1);
while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
- crypt_alloc_req(cc, ctx);
+ r = crypt_alloc_req(cc, ctx);
+ if (r) {
+ complete(&ctx->restart);
+ return BLK_STS_DEV_RESOURCE;
+ }
+
atomic_inc(&ctx->cc_pending);
if (crypt_integrity_aead(cc))
@@ -1235,9 +1574,27 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
* but the driver request queue is full, let's wait.
*/
case -EBUSY:
- wait_for_completion(&ctx->restart);
+ if (in_interrupt()) {
+ if (try_wait_for_completion(&ctx->restart)) {
+ /*
+ * we don't have to block to wait for completion,
+ * so proceed
+ */
+ } else {
+ /*
+ * we can't wait for completion without blocking
+ * exit and continue processing in a workqueue
+ */
+ ctx->r.req = NULL;
+ ctx->cc_sector += sector_step;
+ tag_offset++;
+ return BLK_STS_DEV_RESOURCE;
+ }
+ } else {
+ wait_for_completion(&ctx->restart);
+ }
reinit_completion(&ctx->restart);
- /* fall through */
+ fallthrough;
/*
* The request is queued and processed asynchronously,
* completion function kcryptd_async_done() will be called.
@@ -1254,7 +1611,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
atomic_dec(&ctx->cc_pending);
ctx->cc_sector += sector_step;
tag_offset++;
- cond_resched();
+ if (!atomic)
+ cond_resched();
continue;
/*
* There was a data integrity error.
@@ -1372,6 +1730,12 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
atomic_inc(&io->io_pending);
}
+static void kcryptd_io_bio_endio(struct work_struct *work)
+{
+ struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+ bio_endio(io->base_bio);
+}
+
/*
* One of the bios was finished. Check for completion of
* the whole request and correctly clean up the buffer.
@@ -1394,7 +1758,23 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
kfree(io->integrity_metadata);
base_bio->bi_status = error;
- bio_endio(base_bio);
+
+ /*
+ * If we are running this function from our tasklet,
+ * we can't call bio_endio() here, because it will call
+ * clone_endio() from dm.c, which in turn will
+ * free the current struct dm_crypt_io structure with
+ * our tasklet. In this case we need to delay bio_endio()
+ * execution to after the tasklet is done and dequeued.
+ */
+ if (tasklet_trylock(&io->tasklet)) {
+ tasklet_unlock(&io->tasklet);
+ bio_endio(base_bio);
+ return;
+ }
+
+ INIT_WORK(&io->work, kcryptd_io_bio_endio);
+ queue_work(cc->io_queue, &io->work);
}
/*
@@ -1477,7 +1857,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
return 1;
}
- generic_make_request(clone);
+ submit_bio_noacct(clone);
return 0;
}
@@ -1503,7 +1883,7 @@ static void kcryptd_io_write(struct dm_crypt_io *io)
{
struct bio *clone = io->ctx.bio_out;
- generic_make_request(clone);
+ submit_bio_noacct(clone);
}
#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node)
@@ -1580,8 +1960,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
clone->bi_iter.bi_sector = cc->start + io->sector;
- if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) {
- generic_make_request(clone);
+ if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) ||
+ test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) {
+ submit_bio_noacct(clone);
return;
}
@@ -1603,9 +1984,63 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
spin_unlock_irqrestore(&cc->write_thread_lock, flags);
}
+static bool kcryptd_crypt_write_inline(struct crypt_config *cc,
+ struct convert_context *ctx)
+
+{
+ if (!test_bit(DM_CRYPT_WRITE_INLINE, &cc->flags))
+ return false;
+
+ /*
+ * Note: zone append writes (REQ_OP_ZONE_APPEND) do not have ordering
+ * constraints so they do not need to be issued inline by
+ * kcryptd_crypt_write_convert().
+ */
+ switch (bio_op(ctx->bio_in)) {
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE_ZEROES:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void kcryptd_crypt_write_continue(struct work_struct *work)
+{
+ struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+ struct crypt_config *cc = io->cc;
+ struct convert_context *ctx = &io->ctx;
+ int crypt_finished;
+ sector_t sector = io->sector;
+ blk_status_t r;
+
+ wait_for_completion(&ctx->restart);
+ reinit_completion(&ctx->restart);
+
+ r = crypt_convert(cc, &io->ctx, true, false);
+ if (r)
+ io->error = r;
+ crypt_finished = atomic_dec_and_test(&ctx->cc_pending);
+ if (!crypt_finished && kcryptd_crypt_write_inline(cc, ctx)) {
+ /* Wait for completion signaled by kcryptd_async_done() */
+ wait_for_completion(&ctx->restart);
+ crypt_finished = 1;
+ }
+
+ /* Encryption was already finished, submit io now */
+ if (crypt_finished) {
+ kcryptd_crypt_write_io_submit(io, 0);
+ io->sector = sector;
+ }
+
+ crypt_dec_pending(io);
+}
+
static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
+ struct convert_context *ctx = &io->ctx;
struct bio *clone;
int crypt_finished;
sector_t sector = io->sector;
@@ -1615,7 +2050,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
* Prevent io from disappearing until this function completes.
*/
crypt_inc_pending(io);
- crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
+ crypt_convert_init(cc, ctx, NULL, io->base_bio, sector);
clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
if (unlikely(!clone)) {
@@ -1629,10 +2064,26 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
sector += bio_sectors(clone);
crypt_inc_pending(io);
- r = crypt_convert(cc, &io->ctx);
+ r = crypt_convert(cc, ctx,
+ test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags), true);
+ /*
+ * Crypto API backlogged the request, because its queue was full
+ * and we're in softirq context, so continue from a workqueue
+ * (TODO: is it actually possible to be in softirq in the write path?)
+ */
+ if (r == BLK_STS_DEV_RESOURCE) {
+ INIT_WORK(&io->work, kcryptd_crypt_write_continue);
+ queue_work(cc->crypt_queue, &io->work);
+ return;
+ }
if (r)
io->error = r;
- crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
+ crypt_finished = atomic_dec_and_test(&ctx->cc_pending);
+ if (!crypt_finished && kcryptd_crypt_write_inline(cc, ctx)) {
+ /* Wait for completion signaled by kcryptd_async_done() */
+ wait_for_completion(&ctx->restart);
+ crypt_finished = 1;
+ }
/* Encryption was already finished, submit io now */
if (crypt_finished) {
@@ -1649,6 +2100,25 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
crypt_dec_pending(io);
}
+static void kcryptd_crypt_read_continue(struct work_struct *work)
+{
+ struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+ struct crypt_config *cc = io->cc;
+ blk_status_t r;
+
+ wait_for_completion(&io->ctx.restart);
+ reinit_completion(&io->ctx.restart);
+
+ r = crypt_convert(cc, &io->ctx, true, false);
+ if (r)
+ io->error = r;
+
+ if (atomic_dec_and_test(&io->ctx.cc_pending))
+ kcryptd_crypt_read_done(io);
+
+ crypt_dec_pending(io);
+}
+
static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
@@ -1659,7 +2129,17 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
io->sector);
- r = crypt_convert(cc, &io->ctx);
+ r = crypt_convert(cc, &io->ctx,
+ test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true);
+ /*
+ * Crypto API backlogged the request, because its queue was full
+ * and we're in softirq context, so continue from a workqueue
+ */
+ if (r == BLK_STS_DEV_RESOURCE) {
+ INIT_WORK(&io->work, kcryptd_crypt_read_continue);
+ queue_work(cc->crypt_queue, &io->work);
+ return;
+ }
if (r)
io->error = r;
@@ -1703,10 +2183,21 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
if (!atomic_dec_and_test(&ctx->cc_pending))
return;
- if (bio_data_dir(io->base_bio) == READ)
+ /*
+ * The request is fully completed: for inline writes, let
+ * kcryptd_crypt_write_convert() do the IO submission.
+ */
+ if (bio_data_dir(io->base_bio) == READ) {
kcryptd_crypt_read_done(io);
- else
- kcryptd_crypt_write_io_submit(io, 1);
+ return;
+ }
+
+ if (kcryptd_crypt_write_inline(cc, ctx)) {
+ complete(&ctx->restart);
+ return;
+ }
+
+ kcryptd_crypt_write_io_submit(io, 1);
}
static void kcryptd_crypt(struct work_struct *work)
@@ -1719,10 +2210,32 @@ static void kcryptd_crypt(struct work_struct *work)
kcryptd_crypt_write_convert(io);
}
+static void kcryptd_crypt_tasklet(unsigned long work)
+{
+ kcryptd_crypt((struct work_struct *)work);
+}
+
static void kcryptd_queue_crypt(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
+ if ((bio_data_dir(io->base_bio) == READ && test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) ||
+ (bio_data_dir(io->base_bio) == WRITE && test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))) {
+ /*
+ * in_irq(): Crypto API's skcipher_walk_first() refuses to work in hard IRQ context.
+ * irqs_disabled(): the kernel may run some IO completion from the idle thread, but
+ * it is being executed with irqs disabled.
+ */
+ if (in_irq() || irqs_disabled()) {
+ tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work);
+ tasklet_schedule(&io->tasklet);
+ return;
+ }
+
+ kcryptd_crypt(&io->work);
+ return;
+ }
+
INIT_WORK(&io->work, kcryptd_crypt);
queue_work(cc->crypt_queue, &io->work);
}
@@ -1778,7 +2291,8 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode)
return -ENOMEM;
for (i = 0; i < cc->tfms_count; i++) {
- cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
+ cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0,
+ CRYPTO_ALG_ALLOCATES_MEMORY);
if (IS_ERR(cc->cipher_tfm.tfms[i])) {
err = PTR_ERR(cc->cipher_tfm.tfms[i]);
crypt_free_tfms(cc);
@@ -1804,7 +2318,8 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode)
if (!cc->cipher_tfm.tfms)
return -ENOMEM;
- cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, 0);
+ cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0,
+ CRYPTO_ALG_ALLOCATES_MEMORY);
if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) {
err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]);
crypt_free_tfms(cc);
@@ -1905,12 +2420,47 @@ static bool contains_whitespace(const char *str)
return false;
}
+static int set_key_user(struct crypt_config *cc, struct key *key)
+{
+ const struct user_key_payload *ukp;
+
+ ukp = user_key_payload_locked(key);
+ if (!ukp)
+ return -EKEYREVOKED;
+
+ if (cc->key_size != ukp->datalen)
+ return -EINVAL;
+
+ memcpy(cc->key, ukp->data, cc->key_size);
+
+ return 0;
+}
+
+#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE)
+static int set_key_encrypted(struct crypt_config *cc, struct key *key)
+{
+ const struct encrypted_key_payload *ekp;
+
+ ekp = key->payload.data[0];
+ if (!ekp)
+ return -EKEYREVOKED;
+
+ if (cc->key_size != ekp->decrypted_datalen)
+ return -EINVAL;
+
+ memcpy(cc->key, ekp->decrypted_data, cc->key_size);
+
+ return 0;
+}
+#endif /* CONFIG_ENCRYPTED_KEYS */
+
static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string)
{
char *new_key_string, *key_desc;
int ret;
+ struct key_type *type;
struct key *key;
- const struct user_key_payload *ukp;
+ int (*set_key)(struct crypt_config *cc, struct key *key);
/*
* Reject key_string with whitespace. dm core currently lacks code for
@@ -1926,40 +2476,41 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
if (!key_desc || key_desc == key_string || !strlen(key_desc + 1))
return -EINVAL;
- if (strncmp(key_string, "logon:", key_desc - key_string + 1) &&
- strncmp(key_string, "user:", key_desc - key_string + 1))
+ if (!strncmp(key_string, "logon:", key_desc - key_string + 1)) {
+ type = &key_type_logon;
+ set_key = set_key_user;
+ } else if (!strncmp(key_string, "user:", key_desc - key_string + 1)) {
+ type = &key_type_user;
+ set_key = set_key_user;
+#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE)
+ } else if (!strncmp(key_string, "encrypted:", key_desc - key_string + 1)) {
+ type = &key_type_encrypted;
+ set_key = set_key_encrypted;
+#endif
+ } else {
return -EINVAL;
+ }
new_key_string = kstrdup(key_string, GFP_KERNEL);
if (!new_key_string)
return -ENOMEM;
- key = request_key(key_string[0] == 'l' ? &key_type_logon : &key_type_user,
- key_desc + 1, NULL);
+ key = request_key(type, key_desc + 1, NULL);
if (IS_ERR(key)) {
- kzfree(new_key_string);
+ kfree_sensitive(new_key_string);
return PTR_ERR(key);
}
down_read(&key->sem);
- ukp = user_key_payload_locked(key);
- if (!ukp) {
+ ret = set_key(cc, key);
+ if (ret < 0) {
up_read(&key->sem);
key_put(key);
- kzfree(new_key_string);
- return -EKEYREVOKED;
+ kfree_sensitive(new_key_string);
+ return ret;
}
- if (cc->key_size != ukp->datalen) {
- up_read(&key->sem);
- key_put(key);
- kzfree(new_key_string);
- return -EINVAL;
- }
-
- memcpy(cc->key, ukp->data, cc->key_size);
-
up_read(&key->sem);
key_put(key);
@@ -1970,10 +2521,10 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
if (!ret) {
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
- kzfree(cc->key_string);
+ kfree_sensitive(cc->key_string);
cc->key_string = new_key_string;
} else
- kzfree(new_key_string);
+ kfree_sensitive(new_key_string);
return ret;
}
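For illustration (the key description below is hypothetical, not taken from this series): with the dispatch above, the <key> field of a crypt table can now reference an encrypted-type kernel key in addition to logon and user keys, using the same :<key_size>:<key_type>:<key_description> form documented for the constructor, e.g. a 64-byte volume key kept in the keyring could be given as

	:64:encrypted:dmcrypt-master-key

provided a key of that description has already been loaded into the kernel keyring.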
@@ -2013,7 +2564,7 @@ static int get_key_size(char **key_string)
return (*key_string[0] == ':') ? -EINVAL : strlen(*key_string) >> 1;
}
-#endif
+#endif /* CONFIG_KEYS */
static int crypt_set_key(struct crypt_config *cc, char *key)
{
@@ -2034,7 +2585,7 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
/* wipe references to any kernel keyring key */
- kzfree(cc->key_string);
+ kfree_sensitive(cc->key_string);
cc->key_string = NULL;
/* Decode key from its hex representation. */
@@ -2066,7 +2617,7 @@ static int crypt_wipe_key(struct crypt_config *cc)
return r;
}
- kzfree(cc->key_string);
+ kfree_sensitive(cc->key_string);
cc->key_string = NULL;
r = crypt_setkey(cc);
memset(&cc->key, 0, cc->key_size * sizeof(u8));
@@ -2150,15 +2701,15 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->dev)
dm_put_device(ti, cc->dev);
- kzfree(cc->cipher_string);
- kzfree(cc->key_string);
- kzfree(cc->cipher_auth);
- kzfree(cc->authenc_key);
+ kfree_sensitive(cc->cipher_string);
+ kfree_sensitive(cc->key_string);
+ kfree_sensitive(cc->cipher_auth);
+ kfree_sensitive(cc->authenc_key);
mutex_destroy(&cc->bio_alloc_lock);
/* Must zero key material before freeing */
- kzfree(cc);
+ kfree_sensitive(cc);
spin_lock(&dm_crypt_clients_lock);
WARN_ON(!dm_crypt_clients_n);
@@ -2202,7 +2753,14 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
cc->iv_gen_ops = &crypt_iv_null_ops;
else if (strcmp(ivmode, "eboiv") == 0)
cc->iv_gen_ops = &crypt_iv_eboiv_ops;
- else if (strcmp(ivmode, "lmk") == 0) {
+ else if (strcmp(ivmode, "elephant") == 0) {
+ cc->iv_gen_ops = &crypt_iv_elephant_ops;
+ cc->key_parts = 2;
+ cc->key_extra_size = cc->key_size / 2;
+ if (cc->key_extra_size > ELEPHANT_MAX_KEY_SIZE)
+ return -EINVAL;
+ set_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags);
+ } else if (strcmp(ivmode, "lmk") == 0) {
cc->iv_gen_ops = &crypt_iv_lmk_ops;
/*
* Version 2 and 3 is recognised according
@@ -2253,7 +2811,7 @@ static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api)
return -ENOMEM;
strncpy(mac_alg, start, end - start);
- mac = crypto_alloc_ahash(mac_alg, 0, 0);
+ mac = crypto_alloc_ahash(mac_alg, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
kfree(mac_alg);
if (IS_ERR(mac))
@@ -2488,7 +3046,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
struct crypt_config *cc = ti->private;
struct dm_arg_set as;
static const struct dm_arg _args[] = {
- {0, 6, "Invalid number of feature args"},
+ {0, 8, "Invalid number of feature args"},
};
unsigned int opt_params, val;
const char *opt_string, *sval;
@@ -2518,6 +3076,10 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ else if (!strcasecmp(opt_string, "no_read_workqueue"))
+ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+ else if (!strcasecmp(opt_string, "no_write_workqueue"))
+ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
else if (sscanf(opt_string, "integrity:%u:", &val) == 1) {
if (val == 0 || val > MAX_TAG_SIZE) {
ti->error = "Invalid integrity arguments";
@@ -2558,6 +3120,21 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
return 0;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+
+static int crypt_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
+{
+ struct crypt_config *cc = ti->private;
+ sector_t sector = cc->start + dm_target_offset(ti, args->next_sector);
+
+ args->start = cc->start;
+ return blkdev_report_zones(cc->dev->bdev, sector, nr_zones,
+ dm_report_zones_cb, args);
+}
+
+#endif
+
/*
* Construct an encryption mapping:
* <cipher> [<key>|:<key_size>:<user|logon>:<key_description>] <iv_offset> <dev_path> <start>
@@ -2691,6 +3268,16 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
cc->start = tmpll;
+ /*
+ * For zoned block devices, we need to preserve the issuer write
+ * ordering. To do so, disable write workqueues and force inline
+ * encryption completion.
+ */
+ if (bdev_is_zoned(cc->dev->bdev)) {
+ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+ set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags);
+ }
+
if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
ret = crypt_integrity_ctr(cc, ti);
if (ret)
@@ -2847,6 +3434,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
num_feature_args += !!ti->num_discard_bios;
num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+ num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT);
num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
if (cc->on_disk_tag_size)
@@ -2859,6 +3448,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
DMEMIT(" same_cpu_crypt");
if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
DMEMIT(" submit_from_crypt_cpus");
+ if (test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags))
+ DMEMIT(" no_read_workqueue");
+ if (test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))
+ DMEMIT(" no_write_workqueue");
if (cc->on_disk_tag_size)
DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth);
if (cc->sector_size != (1 << SECTOR_SHIFT))
@@ -2971,10 +3564,14 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 19, 0},
+ .version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
+#ifdef CONFIG_BLK_DEV_ZONED
+ .features = DM_TARGET_ZONED_HM,
+ .report_zones = crypt_report_zones,
+#endif
.map = crypt_map,
.status = crypt_status,
.postsuspend = crypt_postsuspend,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f496213..2628a83 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -72,7 +72,7 @@ static void flush_bios(struct bio *bio)
while (bio) {
n = bio->bi_next;
bio->bi_next = NULL;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = n;
}
}
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
index 8288887..072ea91 100644
--- a/drivers/md/dm-dust.c
+++ b/drivers/md/dm-dust.c
@@ -17,6 +17,7 @@
struct badblock {
struct rb_node node;
sector_t bb;
+ unsigned char wr_fail_cnt;
};
struct dust_device {
@@ -101,7 +102,8 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block)
return 0;
}
-static int dust_add_block(struct dust_device *dd, unsigned long long block)
+static int dust_add_block(struct dust_device *dd, unsigned long long block,
+ unsigned char wr_fail_cnt)
{
struct badblock *bblock;
unsigned long flags;
@@ -115,6 +117,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block)
spin_lock_irqsave(&dd->dust_lock, flags);
bblock->bb = block;
+ bblock->wr_fail_cnt = wr_fail_cnt;
if (!dust_rb_insert(&dd->badblocklist, bblock)) {
if (!dd->quiet_mode) {
DMERR("%s: block %llu already in badblocklist",
@@ -126,27 +129,31 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block)
}
dd->badblock_count++;
- if (!dd->quiet_mode)
- DMINFO("%s: badblock added at block %llu", __func__, block);
+ if (!dd->quiet_mode) {
+ DMINFO("%s: badblock added at block %llu with write fail count %hhu",
+ __func__, block, wr_fail_cnt);
+ }
spin_unlock_irqrestore(&dd->dust_lock, flags);
return 0;
}
-static int dust_query_block(struct dust_device *dd, unsigned long long block)
+static int dust_query_block(struct dust_device *dd, unsigned long long block, char *result,
+ unsigned int maxlen, unsigned int *sz_ptr)
{
struct badblock *bblock;
unsigned long flags;
+ unsigned int sz = *sz_ptr;
spin_lock_irqsave(&dd->dust_lock, flags);
bblock = dust_rb_search(&dd->badblocklist, block);
if (bblock != NULL)
- DMINFO("%s: block %llu found in badblocklist", __func__, block);
+ DMEMIT("%s: block %llu found in badblocklist", __func__, block);
else
- DMINFO("%s: block %llu not found in badblocklist", __func__, block);
+ DMEMIT("%s: block %llu not found in badblocklist", __func__, block);
spin_unlock_irqrestore(&dd->dust_lock, flags);
- return 0;
+ return 1;
}
static int __dust_map_read(struct dust_device *dd, sector_t thisblock)
@@ -163,22 +170,27 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb)
{
unsigned long flags;
- int ret = DM_MAPIO_REMAPPED;
+ int r = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
- ret = __dust_map_read(dd, thisblock);
+ r = __dust_map_read(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
}
- return ret;
+ return r;
}
-static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
+static int __dust_map_write(struct dust_device *dd, sector_t thisblock)
{
struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);
+ if (bblk && bblk->wr_fail_cnt > 0) {
+ bblk->wr_fail_cnt--;
+ return DM_MAPIO_KILL;
+ }
+
if (bblk) {
rb_erase(&bblk->node, &dd->badblocklist);
dd->badblock_count--;
@@ -189,37 +201,40 @@ static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
(unsigned long long)thisblock);
}
}
+
+ return DM_MAPIO_REMAPPED;
}
static int dust_map_write(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb)
{
unsigned long flags;
+ int r = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
- __dust_map_write(dd, thisblock);
+ r = __dust_map_write(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
}
- return DM_MAPIO_REMAPPED;
+ return r;
}
static int dust_map(struct dm_target *ti, struct bio *bio)
{
struct dust_device *dd = ti->private;
- int ret;
+ int r;
bio_set_dev(bio, dd->dev->bdev);
bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
if (bio_data_dir(bio) == READ)
- ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
+ r = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
else
- ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
+ r = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
- return ret;
+ return r;
}
static bool __dust_clear_badblocks(struct rb_root *tree,
@@ -246,11 +261,13 @@ static bool __dust_clear_badblocks(struct rb_root *tree,
return true;
}
-static int dust_clear_badblocks(struct dust_device *dd)
+static int dust_clear_badblocks(struct dust_device *dd, char *result, unsigned int maxlen,
+ unsigned int *sz_ptr)
{
unsigned long flags;
struct rb_root badblocklist;
unsigned long long badblock_count;
+ unsigned int sz = *sz_ptr;
spin_lock_irqsave(&dd->dust_lock, flags);
badblocklist = dd->badblocklist;
@@ -260,11 +277,36 @@ static int dust_clear_badblocks(struct dust_device *dd)
spin_unlock_irqrestore(&dd->dust_lock, flags);
if (!__dust_clear_badblocks(&badblocklist, badblock_count))
- DMINFO("%s: no badblocks found", __func__);
+ DMEMIT("%s: no badblocks found", __func__);
else
- DMINFO("%s: badblocks cleared", __func__);
+ DMEMIT("%s: badblocks cleared", __func__);
- return 0;
+ return 1;
+}
+
+static int dust_list_badblocks(struct dust_device *dd, char *result, unsigned int maxlen,
+ unsigned int *sz_ptr)
+{
+ unsigned long flags;
+ struct rb_root badblocklist;
+ struct rb_node *node;
+ struct badblock *bblk;
+ unsigned int sz = *sz_ptr;
+ unsigned long long num = 0;
+
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ badblocklist = dd->badblocklist;
+ for (node = rb_first(&badblocklist); node; node = rb_next(node)) {
+ bblk = rb_entry(node, struct badblock, node);
+ DMEMIT("%llu\n", bblk->bb);
+ num++;
+ }
+
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+ if (!num)
+ DMEMIT("No blocks in badblocklist");
+
+ return 1;
}
/*
@@ -370,14 +412,17 @@ static void dust_dtr(struct dm_target *ti)
}
static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
- char *result_buf, unsigned int maxlen)
+ char *result, unsigned int maxlen)
{
struct dust_device *dd = ti->private;
sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
bool invalid_msg = false;
- int result = -EINVAL;
+ int r = -EINVAL;
unsigned long long tmp, block;
+ unsigned char wr_fail_cnt;
+ unsigned int tmp_ui;
unsigned long flags;
+ unsigned int sz = 0;
char dummy;
if (argc == 1) {
@@ -388,45 +433,71 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
} else if (!strcasecmp(argv[0], "disable")) {
DMINFO("disabling read failures on bad sectors");
dd->fail_read_on_bb = false;
- result = 0;
+ r = 0;
} else if (!strcasecmp(argv[0], "enable")) {
DMINFO("enabling read failures on bad sectors");
dd->fail_read_on_bb = true;
- result = 0;
+ r = 0;
} else if (!strcasecmp(argv[0], "countbadblocks")) {
spin_lock_irqsave(&dd->dust_lock, flags);
- DMINFO("countbadblocks: %llu badblock(s) found",
+ DMEMIT("countbadblocks: %llu badblock(s) found",
dd->badblock_count);
spin_unlock_irqrestore(&dd->dust_lock, flags);
- result = 0;
+ r = 1;
} else if (!strcasecmp(argv[0], "clearbadblocks")) {
- result = dust_clear_badblocks(dd);
+ r = dust_clear_badblocks(dd, result, maxlen, &sz);
} else if (!strcasecmp(argv[0], "quiet")) {
if (!dd->quiet_mode)
dd->quiet_mode = true;
else
dd->quiet_mode = false;
- result = 0;
+ r = 0;
+ } else if (!strcasecmp(argv[0], "listbadblocks")) {
+ r = dust_list_badblocks(dd, result, maxlen, &sz);
} else {
invalid_msg = true;
}
} else if (argc == 2) {
if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
- return result;
+ return r;
block = tmp;
sector_div(size, dd->sect_per_block);
if (block > size) {
DMERR("selected block value out of range");
- return result;
+ return r;
}
if (!strcasecmp(argv[0], "addbadblock"))
- result = dust_add_block(dd, block);
+ r = dust_add_block(dd, block, 0);
else if (!strcasecmp(argv[0], "removebadblock"))
- result = dust_remove_block(dd, block);
+ r = dust_remove_block(dd, block);
else if (!strcasecmp(argv[0], "queryblock"))
- result = dust_query_block(dd, block);
+ r = dust_query_block(dd, block, result, maxlen, &sz);
+ else
+ invalid_msg = true;
+
+ } else if (argc == 3) {
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
+ return r;
+
+ if (sscanf(argv[2], "%u%c", &tmp_ui, &dummy) != 1)
+ return r;
+
+ block = tmp;
+ if (tmp_ui > 255) {
+ DMERR("selected write fail count out of range");
+ return r;
+ }
+ wr_fail_cnt = tmp_ui;
+ sector_div(size, dd->sect_per_block);
+ if (block > size) {
+ DMERR("selected block value out of range");
+ return r;
+ }
+
+ if (!strcasecmp(argv[0], "addbadblock"))
+ r = dust_add_block(dd, block, wr_fail_cnt);
else
invalid_msg = true;
@@ -436,7 +507,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
if (invalid_msg)
DMERR("unrecognized message '%s' received", argv[0]);
- return result;
+ return r;
}
static void dust_status(struct dm_target *ti, status_type_t type,
@@ -499,12 +570,12 @@ static struct target_type dust_target = {
static int __init dm_dust_init(void)
{
- int result = dm_register_target(&dust_target);
+ int r = dm_register_target(&dust_target);
- if (result < 0)
- DMERR("dm_register_target failed %d", result);
+ if (r < 0)
+ DMERR("dm_register_target failed %d", r);
- return result;
+ return r;
}
static void __exit dm_dust_exit(void)
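The dm-dust changes above extend addbadblock with an optional third argument, a per-block write-fail count, and route query/list/count output through DMEMIT into the message result buffer (returning 1 so the caller prints it); a block added via a message such as "addbadblock 60 2" now fails that many writes before being dropped from the list. The following standalone userspace sketch is not part of the patch; it only mimics the __dust_map_write() bookkeeping to show the countdown, with local stand-ins for the kernel structures and mapping codes.

#include <stdio.h>

#define DM_MAPIO_REMAPPED 1
#define DM_MAPIO_KILL     4

struct fake_badblock {
        unsigned long long bb;
        unsigned char wr_fail_cnt;
        int present;
};

/* Mirrors __dust_map_write(): fail while wr_fail_cnt > 0, then drop the entry. */
static int fake_map_write(struct fake_badblock *b)
{
        if (b->present && b->wr_fail_cnt > 0) {
                b->wr_fail_cnt--;
                return DM_MAPIO_KILL;
        }
        if (b->present)
                b->present = 0;         /* block removed from the badblock list */
        return DM_MAPIO_REMAPPED;
}

int main(void)
{
        /* e.g. "dmsetup message dust0 0 addbadblock 60 2" (device name is a placeholder) */
        struct fake_badblock b = { .bb = 60, .wr_fail_cnt = 2, .present = 1 };
        int i;

        for (i = 1; i <= 3; i++)
                printf("write %d -> %s\n", i,
                       fake_map_write(&b) == DM_MAPIO_KILL ? "failed" : "remapped");
        return 0;
}

The first two writes to block 60 are killed, the third succeeds and clears the entry, matching the behaviour of the new wr_fail_cnt field.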
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
new file mode 100644
index 0000000..cb85610
--- /dev/null
+++ b/drivers/md/dm-ebs-target.c
@@ -0,0 +1,471 @@
+/*
+ * Copyright (C) 2020 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ *
+ * Device-mapper target to emulate smaller logical block
+ * size on backing devices exposing (natively) larger ones.
+ *
+ * E.g. 512 byte sector emulation on 4K native disks.
+ */
+
+#include "dm.h"
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/dm-bufio.h>
+
+#define DM_MSG_PREFIX "ebs"
+
+static void ebs_dtr(struct dm_target *ti);
+
+/* Emulated block size context. */
+struct ebs_c {
+ struct dm_dev *dev; /* Underlying device to emulate block size on. */
+ struct dm_bufio_client *bufio; /* Use dm-bufio for read and read-modify-write processing. */
+ struct workqueue_struct *wq; /* Workqueue for ^ processing of bios. */
+ struct work_struct ws; /* Work item used for ^. */
+ struct bio_list bios_in; /* Worker bios input list. */
+ spinlock_t lock; /* Guard bios input list above. */
+ sector_t start; /* <start> table line argument, see ebs_ctr below. */
+ unsigned int e_bs; /* Emulated block size in sectors exposed to upper layer. */
+ unsigned int u_bs; /* Underlying block size in sectors retrieved from/set on lower layer device. */
+ unsigned char block_shift; /* bitshift sectors -> blocks used in dm-bufio API. */
+ bool u_bs_set:1; /* Flag to indicate underlying block size is set on table line. */
+};
+
+static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector)
+{
+ return sector >> ec->block_shift;
+}
+
+static inline sector_t __block_mod(sector_t sector, unsigned int bs)
+{
+ return sector & (bs - 1);
+}
+
+/* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */
+static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio)
+{
+ sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio);
+
+ return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0);
+}
+
+static inline bool __ebs_check_bs(unsigned int bs)
+{
+ return bs && is_power_of_2(bs);
+}
+
+/*
+ * READ/WRITE:
+ *
+ * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages.
+ */
+static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bvec_iter *iter)
+{
+ int r = 0;
+ unsigned char *ba, *pa;
+ unsigned int cur_len;
+ unsigned int bv_len = bv->bv_len;
+ unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs));
+ sector_t block = __sector_to_block(ec, iter->bi_sector);
+ struct dm_buffer *b;
+
+ if (unlikely(!bv->bv_page || !bv_len))
+ return -EIO;
+
+ pa = page_address(bv->bv_page) + bv->bv_offset;
+
+ /* Handle overlapping page <-> blocks */
+ while (bv_len) {
+ cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len);
+
+ /* Avoid reading for writes in case bio vector's page overwrites block completely. */
+ if (rw == READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio))
+ ba = dm_bufio_read(ec->bufio, block, &b);
+ else
+ ba = dm_bufio_new(ec->bufio, block, &b);
+
+ if (unlikely(IS_ERR(ba))) {
+ /*
+ * Carry on with next buffer, if any, to issue all possible
+ * data but return error.
+ */
+ r = PTR_ERR(ba);
+ } else {
+ /* Copy data to/from bio to buffer if read/new was successful above. */
+ ba += buf_off;
+ if (rw == READ) {
+ memcpy(pa, ba, cur_len);
+ flush_dcache_page(bv->bv_page);
+ } else {
+ flush_dcache_page(bv->bv_page);
+ memcpy(ba, pa, cur_len);
+ dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len);
+ }
+
+ dm_bufio_release(b);
+ }
+
+ pa += cur_len;
+ bv_len -= cur_len;
+ buf_off = 0;
+ block++;
+ }
+
+ return r;
+}
+
+/* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */
+static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio)
+{
+ int r = 0, rr;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ bio_for_each_bvec(bv, bio, iter) {
+ rr = __ebs_rw_bvec(ec, rw, &bv, &iter);
+ if (rr)
+ r = rr;
+ }
+
+ return r;
+}
+
+/*
+ * Discard bio's blocks, i.e. pass discards down.
+ *
+ * Avoid discarding partial blocks at beginning and end;
+ * return 0 in case no blocks can be discarded as a result.
+ */
+static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio)
+{
+ sector_t block, blocks, sector = bio->bi_iter.bi_sector;
+
+ block = __sector_to_block(ec, sector);
+ blocks = __nr_blocks(ec, bio);
+
+ /*
+ * Partial first underlying block (__nr_blocks() may have
+ * resulted in one block).
+ */
+ if (__block_mod(sector, ec->u_bs)) {
+ block++;
+ blocks--;
+ }
+
+ /* Partial last underlying block if any. */
+ if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs))
+ blocks--;
+
+ return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0;
+}
+
+/* Release the bio's blocks from the bufio cache. */
+static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio)
+{
+ sector_t blocks, sector = bio->bi_iter.bi_sector;
+
+ blocks = __nr_blocks(ec, bio);
+
+ dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks);
+}
+
+/* Worker function to process incoming bios. */
+static void __ebs_process_bios(struct work_struct *ws)
+{
+ int r;
+ bool write = false;
+ sector_t block1, block2;
+ struct ebs_c *ec = container_of(ws, struct ebs_c, ws);
+ struct bio *bio;
+ struct bio_list bios;
+
+ bio_list_init(&bios);
+
+ spin_lock_irq(&ec->lock);
+ bios = ec->bios_in;
+ bio_list_init(&ec->bios_in);
+ spin_unlock_irq(&ec->lock);
+
+ /* Prefetch all read and any mis-aligned write buffers */
+ bio_list_for_each(bio, &bios) {
+ block1 = __sector_to_block(ec, bio->bi_iter.bi_sector);
+ if (bio_op(bio) == REQ_OP_READ)
+ dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio));
+ else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) {
+ block2 = __sector_to_block(ec, bio_end_sector(bio));
+ if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs))
+ dm_bufio_prefetch(ec->bufio, block1, 1);
+ if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1)
+ dm_bufio_prefetch(ec->bufio, block2, 1);
+ }
+ }
+
+ bio_list_for_each(bio, &bios) {
+ r = -EIO;
+ if (bio_op(bio) == REQ_OP_READ)
+ r = __ebs_rw_bio(ec, READ, bio);
+ else if (bio_op(bio) == REQ_OP_WRITE) {
+ write = true;
+ r = __ebs_rw_bio(ec, WRITE, bio);
+ } else if (bio_op(bio) == REQ_OP_DISCARD) {
+ __ebs_forget_bio(ec, bio);
+ r = __ebs_discard_bio(ec, bio);
+ }
+
+ if (r < 0)
+ bio->bi_status = errno_to_blk_status(r);
+ }
+
+ /*
+ * We write dirty buffers after processing I/O on them
+ * but before we endio thus addressing REQ_FUA/REQ_SYNC.
+ */
+ r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0;
+
+ while ((bio = bio_list_pop(&bios))) {
+ /* Any other request is endioed. */
+ if (unlikely(r && bio_op(bio) == REQ_OP_WRITE))
+ bio_io_error(bio);
+ else
+ bio_endio(bio);
+ }
+}
+
+/*
+ * Construct an emulated block size mapping: <dev_path> <offset> <ebs> [<ubs>]
+ *
+ * <dev_path>: path of the underlying device
+ * <offset>: offset in 512-byte sectors into <dev_path>
+ * <ebs>: emulated block size in units of 512 bytes exposed to the upper layer
+ * [<ubs>]: underlying block size in units of 512 bytes imposed on the lower layer;
+ * optional, if not supplied, retrieve logical block size from underlying device
+ */
+static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ int r;
+ unsigned short tmp1;
+ unsigned long long tmp;
+ char dummy;
+ struct ebs_c *ec;
+
+ if (argc < 3 || argc > 4) {
+ ti->error = "Invalid argument count";
+ return -EINVAL;
+ }
+
+ ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL);
+ if (!ec) {
+ ti->error = "Cannot allocate ebs context";
+ return -ENOMEM;
+ }
+
+ r = -EINVAL;
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 ||
+ tmp != (sector_t)tmp ||
+ (sector_t)tmp >= ti->len) {
+ ti->error = "Invalid device offset sector";
+ goto bad;
+ }
+ ec->start = tmp;
+
+ if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 ||
+ !__ebs_check_bs(tmp1) ||
+ to_bytes(tmp1) > PAGE_SIZE) {
+ ti->error = "Invalid emulated block size";
+ goto bad;
+ }
+ ec->e_bs = tmp1;
+
+ if (argc > 3) {
+ if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) {
+ ti->error = "Invalid underlying block size";
+ goto bad;
+ }
+ ec->u_bs = tmp1;
+ ec->u_bs_set = true;
+ } else
+ ec->u_bs_set = false;
+
+ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev);
+ if (r) {
+ ti->error = "Device lookup failed";
+ ec->dev = NULL;
+ goto bad;
+ }
+
+ r = -EINVAL;
+ if (!ec->u_bs_set) {
+ ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev));
+ if (!__ebs_check_bs(ec->u_bs)) {
+ ti->error = "Invalid retrieved underlying block size";
+ goto bad;
+ }
+ }
+
+ if (!ec->u_bs_set && ec->e_bs == ec->u_bs)
+ DMINFO("Emulation superfluous: emulated equal to underlying block size");
+
+ if (__block_mod(ec->start, ec->u_bs)) {
+ ti->error = "Device offset must be multiple of underlying block size";
+ goto bad;
+ }
+
+ ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL);
+ if (IS_ERR(ec->bufio)) {
+ ti->error = "Cannot create dm bufio client";
+ r = PTR_ERR(ec->bufio);
+ ec->bufio = NULL;
+ goto bad;
+ }
+
+ ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+ if (!ec->wq) {
+ ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ ec->block_shift = __ffs(ec->u_bs);
+ INIT_WORK(&ec->ws, &__ebs_process_bios);
+ bio_list_init(&ec->bios_in);
+ spin_lock_init(&ec->lock);
+
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
+ ti->num_secure_erase_bios = 0;
+ ti->num_write_same_bios = 0;
+ ti->num_write_zeroes_bios = 0;
+ return 0;
+bad:
+ ebs_dtr(ti);
+ return r;
+}
+
+static void ebs_dtr(struct dm_target *ti)
+{
+ struct ebs_c *ec = ti->private;
+
+ if (ec->wq)
+ destroy_workqueue(ec->wq);
+ if (ec->bufio)
+ dm_bufio_client_destroy(ec->bufio);
+ if (ec->dev)
+ dm_put_device(ti, ec->dev);
+ kfree(ec);
+}
+
+static int ebs_map(struct dm_target *ti, struct bio *bio)
+{
+ struct ebs_c *ec = ti->private;
+
+ bio_set_dev(bio, ec->dev->bdev);
+ bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+ if (unlikely(bio_op(bio) == REQ_OP_FLUSH))
+ return DM_MAPIO_REMAPPED;
+ /*
+ * Only queue for bufio processing in case of partial or overlapping buffers
+ * -or-
+ * emulation with ebs == ubs aiming for tests of dm-bufio overhead.
+ */
+ if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) ||
+ __block_mod(bio_end_sector(bio), ec->u_bs) ||
+ ec->e_bs == ec->u_bs)) {
+ spin_lock_irq(&ec->lock);
+ bio_list_add(&ec->bios_in, bio);
+ spin_unlock_irq(&ec->lock);
+
+ queue_work(ec->wq, &ec->ws);
+
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /* Forget any buffer content relative to this direct backing device I/O. */
+ __ebs_forget_bio(ec, bio);
+
+ return DM_MAPIO_REMAPPED;
+}
+
+static void ebs_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct ebs_c *ec = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ *result = '\0';
+ break;
+ case STATUSTYPE_TABLE:
+ snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u",
+ ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs);
+ break;
+ }
+}
+
+static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+{
+ struct ebs_c *ec = ti->private;
+ struct dm_dev *dev = ec->dev;
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ *bdev = dev->bdev;
+ return !!(ec->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT);
+}
+
+static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct ebs_c *ec = ti->private;
+
+ limits->logical_block_size = to_bytes(ec->e_bs);
+ limits->physical_block_size = to_bytes(ec->u_bs);
+ limits->alignment_offset = limits->physical_block_size;
+ blk_limits_io_min(limits, limits->logical_block_size);
+}
+
+static int ebs_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct ebs_c *ec = ti->private;
+
+ return fn(ti, ec->dev, ec->start, ti->len, data);
+}
+
+static struct target_type ebs_target = {
+ .name = "ebs",
+ .version = {1, 0, 1},
+ .features = DM_TARGET_PASSES_INTEGRITY,
+ .module = THIS_MODULE,
+ .ctr = ebs_ctr,
+ .dtr = ebs_dtr,
+ .map = ebs_map,
+ .status = ebs_status,
+ .io_hints = ebs_io_hints,
+ .prepare_ioctl = ebs_prepare_ioctl,
+ .iterate_devices = ebs_iterate_devices,
+};
+
+static int __init dm_ebs_init(void)
+{
+ int r = dm_register_target(&ebs_target);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void dm_ebs_exit(void)
+{
+ dm_unregister_target(&ebs_target);
+}
+
+module_init(dm_ebs_init);
+module_exit(dm_ebs_exit);
+
+MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_DESCRIPTION(DM_NAME " emulated block size target");
+MODULE_LICENSE("GPL");
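The ebs target only queues a bio to its worker (and thus through dm-bufio read-modify-write) when the bio starts or ends inside an underlying block; otherwise ebs_map() remaps it straight to the backing device. The snippet below is a standalone illustration of that decision using the same __block_mod()/__nr_blocks() arithmetic, for 512-byte emulation (ebs = 1 sector) on a 4 KiB device (ubs = 8 sectors); it omits the ebs == ubs testing path that also forces worker processing, and the bios are made-up examples.

#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t block_mod(sector_t sector, unsigned int bs)
{
        return sector & (bs - 1);       /* bs is a power of two, as __ebs_check_bs() enforces */
}

/* Same accounting as __nr_blocks(): a misaligned head or tail adds a block. */
static unsigned int nr_blocks(sector_t start, unsigned int sectors,
                              unsigned int u_bs, unsigned char block_shift)
{
        sector_t end = block_mod(start, u_bs) + sectors;

        return (end >> block_shift) + (block_mod(end, u_bs) ? 1 : 0);
}

int main(void)
{
        const unsigned int u_bs = 8;    /* 4 KiB underlying block, in 512-byte sectors */
        const unsigned char shift = 3;  /* __ffs(8) */
        struct { sector_t start; unsigned int sectors; } bios[] = {
                { 3, 1 },       /* 512-byte write inside a block: needs bufio RMW   */
                { 16, 8 },      /* aligned 4 KiB write: remapped straight to device */
        };
        unsigned int i;

        for (i = 0; i < 2; i++) {
                int partial = block_mod(bios[i].start, u_bs) ||
                              block_mod(bios[i].start + bios[i].sectors, u_bs);

                printf("bio at sector %llu, %u sectors: %u block(s), %s\n",
                       bios[i].start, bios[i].sectors,
                       nr_blocks(bios[i].start, bios[i].sectors, u_bs, shift),
                       partial ? "queued for read-modify-write" : "remapped directly");
        }
        return 0;
}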
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 6b0b3a1..d9ac737 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1152,7 +1152,6 @@ static int metadata_get_stats(struct era_metadata *md, void *ptr)
struct era {
struct dm_target *ti;
- struct dm_target_callbacks callbacks;
struct dm_dev *metadata_dev;
struct dm_dev *origin_dev;
@@ -1290,7 +1289,7 @@ static void process_deferred_bios(struct era *era)
*/
if (commit_needed)
set_bit(get_block(era, bio), ws->bits);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
blk_finish_plug(&plug);
}
@@ -1403,18 +1402,6 @@ static void stop_worker(struct era *era)
/*----------------------------------------------------------------
* Target methods
*--------------------------------------------------------------*/
-static int dev_is_congested(struct dm_dev *dev, int bdi_bits)
-{
- struct request_queue *q = bdev_get_queue(dev->bdev);
- return bdi_congested(q->backing_dev_info, bdi_bits);
-}
-
-static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
-{
- struct era *era = container_of(cb, struct era, callbacks);
- return dev_is_congested(era->origin_dev, bdi_bits);
-}
-
static void era_destroy(struct era *era)
{
if (era->md)
@@ -1533,8 +1520,6 @@ static int era_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->flush_supported = true;
ti->num_discard_bios = 1;
- era->callbacks.congested_fn = era_is_congested;
- dm_table_add_target_callbacks(ti->table, &era->callbacks);
return 0;
}
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 2900fbd..a2cc9e4 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -280,7 +280,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
struct flakey_c *fc = ti->private;
bio_set_dev(bio, fc->dev->bdev);
- if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
+ if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio)))
bio->bi_iter.bi_sector =
flakey_map_sector(ti, bio->bi_iter.bi_sector);
}
@@ -322,8 +322,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
pb->bio_submitted = false;
- /* Do not fail reset zone */
- if (bio_op(bio) == REQ_OP_ZONE_RESET)
+ if (op_is_zone_mgmt(bio_op(bio)))
goto map_bio;
/* Are we alive ? */
@@ -384,7 +383,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
struct flakey_c *fc = ti->private;
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
- if (bio_op(bio) == REQ_OP_ZONE_RESET)
+ if (op_is_zone_mgmt(bio_op(bio)))
return DM_ENDIO_DONE;
if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
@@ -460,21 +459,15 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
}
#ifdef CONFIG_BLK_DEV_ZONED
-static int flakey_report_zones(struct dm_target *ti, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones)
+static int flakey_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
{
struct flakey_c *fc = ti->private;
- int ret;
+ sector_t sector = flakey_map_sector(ti, args->next_sector);
- /* Do report and remap it */
- ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector),
- zones, nr_zones);
- if (ret != 0)
- return ret;
-
- if (*nr_zones)
- dm_remap_zone_report(ti, fc->start, zones, nr_zones);
- return 0;
+ args->start = fc->start;
+ return blkdev_report_zones(fc->dev->bdev, sector, nr_zones,
+ dm_report_zones_cb, args);
}
#endif
diff --git a/drivers/md/dm-historical-service-time.c b/drivers/md/dm-historical-service-time.c
new file mode 100644
index 0000000..186f91e
--- /dev/null
+++ b/drivers/md/dm-historical-service-time.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Historical Service Time
+ *
+ * Keeps a time-weighted exponential moving average of the historical
+ * service time. Estimates future service time based on the historical
+ * service time and the number of outstanding requests.
+ *
+ * Marks paths stale if they have not finished within hst *
+ * num_paths. If a path is stale and unused, we will send a single
+ * request to probe in case the path has improved. This situation
+ * generally arises if the path is so much worse than others that it
+ * will never have the best estimated service time, or if the entire
+ * multipath device is unused. If a path is stale and in use, limit the
+ * number of requests it can receive with the assumption that the path
+ * has become degraded.
+ *
+ * To avoid repeatedly calculating exponents for time weighting, times
+ * are split into HST_WEIGHT_COUNT buckets each (1 << HST_BUCKET_SHIFT)
+ * ns, and the weighting is pre-calculated.
+ *
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+
+#define DM_MSG_PREFIX "multipath historical-service-time"
+#define HST_MIN_IO 1
+#define HST_VERSION "0.1.1"
+
+#define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */
+#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
+#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
+#define HST_FIXED_95 972
+#define HST_MAX_INFLIGHT HST_FIXED_1
+#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
+#define HST_WEIGHT_COUNT 64ULL
+
+struct selector {
+ struct list_head valid_paths;
+ struct list_head failed_paths;
+ int valid_count;
+ spinlock_t lock;
+
+ unsigned int weights[HST_WEIGHT_COUNT];
+ unsigned int threshold_multiplier;
+};
+
+struct path_info {
+ struct list_head list;
+ struct dm_path *path;
+ unsigned int repeat_count;
+
+ spinlock_t lock;
+
+ u64 historical_service_time; /* Fixed point */
+
+ u64 stale_after;
+ u64 last_finish;
+
+ u64 outstanding;
+};
+
+/**
+ * fixed_power - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ *
+ * (see: kernel/sched/loadavg.c)
+ */
+static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+ }
+
+ return result;
+}
+
+/*
+ * Calculate the next value of an exponential moving average
+ * a_1 = a_0 * e + a * (1 - e)
+ *
+ * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
+ * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
+ * @weight: [0, HST_FIXED_1]
+ *
+ * Note:
+ * To account for multiple periods in the same calculation,
+ * a_n = a_0 * e^n + a * (1 - e^n),
+ * so call fixed_ema(last, next, pow(weight, N))
+ */
+static u64 fixed_ema(u64 last, u64 next, u64 weight)
+{
+ last *= weight;
+ last += next * (HST_FIXED_1 - weight);
+ last += 1ULL << (HST_FIXED_SHIFT - 1);
+ return last >> HST_FIXED_SHIFT;
+}
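fixed_power() and fixed_ema() do all of this in 10-bit fixed point (HST_FIXED_SHIFT), and the weights table is later indexed by elapsed time in ~16.8 ms buckets (delta >> HST_BUCKET_SHIFT). As a worked check, the standalone sketch below uses lightly adapted copies of the two helpers: a weight of HST_FIXED_95 (972/1024, roughly 0.949) blends an old estimate of 2.0 with a new sample of 1.0 into 1996/1024, roughly 1.949, and fixed_power(972, 10, 2) gives 923, the weight to apply when two bucket periods elapse between samples.

#include <stdio.h>

typedef unsigned long long u64;

#define HST_FIXED_SHIFT 10
#define HST_FIXED_1     (1 << HST_FIXED_SHIFT)
#define HST_FIXED_95    972

/* Copies of the helpers above so the numbers can be reproduced in userspace. */
static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
{
        u64 result = 1ULL << frac_bits;

        while (n) {
                if (n & 1) {
                        result *= x;
                        result += 1ULL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1ULL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

static u64 fixed_ema(u64 last, u64 next, u64 weight)
{
        last *= weight;
        last += next * (HST_FIXED_1 - weight);
        last += 1ULL << (HST_FIXED_SHIFT - 1);
        return last >> HST_FIXED_SHIFT;
}

int main(void)
{
        u64 last = 2 * HST_FIXED_1;     /* old estimate: 2.0 */
        u64 next = 1 * HST_FIXED_1;     /* new sample:   1.0 */

        /* 0.949 * 2.0 + 0.051 * 1.0 = 1.949 -> 1996 in fixed point */
        printf("one step:  %llu\n", fixed_ema(last, next, HST_FIXED_95));
        /* weight after two idle buckets: 0.949^2 ~ 0.901 -> 923 */
        printf("two steps: %llu\n", fixed_power(HST_FIXED_95, HST_FIXED_SHIFT, 2));
        return 0;
}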
+
+static struct selector *alloc_selector(void)
+{
+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->failed_paths);
+ spin_lock_init(&s->lock);
+ s->valid_count = 0;
+ }
+
+ return s;
+}
+
+/*
+ * Get the weight for a given time span.
+ */
+static u64 hst_weight(struct path_selector *ps, u64 delta)
+{
+ struct selector *s = ps->context;
+ int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
+ HST_WEIGHT_COUNT - 1);
+
+ return s->weights[bucket];
+}
+
+/*
+ * Set up the weights array.
+ *
+ * weights[len-1] = 0
+ * weights[n] = base ^ (n + 1)
+ */
+static void hst_set_weights(struct path_selector *ps, unsigned int base)
+{
+ struct selector *s = ps->context;
+ int i;
+
+ if (base >= HST_FIXED_1)
+ return;
+
+ for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
+ s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
+ s->weights[HST_WEIGHT_COUNT - 1] = 0;
+}
+
+static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
+{
+ struct selector *s;
+ unsigned int base_weight = HST_FIXED_95;
+ unsigned int threshold_multiplier = 0;
+ char dummy;
+
+ /*
+ * Arguments: [<base_weight> [<threshold_multiplier>]]
+ * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
+ * value of 0 will completely ignore any history.
+ * If not given, default (HST_FIXED_95) is used.
+ * <threshold_multiplier>: Minimum threshold multiplier for paths to
+ * be considered different. That is, a path is
+ * considered different iff (p1 > N * p2) where p1
+ * is the path with higher service time. A threshold
+ * of 1 or 0 has no effect. Defaults to 0.
+ */
+ if (argc > 2)
+ return -EINVAL;
+
+ if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
+ base_weight >= HST_FIXED_1)) {
+ return -EINVAL;
+ }
+
+ if (argc > 1 && (sscanf(argv[1], "%u%c",
+ &threshold_multiplier, &dummy) != 1)) {
+ return -EINVAL;
+ }
+
+ s = alloc_selector();
+ if (!s)
+ return -ENOMEM;
+
+ ps->context = s;
+
+ hst_set_weights(ps, base_weight);
+ s->threshold_multiplier = threshold_multiplier;
+ return 0;
+}
+
+static void free_paths(struct list_head *paths)
+{
+ struct path_info *pi, *next;
+
+ list_for_each_entry_safe(pi, next, paths, list) {
+ list_del(&pi->list);
+ kfree(pi);
+ }
+}
+
+static void hst_destroy(struct path_selector *ps)
+{
+ struct selector *s = ps->context;
+
+ free_paths(&s->valid_paths);
+ free_paths(&s->failed_paths);
+ kfree(s);
+ ps->context = NULL;
+}
+
+static int hst_status(struct path_selector *ps, struct dm_path *path,
+ status_type_t type, char *result, unsigned int maxlen)
+{
+ unsigned int sz = 0;
+ struct path_info *pi;
+
+ if (!path) {
+ struct selector *s = ps->context;
+
+ DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
+ } else {
+ pi = path->pscontext;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%llu %llu %llu ", pi->historical_service_time,
+ pi->outstanding, pi->stale_after);
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("0 ");
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static int hst_add_path(struct path_selector *ps, struct dm_path *path,
+ int argc, char **argv, char **error)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi;
+ unsigned int repeat_count = HST_MIN_IO;
+ char dummy;
+ unsigned long flags;
+
+ /*
+ * Arguments: [<repeat_count>]
+ * <repeat_count>: The number of I/Os before switching path.
+ * If not given, default (HST_MIN_IO) is used.
+ */
+ if (argc > 1) {
+ *error = "historical-service-time ps: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
+ *error = "historical-service-time ps: invalid repeat count";
+ return -EINVAL;
+ }
+
+ /* allocate the path */
+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+ if (!pi) {
+ *error = "historical-service-time ps: Error allocating path context";
+ return -ENOMEM;
+ }
+
+ pi->path = path;
+ pi->repeat_count = repeat_count;
+
+ pi->historical_service_time = HST_FIXED_1;
+
+ spin_lock_init(&pi->lock);
+ pi->outstanding = 0;
+
+ pi->stale_after = 0;
+ pi->last_finish = 0;
+
+ path->pscontext = pi;
+
+ spin_lock_irqsave(&s->lock, flags);
+ list_add_tail(&pi->list, &s->valid_paths);
+ s->valid_count++;
+ spin_unlock_irqrestore(&s->lock, flags);
+
+ return 0;
+}
+
+static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = path->pscontext;
+ unsigned long flags;
+
+ spin_lock_irqsave(&s->lock, flags);
+ list_move(&pi->list, &s->failed_paths);
+ s->valid_count--;
+ spin_unlock_irqrestore(&s->lock, flags);
+}
+
+static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = path->pscontext;
+ unsigned long flags;
+
+ spin_lock_irqsave(&s->lock, flags);
+ list_move_tail(&pi->list, &s->valid_paths);
+ s->valid_count++;
+ spin_unlock_irqrestore(&s->lock, flags);
+
+ return 0;
+}
+
+static void hst_fill_compare(struct path_info *pi, u64 *hst,
+ u64 *out, u64 *stale)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pi->lock, flags);
+ *hst = pi->historical_service_time;
+ *out = pi->outstanding;
+ *stale = pi->stale_after;
+ spin_unlock_irqrestore(&pi->lock, flags);
+}
+
+/*
+ * Compare the estimated service time of 2 paths, pi1 and pi2,
+ * for the incoming I/O.
+ *
+ * Returns:
+ * < 0 : pi1 is better
+ * 0 : no difference between pi1 and pi2
+ * > 0 : pi2 is better
+ *
+ */
+static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
+ u64 time_now, struct path_selector *ps)
+{
+ struct selector *s = ps->context;
+ u64 hst1, hst2;
+ long long out1, out2, stale1, stale2;
+ int pi2_better, over_threshold;
+
+ hst_fill_compare(pi1, &hst1, &out1, &stale1);
+ hst_fill_compare(pi2, &hst2, &out2, &stale2);
+
+ /* Check here if the estimated latencies for the two paths are too similar.
+ * If this is the case, we skip extra calculation and just compare
+ * outstanding requests. In this case, any unloaded paths will
+ * be preferred.
+ */
+ if (hst1 > hst2)
+ over_threshold = hst1 > (s->threshold_multiplier * hst2);
+ else
+ over_threshold = hst2 > (s->threshold_multiplier * hst1);
+
+ if (!over_threshold)
+ return out1 - out2;
+
+ /*
+ * If an unloaded path is stale, choose it. If both paths are unloaded,
+ * choose path that is the most stale.
+ * (If one path is loaded, choose the other)
+ */
+ if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
+ (!out1 && !out2))
+ return (!out2 * stale1) - (!out1 * stale2);
+
+ /* Compare estimated service time. If outstanding is the same, we
+ * don't need to multiply
+ */
+ if (out1 == out2) {
+ pi2_better = hst1 > hst2;
+ } else {
+ /* Potential overflow with out >= 1024 */
+ if (unlikely(out1 >= HST_MAX_INFLIGHT ||
+ out2 >= HST_MAX_INFLIGHT)) {
+ /* If over 1023 in-flights, we may overflow if hst
+ * is at max. (With this shift we still overflow at
+ * 1048576 in-flights, which is high enough).
+ */
+ hst1 >>= HST_FIXED_SHIFT;
+ hst2 >>= HST_FIXED_SHIFT;
+ }
+ pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
+ }
+
+ /* In the case that the 'winner' is stale, limit to equal usage. */
+ if (pi2_better) {
+ if (stale2 < time_now)
+ return out1 - out2;
+ return 1;
+ }
+ if (stale1 < time_now)
+ return out1 - out2;
+ return -1;
+}
+
+static struct dm_path *hst_select_path(struct path_selector *ps,
+ size_t nr_bytes)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = NULL, *best = NULL;
+ u64 time_now = sched_clock();
+ struct dm_path *ret = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&s->lock, flags);
+ if (list_empty(&s->valid_paths))
+ goto out;
+
+ list_for_each_entry(pi, &s->valid_paths, list) {
+ if (!best || (hst_compare(pi, best, time_now, ps) < 0))
+ best = pi;
+ }
+
+ if (!best)
+ goto out;
+
+ /* Move last used path to end (least preferred in case of ties) */
+ list_move_tail(&best->list, &s->valid_paths);
+
+ ret = best->path;
+
+out:
+ spin_unlock_irqrestore(&s->lock, flags);
+ return ret;
+}
+
+static int hst_start_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pi->lock, flags);
+ pi->outstanding++;
+ spin_unlock_irqrestore(&pi->lock, flags);
+
+ return 0;
+}
+
+static u64 path_service_time(struct path_info *pi, u64 start_time)
+{
+ u64 sched_now = ktime_get_ns();
+
+ /* if a previous disk request has finished after this IO was
+ * sent to the hardware, pretend the submission happened
+ * serially.
+ */
+ if (time_after64(pi->last_finish, start_time))
+ start_time = pi->last_finish;
+
+ pi->last_finish = sched_now;
+ if (time_before64(sched_now, start_time))
+ return 0;
+
+ return sched_now - start_time;
+}
+
+static int hst_end_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes, u64 start_time)
+{
+ struct path_info *pi = path->pscontext;
+ struct selector *s = ps->context;
+ unsigned long flags;
+ u64 st;
+
+ spin_lock_irqsave(&pi->lock, flags);
+
+ st = path_service_time(pi, start_time);
+ pi->outstanding--;
+ pi->historical_service_time =
+ fixed_ema(pi->historical_service_time,
+ min(st * HST_FIXED_1, HST_FIXED_MAX),
+ hst_weight(ps, st));
+
+ /*
+ * On request end, mark path as fresh. If a path hasn't
+ * finished any requests within the fresh period, the estimated
+ * service time is considered too optimistic and we limit the
+ * maximum requests on that path.
+ */
+ pi->stale_after = pi->last_finish +
+ (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
+
+ spin_unlock_irqrestore(&pi->lock, flags);
+
+ return 0;
+}
+
+static struct path_selector_type hst_ps = {
+ .name = "historical-service-time",
+ .module = THIS_MODULE,
+ .table_args = 1,
+ .info_args = 3,
+ .create = hst_create,
+ .destroy = hst_destroy,
+ .status = hst_status,
+ .add_path = hst_add_path,
+ .fail_path = hst_fail_path,
+ .reinstate_path = hst_reinstate_path,
+ .select_path = hst_select_path,
+ .start_io = hst_start_io,
+ .end_io = hst_end_io,
+};
+
+static int __init dm_hst_init(void)
+{
+ int r = dm_register_path_selector(&hst_ps);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ DMINFO("version " HST_VERSION " loaded");
+
+ return r;
+}
+
+static void __exit dm_hst_exit(void)
+{
+ int r = dm_unregister_path_selector(&hst_ps);
+
+ if (r < 0)
+ DMERR("unregister failed %d", r);
+}
+
+module_init(dm_hst_init);
+module_exit(dm_hst_exit);
+
+MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
+MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
+MODULE_LICENSE("GPL");
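The selector registers under the name "historical-service-time", so it is chosen per priority group in a multipath table. Below is a sketch of what a two-path table might look like; the length (2097152 sectors) and the 8:16/8:32 device numbers are placeholders, and the leading "0 0 1 1" (no features, no hardware handler arguments, one path group, start with group 1) follows the usual dm-mpath constructor layout rather than anything introduced by this patch.

dmsetup create mp0 --table \
        "0 2097152 multipath 0 0 1 1 historical-service-time 0 2 0 8:16 8:32"

The "0" after the selector name passes no arguments to hst_create(), so the default HST_FIXED_95 base weight and a zero threshold multiplier apply; per-path arguments (the optional repeat count parsed by hst_add_path()) are likewise omitted here.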
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
index b869316..b0c45c6 100644
--- a/drivers/md/dm-init.c
+++ b/drivers/md/dm-init.c
@@ -36,7 +36,7 @@ struct dm_device {
struct list_head list;
};
-const char * const dm_allowed_targets[] __initconst = {
+static const char * const dm_allowed_targets[] __initconst = {
"crypt",
"delay",
"linear",
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 9f4d657..4c7da1c 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -39,6 +39,7 @@
#define RECALC_WRITE_SUPER 16
#define BITMAP_BLOCK_SIZE 4096 /* don't change it */
#define BITMAP_FLUSH_INTERVAL (10 * HZ)
+#define DISCARD_FILLER 0xf6
/*
* Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -55,6 +56,7 @@
#define SB_VERSION_1 1
#define SB_VERSION_2 2
#define SB_VERSION_3 3
+#define SB_VERSION_4 4
#define SB_SECTORS 8
#define MAX_SECTORS_PER_BLOCK 8
@@ -75,6 +77,7 @@ struct superblock {
#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
#define SB_FLAG_RECALCULATING 0x2
#define SB_FLAG_DIRTY_BITMAP 0x4
+#define SB_FLAG_FIXED_PADDING 0x8
#define JOURNAL_ENTRY_ROUNDUP 8
@@ -89,7 +92,7 @@ struct journal_entry {
} s;
__u64 sector;
} u;
- commit_id_t last_bytes[0];
+ commit_id_t last_bytes[];
/* __u8 tag[0]; */
};
@@ -254,6 +257,8 @@ struct dm_integrity_c {
bool journal_uptodate;
bool just_formatted;
bool recalculate_flag;
+ bool discard;
+ bool fix_padding;
bool legacy_recalculate;
struct alg_spec internal_hash_alg;
@@ -282,7 +287,7 @@ struct dm_integrity_io {
struct work_struct work;
struct dm_integrity_c *ic;
- bool write;
+ enum req_opf op;
bool fua;
struct dm_integrity_range range;
@@ -472,7 +477,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
static void sb_set_version(struct dm_integrity_c *ic)
{
- if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING))
+ ic->sb->version = SB_VERSION_4;
+ else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
ic->sb->version = SB_VERSION_3;
else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
ic->sb->version = SB_VERSION_2;
@@ -514,8 +521,8 @@ static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
- (unsigned long long)sector,
- (unsigned long long)n_sectors,
+ sector,
+ n_sectors,
ic->sb->log2_sectors_per_block,
ic->log2_blocks_per_bitmap_bit,
mode);
@@ -1303,6 +1310,11 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_
static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
unsigned *metadata_offset, unsigned total_size, int op)
{
+#define MAY_BE_FILLER 1
+#define MAY_BE_HASH 2
+ unsigned hash_offset = 0;
+ unsigned may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+
do {
unsigned char *data, *dp;
struct dm_buffer *b;
@@ -1324,18 +1336,35 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
} else if (op == TAG_WRITE) {
memcpy(dp, tag, to_copy);
dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
- } else {
+ } else {
/* e.g.: op == TAG_CMP */
- if (unlikely(memcmp(dp, tag, to_copy))) {
- unsigned i;
- for (i = 0; i < to_copy; i++) {
- if (dp[i] != tag[i])
- break;
- total_size--;
+ if (likely(is_power_of_2(ic->tag_size))) {
+ if (unlikely(memcmp(dp, tag, to_copy)))
+ if (unlikely(!ic->discard) ||
+ unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) {
+ goto thorough_test;
}
- dm_bufio_release(b);
- return total_size;
+ } else {
+ unsigned i, ts;
+thorough_test:
+ ts = total_size;
+
+ for (i = 0; i < to_copy; i++, ts--) {
+ if (unlikely(dp[i] != tag[i]))
+ may_be &= ~MAY_BE_HASH;
+ if (likely(dp[i] != DISCARD_FILLER))
+ may_be &= ~MAY_BE_FILLER;
+ hash_offset++;
+ if (unlikely(hash_offset == ic->tag_size)) {
+ if (unlikely(!may_be)) {
+ dm_bufio_release(b);
+ return ts;
+ }
+ hash_offset = 0;
+ may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+ }
+ }
}
}
dm_bufio_release(b);
@@ -1346,10 +1375,17 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
(*metadata_block)++;
*metadata_offset = 0;
}
+
+ if (unlikely(!is_power_of_2(ic->tag_size))) {
+ hash_offset = (hash_offset + to_copy) % ic->tag_size;
+ }
+
total_size -= to_copy;
} while (unlikely(total_size));
return 0;
+#undef MAY_BE_FILLER
+#undef MAY_BE_HASH
}
struct flush_request {
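With discard support, dm_integrity_rw_tag() overwrites the metadata of discarded blocks with DISCARD_FILLER (0xf6), and the TAG_CMP path above tolerates a mismatching tag that consists entirely of filler bytes, so reading never-rewritten discarded data is not reported as corruption. The following userspace sketch illustrates only that acceptance rule; all_filler() is a local stand-in for the kernel's memchr_inv() test and the tag size is an arbitrary example, not the patch's code.

#include <stdio.h>
#include <string.h>

#define DISCARD_FILLER 0xf6
#define TAG_SIZE 4

/* Stand-in for the memchr_inv(dp, DISCARD_FILLER, n) == NULL check above. */
static int all_filler(const unsigned char *p, size_t n)
{
        size_t i;

        for (i = 0; i < n; i++)
                if (p[i] != DISCARD_FILLER)
                        return 0;
        return 1;
}

/* 0: tag accepted, 1: mismatch reported (as a checksum failure would be). */
static int compare_tag(const unsigned char *stored, const unsigned char *computed,
                       int discard_enabled)
{
        if (!memcmp(stored, computed, TAG_SIZE))
                return 0;
        if (discard_enabled && all_filler(stored, TAG_SIZE))
                return 0;       /* block was discarded; treat the stale tag as valid */
        return 1;
}

int main(void)
{
        unsigned char computed[TAG_SIZE]  = { 0x12, 0x34, 0x56, 0x78 };
        unsigned char discarded[TAG_SIZE] = { 0xf6, 0xf6, 0xf6, 0xf6 };
        unsigned char corrupt[TAG_SIZE]   = { 0x12, 0x34, 0x56, 0x00 };

        printf("discarded block: %s\n", compare_tag(discarded, computed, 1) ? "mismatch" : "accepted");
        printf("corrupt block:   %s\n", compare_tag(corrupt, computed, 1) ? "mismatch" : "accepted");
        return 0;
}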
@@ -1472,7 +1508,7 @@ static void dec_in_flight(struct dm_integrity_io *dio)
remove_range(ic, &dio->range);
- if (unlikely(dio->write))
+ if (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))
schedule_autocommit(ic);
bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
@@ -1564,14 +1600,17 @@ static void integrity_metadata(struct work_struct *w)
char *checksums;
unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
- unsigned sectors_to_process = dio->range.n_sectors;
- sector_t sector = dio->range.logical_sector;
+ sector_t sector;
+ unsigned sectors_to_process;
if (unlikely(ic->mode == 'R'))
goto skip_io;
- checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
- GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
+ if (likely(dio->op != REQ_OP_DISCARD))
+ checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
+ GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
+ else
+ checksums = kmalloc(PAGE_SIZE, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
if (!checksums) {
checksums = checksums_onstack;
if (WARN_ON(extra_space &&
@@ -1581,6 +1620,41 @@ static void integrity_metadata(struct work_struct *w)
}
}
+ if (unlikely(dio->op == REQ_OP_DISCARD)) {
+ sector_t bi_sector = dio->bio_details.bi_iter.bi_sector;
+ unsigned bi_size = dio->bio_details.bi_iter.bi_size;
+ unsigned max_size = likely(checksums != checksums_onstack) ? PAGE_SIZE : HASH_MAX_DIGESTSIZE;
+ unsigned max_blocks = max_size / ic->tag_size;
+ memset(checksums, DISCARD_FILLER, max_size);
+
+ while (bi_size) {
+ unsigned this_step_blocks = bi_size >> (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
+ this_step_blocks = min(this_step_blocks, max_blocks);
+ r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
+ this_step_blocks * ic->tag_size, TAG_WRITE);
+ if (unlikely(r)) {
+ if (likely(checksums != checksums_onstack))
+ kfree(checksums);
+ goto error;
+ }
+
+ /*if (bi_size < this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block)) {
+ printk("BUGG: bi_sector: %llx, bi_size: %u\n", bi_sector, bi_size);
+ printk("BUGG: this_step_blocks: %u\n", this_step_blocks);
+ BUG();
+ }*/
+ bi_size -= this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
+ bi_sector += this_step_blocks << ic->sb->log2_sectors_per_block;
+ }
+
+ if (likely(checksums != checksums_onstack))
+ kfree(checksums);
+ goto skip_io;
+ }
+
+ sector = dio->range.logical_sector;
+ sectors_to_process = dio->range.n_sectors;
+
__bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
unsigned pos;
char *mem, *checksums_ptr;
@@ -1599,11 +1673,12 @@ static void integrity_metadata(struct work_struct *w)
kunmap_atomic(mem);
r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
- checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
+ checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
if (unlikely(r)) {
if (r > 0) {
- DMERR_LIMIT("Checksum failed at sector 0x%llx",
- (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
+ char b[BDEVNAME_SIZE];
+ DMERR_LIMIT("%s: Checksum failed at sector 0x%llx", bio_devname(bio, b),
+ (sector - ((r + ic->tag_size - 1) / ic->tag_size)));
r = -EILSEQ;
atomic64_inc(&ic->number_of_mismatches);
}
@@ -1642,7 +1717,7 @@ static void integrity_metadata(struct work_struct *w)
tag = lowmem_page_address(biv.bv_page) + biv.bv_offset;
this_len = min(biv.bv_len, data_to_process);
r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
- this_len, !dio->write ? TAG_READ : TAG_WRITE);
+ this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE);
if (unlikely(r))
goto error;
data_to_process -= this_len;
@@ -1669,6 +1744,20 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
dio->ic = ic;
dio->bi_status = 0;
+ dio->op = bio_op(bio);
+
+ if (unlikely(dio->op == REQ_OP_DISCARD)) {
+ if (ti->max_io_len) {
+ sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector);
+ unsigned log2_max_io_len = __fls(ti->max_io_len);
+ sector_t start_boundary = sec >> log2_max_io_len;
+ sector_t end_boundary = (sec + bio_sectors(bio) - 1) >> log2_max_io_len;
+ if (start_boundary < end_boundary) {
+ sector_t len = ti->max_io_len - (sec & (ti->max_io_len - 1));
+ dm_accept_partial_bio(bio, len);
+ }
+ }
+ }
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
submit_flush_bio(ic, dio);
@@ -1676,8 +1765,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
}
dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
- dio->write = bio_op(bio) == REQ_OP_WRITE;
- dio->fua = dio->write && bio->bi_opf & REQ_FUA;
+ dio->fua = dio->op == REQ_OP_WRITE && bio->bi_opf & REQ_FUA;
if (unlikely(dio->fua)) {
/*
* Don't pass down the FUA flag because we have to flush
@@ -1687,18 +1775,18 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
}
if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
- (unsigned long long)dio->range.logical_sector, bio_sectors(bio),
- (unsigned long long)ic->provided_data_sectors);
+ dio->range.logical_sector, bio_sectors(bio),
+ ic->provided_data_sectors);
return DM_MAPIO_KILL;
}
if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
ic->sectors_per_block,
- (unsigned long long)dio->range.logical_sector, bio_sectors(bio));
+ dio->range.logical_sector, bio_sectors(bio));
return DM_MAPIO_KILL;
}
- if (ic->sectors_per_block > 1) {
+ if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) {
struct bvec_iter iter;
struct bio_vec bv;
bio_for_each_segment(bv, bio, iter) {
@@ -1731,7 +1819,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
}
}
- if (unlikely(ic->mode == 'R') && unlikely(dio->write))
+ if (unlikely(ic->mode == 'R') && unlikely(dio->op != REQ_OP_READ))
return DM_MAPIO_KILL;
get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
@@ -1761,13 +1849,13 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
retry_kmap:
mem = kmap_atomic(bv.bv_page);
- if (likely(dio->write))
+ if (likely(dio->op == REQ_OP_WRITE))
flush_dcache_page(bv.bv_page);
do {
struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
- if (unlikely(!dio->write)) {
+ if (unlikely(dio->op == REQ_OP_READ)) {
struct journal_sector *js;
char *mem_ptr;
unsigned s;
@@ -1797,7 +1885,7 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
- (unsigned long long)logical_sector);
+ logical_sector);
}
}
#endif
@@ -1814,7 +1902,7 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
char *tag_addr;
BUG_ON(PageHighMem(biv.bv_page));
tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset;
- if (likely(dio->write))
+ if (likely(dio->op == REQ_OP_WRITE))
memcpy(tag_ptr, tag_addr, tag_now);
else
memcpy(tag_addr, tag_ptr, tag_now);
@@ -1822,12 +1910,12 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
tag_ptr += tag_now;
tag_todo -= tag_now;
} while (unlikely(tag_todo)); else {
- if (likely(dio->write))
+ if (likely(dio->op == REQ_OP_WRITE))
memset(tag_ptr, 0, tag_todo);
}
}
- if (likely(dio->write)) {
+ if (likely(dio->op == REQ_OP_WRITE)) {
struct journal_sector *js;
unsigned s;
@@ -1863,12 +1951,12 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
} while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
- if (unlikely(!dio->write))
+ if (unlikely(dio->op == REQ_OP_READ))
flush_dcache_page(bv.bv_page);
kunmap_atomic(mem);
} while (n_sectors);
- if (likely(dio->write)) {
+ if (likely(dio->op == REQ_OP_WRITE)) {
smp_mb();
if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
wake_up(&ic->copy_to_journal_wait);
@@ -1900,7 +1988,10 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
unsigned journal_section, journal_entry;
unsigned journal_read_pos;
struct completion read_comp;
- bool need_sync_io = ic->internal_hash && !dio->write;
+ bool discard_retried = false;
+ bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ;
+ if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D')
+ need_sync_io = true;
if (need_sync_io && from_map) {
INIT_WORK(&dio->work, integrity_bio_wait);
@@ -1918,8 +2009,8 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
}
dio->range.n_sectors = bio_sectors(bio);
journal_read_pos = NOT_FOUND;
- if (likely(ic->mode == 'J')) {
- if (dio->write) {
+ if (ic->mode == 'J' && likely(dio->op != REQ_OP_DISCARD)) {
+ if (dio->op == REQ_OP_WRITE) {
unsigned next_entry, i, pos;
unsigned ws, we, range_sectors;
@@ -2014,6 +2105,21 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
}
}
}
+ if (ic->mode == 'J' && likely(dio->op == REQ_OP_DISCARD) && !discard_retried) {
+ sector_t next_sector;
+ unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
+ if (unlikely(new_pos != NOT_FOUND) ||
+ unlikely(next_sector < dio->range.logical_sector - dio->range.n_sectors)) {
+ remove_range_unlocked(ic, &dio->range);
+ spin_unlock_irq(&ic->endio_wait.lock);
+ queue_work(ic->commit_wq, &ic->commit_work);
+ flush_workqueue(ic->commit_wq);
+ queue_work(ic->writer_wq, &ic->writer_work);
+ flush_workqueue(ic->writer_wq);
+ discard_retried = true;
+ goto lock_retry;
+ }
+ }
spin_unlock_irq(&ic->endio_wait.lock);
if (unlikely(journal_read_pos != NOT_FOUND)) {
@@ -2022,7 +2128,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
goto journal_read_write;
}
- if (ic->mode == 'B' && dio->write) {
+ if (ic->mode == 'B' && (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))) {
if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
struct bitmap_block_status *bbs;
@@ -2051,7 +2157,19 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
bio->bi_end_io = integrity_end_io;
bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
- generic_make_request(bio);
+ if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) {
+ integrity_metadata(&dio->work);
+ dm_integrity_flush_buffers(ic, false);
+
+ dio->in_flight = (atomic_t)ATOMIC_INIT(1);
+ dio->completion = NULL;
+
+ submit_bio_noacct(bio);
+
+ return;
+ }
+
+ submit_bio_noacct(bio);
if (need_sync_io) {
wait_for_completion_io(&read_comp);
@@ -2237,6 +2355,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
sec &= ~(sector_t)(ic->sectors_per_block - 1);
}
}
+ if (unlikely(sec >= ic->provided_data_sectors))
+ continue;
get_area_and_offset(ic, sec, &area, &offset);
restore_last_bytes(ic, access_journal_data(ic, i, j), je);
for (k = j + 1; k < ic->journal_section_entries; k++) {
@@ -2246,6 +2366,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
break;
BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
sec2 = journal_entry_get_sector(je2);
+ if (unlikely(sec2 >= ic->provided_data_sectors))
+ break;
get_area_and_offset(ic, sec2, &area2, &offset2);
if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
break;
@@ -2449,7 +2571,7 @@ static void integrity_recalc(struct work_struct *w)
get_area_and_offset(ic, logical_sector, &area, &offset);
}
- DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors);
+ DEBUG_print("recalculating: %llx, %llx\n", logical_sector, n_sectors);
if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
recalc_write_super(ic);
@@ -2591,8 +2713,6 @@ static void bitmap_flush_work(struct work_struct *work)
spin_unlock_irq(&ic->endio_wait.lock);
dm_integrity_flush_buffers(ic, true);
- if (ic->meta_dev)
- blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL);
limit = ic->provided_data_sectors;
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
@@ -2884,9 +3004,29 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
static void dm_integrity_resume(struct dm_target *ti)
{
struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+ __u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
int r;
+
DEBUG_print("resume\n");
+ if (ic->provided_data_sectors != old_provided_data_sectors) {
+ if (ic->provided_data_sectors > old_provided_data_sectors &&
+ ic->mode == 'B' &&
+ ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
+ rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+ block_bitmap_op(ic, ic->journal, old_provided_data_sectors,
+ ic->provided_data_sectors - old_provided_data_sectors, BITMAP_OP_SET);
+ rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+ }
+
+ ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
+ r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+ if (unlikely(r))
+ dm_integrity_io_error(ic, "writing superblock", r);
+ }
+
if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
DEBUG_print("resume dirty_bitmap\n");
rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
@@ -2954,7 +3094,7 @@ static void dm_integrity_resume(struct dm_target *ti)
DEBUG_print("testing recalc: %x\n", ic->sb->flags);
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
__u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
- DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors);
+ DEBUG_print("recalc pos: %llx / %llx\n", recalc_pos, ic->provided_data_sectors);
if (recalc_pos < ic->provided_data_sectors) {
queue_work(ic->recalc_wq, &ic->recalc_work);
} else if (recalc_pos > ic->provided_data_sectors) {
@@ -2985,9 +3125,9 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_INFO:
DMEMIT("%llu %llu",
(unsigned long long)atomic64_read(&ic->number_of_mismatches),
- (unsigned long long)ic->provided_data_sectors);
+ ic->provided_data_sectors);
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
- DMEMIT(" %llu", (unsigned long long)le64_to_cpu(ic->sb->recalc_sector));
+ DMEMIT(" %llu", le64_to_cpu(ic->sb->recalc_sector));
else
DMEMIT(" -");
break;
@@ -3000,6 +3140,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
arg_count += !!ic->meta_dev;
arg_count += ic->sectors_per_block != 1;
arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
+ arg_count += ic->discard;
arg_count += ic->mode == 'J';
arg_count += ic->mode == 'J';
arg_count += ic->mode == 'B';
@@ -3007,8 +3148,9 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
arg_count += !!ic->internal_hash_alg.alg_string;
arg_count += !!ic->journal_crypt_alg.alg_string;
arg_count += !!ic->journal_mac_alg.alg_string;
+ arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0;
arg_count += ic->legacy_recalculate;
- DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
+ DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start,
ic->tag_size, ic->mode, arg_count);
if (ic->meta_dev)
DMEMIT(" meta_device:%s", ic->meta_dev->name);
@@ -3016,6 +3158,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
DMEMIT(" recalculate");
+ if (ic->discard)
+ DMEMIT(" allow_discards");
DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
@@ -3024,9 +3168,11 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
DMEMIT(" commit_time:%u", ic->autocommit_msec);
}
if (ic->mode == 'B') {
- DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
+ DMEMIT(" sectors_per_bit:%llu", (sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
}
+ if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0)
+ DMEMIT(" fix_padding");
if (ic->legacy_recalculate)
DMEMIT(" legacy_recalculate");
@@ -3097,8 +3243,14 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
if (!ic->meta_dev) {
sector_t last_sector, last_area, last_offset;
- ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
- (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT;
+ /* we have to maintain excessive padding for compatibility with existing volumes */
+ __u64 metadata_run_padding =
+ ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ?
+ (__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) :
+ (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS);
+
+ ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
+ metadata_run_padding) >> SECTOR_SHIFT;
if (!(ic->metadata_run & (ic->metadata_run - 1)))
ic->log2_metadata_run = __ffs(ic->metadata_run);
else
@@ -3123,6 +3275,24 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
return 0;
}
+static void get_provided_data_sectors(struct dm_integrity_c *ic)
+{
+ if (!ic->meta_dev) {
+ int test_bit;
+ ic->provided_data_sectors = 0;
+ for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
+ __u64 prev_data_sectors = ic->provided_data_sectors;
+
+ ic->provided_data_sectors |= (sector_t)1 << test_bit;
+ if (calculate_device_limits(ic))
+ ic->provided_data_sectors = prev_data_sectors;
+ }
+ } else {
+ ic->provided_data_sectors = ic->data_device_sectors;
+ ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
+ }
+}
+
static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors)
{
unsigned journal_sections;
@@ -3141,6 +3311,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec
journal_sections = 1;
if (!ic->meta_dev) {
+ if (ic->fix_padding)
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING);
ic->sb->journal_sections = cpu_to_le32(journal_sections);
if (!interleave_sectors)
interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
@@ -3148,20 +3320,15 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec
ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
- ic->provided_data_sectors = 0;
- for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
- __u64 prev_data_sectors = ic->provided_data_sectors;
-
- ic->provided_data_sectors |= (sector_t)1 << test_bit;
- if (calculate_device_limits(ic))
- ic->provided_data_sectors = prev_data_sectors;
- }
+ get_provided_data_sectors(ic);
if (!ic->provided_data_sectors)
return -EINVAL;
} else {
ic->sb->log2_interleave_sectors = 0;
- ic->provided_data_sectors = ic->data_device_sectors;
- ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
+
+ get_provided_data_sectors(ic);
+ if (!ic->provided_data_sectors)
+ return -EINVAL;
try_smaller_buffer:
ic->sb->journal_sections = cpu_to_le32(0);
@@ -3300,8 +3467,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
static void free_alg(struct alg_spec *a)
{
- kzfree(a->alg_string);
- kzfree(a->key);
+ kfree_sensitive(a->alg_string);
+ kfree_sensitive(a->key);
memset(a, 0, sizeof *a);
}
@@ -3345,7 +3512,7 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
int r;
if (a->alg_string) {
- *hash = crypto_alloc_shash(a->alg_string, 0, 0);
+ *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
if (IS_ERR(*hash)) {
*error = error_alg;
r = PTR_ERR(*hash);
@@ -3402,7 +3569,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
struct journal_completion comp;
comp.ic = ic;
- ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0);
+ ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
if (IS_ERR(ic->journal_crypt)) {
*error = "Invalid journal cipher";
r = PTR_ERR(ic->journal_crypt);
@@ -3637,7 +3804,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
unsigned extra_args;
struct dm_arg_set as;
static const struct dm_arg _args[] = {
- {0, 14, "Invalid number of feature args"},
+ {0, 16, "Invalid number of feature args"},
};
unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
bool should_write_sb;
@@ -3782,6 +3949,10 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
} else if (!strcmp(opt_string, "recalculate")) {
ic->recalculate_flag = true;
+ } else if (!strcmp(opt_string, "allow_discards")) {
+ ic->discard = true;
+ } else if (!strcmp(opt_string, "fix_padding")) {
+ ic->fix_padding = true;
} else if (!strcmp(opt_string, "legacy_recalculate")) {
ic->legacy_recalculate = true;
} else {
@@ -3840,6 +4011,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
+ if (ic->discard && !ic->internal_hash) {
+ r = -EINVAL;
+ ti->error = "Discard can be only used with internal hash";
+ goto bad;
+ }
+
ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
ic->autocommit_msec = sync_msec;
timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
@@ -3934,7 +4111,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
should_write_sb = true;
}
- if (!ic->sb->version || ic->sb->version > SB_VERSION_3) {
+ if (!ic->sb->version || ic->sb->version > SB_VERSION_4) {
r = -EINVAL;
ti->error = "Unknown version";
goto bad;
@@ -3969,19 +4146,19 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
}
- ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
- if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) {
- /* test for overflow */
- r = -EINVAL;
- ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors";
- goto bad;
- }
if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
r = -EINVAL;
ti->error = "Journal mac mismatch";
goto bad;
}
+ get_provided_data_sectors(ic);
+ if (!ic->provided_data_sectors) {
+ r = -EINVAL;
+ ti->error = "The device is too small";
+ goto bad;
+ }
+
try_smaller_buffer:
r = calculate_device_limits(ic);
if (r) {
@@ -4043,10 +4220,9 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors);
DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run);
DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run);
- DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
- (unsigned long long)ic->provided_data_sectors);
+ DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", ic->provided_data_sectors, ic->provided_data_sectors);
DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
- DEBUG_print(" bits_in_journal %llu\n", (unsigned long long)bits_in_journal);
+ DEBUG_print(" bits_in_journal %llu\n", bits_in_journal);
if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
@@ -4184,6 +4360,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->num_flush_bios = 1;
ti->flush_supported = true;
+ if (ic->discard)
+ ti->num_discard_bios = 1;
return 0;
@@ -4238,7 +4416,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
for (i = 0; i < ic->journal_sections; i++) {
struct skcipher_request *req = ic->sk_requests[i];
if (req) {
- kzfree(req->iv);
+ kfree_sensitive(req->iv);
skcipher_request_free(req);
}
}
@@ -4265,7 +4443,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
static struct target_type integrity_target = {
.name = "integrity",
- .version = {1, 3, 0},
+ .version = {1, 6, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 81ffc59..4312007 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -306,7 +306,7 @@ static void do_region(int op, int op_flags, unsigned region,
struct request_queue *q = bdev_get_queue(where->bdev);
unsigned short logical_block_size = queue_logical_block_size(q);
sector_t num_sectors;
- unsigned int uninitialized_var(special_cmd_max_sectors);
+ unsigned int special_cmd_max_sectors;
/*
* Reject unsupported discard and write same requests.
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 3f15d8d..1ca65b4 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1168,7 +1168,7 @@ static void retrieve_status(struct dm_table *table,
spec->sector_start = ti->begin;
spec->length = ti->len;
strncpy(spec->target_type, ti->type->name,
- sizeof(spec->target_type));
+ sizeof(spec->target_type) - 1);
outptr += sizeof(struct dm_target_spec);
remaining = len - (outptr - outbuf);
@@ -1471,7 +1471,7 @@ static void retrieve_deps(struct dm_table *table,
/*
* Check we have enough space.
*/
- needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
+ needed = struct_size(deps, dev, count);
if (len < needed) {
param->flags |= DM_BUFFER_FULL_FLAG;
return;
@@ -1845,7 +1845,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us
int ioctl_flags;
int param_flags;
unsigned int cmd;
- struct dm_ioctl *uninitialized_var(param);
+ struct dm_ioctl *param;
ioctl_fn fn = NULL;
size_t input_param_size;
struct dm_ioctl param_kernel;
@@ -2045,7 +2045,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
return r;
}
-
+EXPORT_SYMBOL_GPL(dm_copy_name_and_uuid);
/**
* dm_early_create - create a mapped device in early boot.
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index ecefe67..00774b5 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -90,7 +90,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
struct linear_c *lc = ti->private;
bio_set_dev(bio, lc->dev->bdev);
- if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
+ if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio)))
bio->bi_iter.bi_sector =
linear_map_sector(ti, bio->bi_iter.bi_sector);
}
@@ -136,21 +136,15 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
}
#ifdef CONFIG_BLK_DEV_ZONED
-static int linear_report_zones(struct dm_target *ti, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones)
+static int linear_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
{
- struct linear_c *lc = (struct linear_c *) ti->private;
- int ret;
+ struct linear_c *lc = ti->private;
+ sector_t sector = linear_map_sector(ti, args->next_sector);
- /* Do report and remap it */
- ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector),
- zones, nr_zones);
- if (ret != 0)
- return ret;
-
- if (*nr_zones)
- dm_remap_zone_report(ti, lc->start, zones, nr_zones);
- return 0;
+ args->start = lc->start;
+ return blkdev_report_zones(lc->dev->bdev, sector, nr_zones,
+ dm_report_zones_cb, args);
}
#endif
@@ -207,20 +201,38 @@ static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
+static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ int ret;
+ struct linear_c *lc = ti->private;
+ struct block_device *bdev = lc->dev->bdev;
+ struct dax_device *dax_dev = lc->dev->dax_dev;
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+ dev_sector = linear_map_sector(ti, sector);
+ ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
+ if (ret)
+ return ret;
+ return dax_zero_page_range(dax_dev, pgoff, nr_pages);
+}
+
#else
#define linear_dax_direct_access NULL
#define linear_dax_copy_from_iter NULL
#define linear_dax_copy_to_iter NULL
+#define linear_dax_zero_page_range NULL
#endif
static struct target_type linear_target = {
.name = "linear",
.version = {1, 4, 0},
#ifdef CONFIG_BLK_DEV_ZONED
- .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
+ .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
+ DM_TARGET_ZONED_HM,
.report_zones = linear_report_zones,
#else
- .features = DM_TARGET_PASSES_INTEGRITY,
+ .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT,
#endif
.module = THIS_MODULE,
.ctr = linear_ctr,
@@ -232,6 +244,7 @@ static struct target_type linear_target = {
.direct_access = linear_dax_direct_access,
.dax_copy_from_iter = linear_dax_copy_from_iter,
.dax_copy_to_iter = linear_dax_copy_to_iter,
+ .dax_zero_page_range = linear_dax_zero_page_range,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 99721c7..e3d35c6 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -127,7 +127,7 @@ struct pending_block {
char *data;
u32 datalen;
struct list_head list;
- struct bio_vec vecs[0];
+ struct bio_vec vecs[];
};
struct per_bio_data {
@@ -994,10 +994,26 @@ static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}
+static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ int ret;
+ struct log_writes_c *lc = ti->private;
+ sector_t sector = pgoff * PAGE_SECTORS;
+
+ ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages << PAGE_SHIFT,
+ &pgoff);
+ if (ret)
+ return ret;
+ return dax_zero_page_range(lc->dev->dax_dev, pgoff,
+ nr_pages << PAGE_SHIFT);
+}
+
#else
#define log_writes_dax_direct_access NULL
#define log_writes_dax_copy_from_iter NULL
#define log_writes_dax_copy_to_iter NULL
+#define log_writes_dax_zero_page_range NULL
#endif
static struct target_type log_writes_target = {
@@ -1016,6 +1032,7 @@ static struct target_type log_writes_target = {
.direct_access = log_writes_dax_direct_access,
.dax_copy_from_iter = log_writes_dax_copy_from_iter,
.dax_copy_to_iter = log_writes_dax_copy_to_iter,
+ .dax_zero_page_range = log_writes_dax_zero_page_range,
};
static int __init dm_log_writes_init(void)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 54ecfea..bced42f 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -20,6 +20,7 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
+#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
@@ -29,6 +30,9 @@
#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
+#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
+
+static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
/* Path properties */
struct pgpath {
@@ -91,6 +95,8 @@ struct multipath {
struct work_struct process_queued_bios;
struct bio_list queued_bios;
+
+ struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
};
/*
@@ -108,6 +114,7 @@ static void trigger_event(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
+static void queue_if_no_path_timeout_work(struct timer_list *t);
/*-----------------------------------------------
* Multipath state flags.
@@ -121,6 +128,20 @@ static void process_queued_bios(struct work_struct *work);
#define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */
#define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */
+static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m)
+{
+ bool r = test_bit(MPATHF_bit, &m->flags);
+
+ if (r) {
+ unsigned long flags;
+ spin_lock_irqsave(&m->lock, flags);
+ r = test_bit(MPATHF_bit, &m->flags);
+ spin_unlock_irqrestore(&m->lock, flags);
+ }
+
+ return r;
+}
+
/*-----------------------------------------------
* Allocation routines
*-----------------------------------------------*/
@@ -195,6 +216,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->ti = ti;
ti->private = m;
+
+ timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
}
return m;
@@ -326,6 +349,8 @@ static int pg_init_all_paths(struct multipath *m)
static void __switch_pg(struct multipath *m, struct priority_group *pg)
{
+ lockdep_assert_held(&m->lock);
+
m->current_pg = pg;
/* Must we initialise the PG first, and queue I/O till it's ready? */
@@ -373,7 +398,9 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
unsigned bypassed = 1;
if (!atomic_read(&m->nr_valid_paths)) {
+ spin_lock_irqsave(&m->lock, flags);
clear_bit(MPATHF_QUEUE_IO, &m->flags);
+ spin_unlock_irqrestore(&m->lock, flags);
goto failed;
}
@@ -413,8 +440,11 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
continue;
pgpath = choose_path_in_pg(m, pg, nr_bytes);
if (!IS_ERR_OR_NULL(pgpath)) {
- if (!bypassed)
+ if (!bypassed) {
+ spin_lock_irqsave(&m->lock, flags);
set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
+ spin_unlock_irqrestore(&m->lock, flags);
+ }
return pgpath;
}
}
@@ -430,51 +460,38 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
}
/*
- * dm_report_EIO() is a macro instead of a function to make pr_debug()
+ * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited()
* report the function name and line number of the function from which
* it has been invoked.
*/
#define dm_report_EIO(m) \
do { \
- struct mapped_device *md = dm_table_get_md((m)->ti->table); \
- \
- pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
- dm_device_name(md), \
- test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \
- test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
- dm_noflush_suspending((m)->ti)); \
+ DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \
+ dm_table_device_name((m)->ti->table), \
+ test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \
+ test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
+ dm_noflush_suspending((m)->ti)); \
} while (0)
/*
* Check whether bios must be queued in the device-mapper core rather
* than here in the target.
- *
- * If MPATHF_QUEUE_IF_NO_PATH and MPATHF_SAVED_QUEUE_IF_NO_PATH hold
- * the same value then we are not between multipath_presuspend()
- * and multipath_resume() calls and we have no need to check
- * for the DMF_NOFLUSH_SUSPENDING flag.
*/
-static bool __must_push_back(struct multipath *m, unsigned long flags)
+static bool __must_push_back(struct multipath *m)
{
- return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) !=
- test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &flags)) &&
- dm_noflush_suspending(m->ti));
+ return dm_noflush_suspending(m->ti);
}
-/*
- * Following functions use READ_ONCE to get atomic access to
- * all m->flags to avoid taking spinlock
- */
static bool must_push_back_rq(struct multipath *m)
{
- unsigned long flags = READ_ONCE(m->flags);
- return test_bit(MPATHF_QUEUE_IF_NO_PATH, &flags) || __must_push_back(m, flags);
-}
+ unsigned long flags;
+ bool ret;
-static bool must_push_back_bio(struct multipath *m)
-{
- unsigned long flags = READ_ONCE(m->flags);
- return __must_push_back(m, flags);
+ spin_lock_irqsave(&m->lock, flags);
+ ret = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m));
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ return ret;
}
/*
@@ -494,7 +511,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
/* Do we need to select a new pgpath? */
pgpath = READ_ONCE(m->current_pgpath);
- if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+ if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
pgpath = choose_pgpath(m, nr_bytes);
if (!pgpath) {
@@ -502,8 +519,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
return DM_MAPIO_DELAY_REQUEUE;
dm_report_EIO(m); /* Failed */
return DM_MAPIO_KILL;
- } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
- test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
+ } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
+ mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
pg_init_all_paths(m);
return DM_MAPIO_DELAY_REQUEUE;
}
@@ -558,7 +575,8 @@ static void multipath_release_clone(struct request *clone,
if (pgpath && pgpath->pg->ps.type->end_io)
pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
&pgpath->path,
- mpio->nr_bytes);
+ mpio->nr_bytes,
+ clone->io_start_time_ns);
}
blk_put_request(clone);
@@ -568,33 +586,45 @@ static void multipath_release_clone(struct request *clone,
* Map cloned bios (bio-based multipath)
*/
+static void __multipath_queue_bio(struct multipath *m, struct bio *bio)
+{
+ /* Queue for the daemon to resubmit */
+ bio_list_add(&m->queued_bios, bio);
+ if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
+ queue_work(kmultipathd, &m->process_queued_bios);
+}
+
+static void multipath_queue_bio(struct multipath *m, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+ __multipath_queue_bio(m, bio);
+ spin_unlock_irqrestore(&m->lock, flags);
+}
+
static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
{
struct pgpath *pgpath;
unsigned long flags;
- bool queue_io;
/* Do we need to select a new pgpath? */
pgpath = READ_ONCE(m->current_pgpath);
- if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+ if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
- /* MPATHF_QUEUE_IO might have been cleared by choose_pgpath. */
- queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
-
- if ((pgpath && queue_io) ||
- (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
- /* Queue for the daemon to resubmit */
+ if (!pgpath) {
spin_lock_irqsave(&m->lock, flags);
- bio_list_add(&m->queued_bios, bio);
+ if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ __multipath_queue_bio(m, bio);
+ pgpath = ERR_PTR(-EAGAIN);
+ }
spin_unlock_irqrestore(&m->lock, flags);
- /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
- if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
- pg_init_all_paths(m);
- else if (!queue_io)
- queue_work(kmultipathd, &m->process_queued_bios);
-
+ } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
+ mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
+ multipath_queue_bio(m, bio);
+ pg_init_all_paths(m);
return ERR_PTR(-EAGAIN);
}
@@ -610,7 +640,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio,
return DM_MAPIO_SUBMITTED;
if (!pgpath) {
- if (must_push_back_bio(m))
+ if (__must_push_back(m))
return DM_MAPIO_REQUEUE;
dm_report_EIO(m);
return DM_MAPIO_KILL;
@@ -685,7 +715,7 @@ static void process_queued_bios(struct work_struct *work)
bio_endio(bio);
break;
case DM_MAPIO_REMAPPED:
- generic_make_request(bio);
+ submit_bio_noacct(bio);
break;
case DM_MAPIO_SUBMITTED:
break;
@@ -700,15 +730,38 @@ static void process_queued_bios(struct work_struct *work)
* If we run out of usable paths, should we queue I/O or error it?
*/
static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
- bool save_old_value)
+ bool save_old_value, const char *caller)
{
unsigned long flags;
+ bool queue_if_no_path_bit, saved_queue_if_no_path_bit;
+ const char *dm_dev_name = dm_table_device_name(m->ti->table);
+
+ DMDEBUG("%s: %s caller=%s queue_if_no_path=%d save_old_value=%d",
+ dm_dev_name, __func__, caller, queue_if_no_path, save_old_value);
spin_lock_irqsave(&m->lock, flags);
- assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags,
- (save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
- (!save_old_value && queue_if_no_path));
+
+ queue_if_no_path_bit = test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+ saved_queue_if_no_path_bit = test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
+
+ if (save_old_value) {
+ if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) {
+ DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!",
+ dm_dev_name);
+ } else
+ assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit);
+ } else if (!queue_if_no_path && saved_queue_if_no_path_bit) {
+ /* due to "fail_if_no_path" message, need to honor it. */
+ clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
+ }
assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path);
+
+ DMDEBUG("%s: after %s changes; QIFNP = %d; SQIFNP = %d; DNFS = %d",
+ dm_dev_name, __func__,
+ test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
+ test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
+ dm_noflush_suspending(m->ti));
+
spin_unlock_irqrestore(&m->lock, flags);
if (!queue_if_no_path) {
@@ -720,6 +773,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
}
/*
+ * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
+ * process any queued I/O.
+ */
+static void queue_if_no_path_timeout_work(struct timer_list *t)
+{
+ struct multipath *m = from_timer(m, t, nopath_timer);
+
+ DMWARN("queue_if_no_path timeout on %s, failing queued IO",
+ dm_table_device_name(m->ti->table));
+ queue_if_no_path(m, false, false, __func__);
+}
+
+/*
+ * Enable the queue_if_no_path timeout if necessary.
+ * Called with m->lock held.
+ */
+static void enable_nopath_timeout(struct multipath *m)
+{
+ unsigned long queue_if_no_path_timeout =
+ READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
+
+ lockdep_assert_held(&m->lock);
+
+ if (queue_if_no_path_timeout > 0 &&
+ atomic_read(&m->nr_valid_paths) == 0 &&
+ test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ mod_timer(&m->nopath_timer,
+ jiffies + queue_if_no_path_timeout);
+ }
+}
+
+static void disable_nopath_timeout(struct multipath *m)
+{
+ del_timer_sync(&m->nopath_timer);
+}
+
+/*
* An event is triggered whenever a path is taken out of use.
* Includes path failure and PG bypass.
*/
@@ -783,7 +873,7 @@ static int setup_scsi_dh(struct block_device *bdev, struct multipath *m,
struct request_queue *q = bdev_get_queue(bdev);
int r;
- if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
+ if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) {
retain:
if (*attached_handler_name) {
/*
@@ -1032,7 +1122,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
argc--;
if (!strcasecmp(arg_name, "queue_if_no_path")) {
- r = queue_if_no_path(m, true, false);
+ r = queue_if_no_path(m, true, false, __func__);
continue;
}
@@ -1092,6 +1182,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
+ unsigned long flags;
as.argc = argc;
as.argv = argv;
@@ -1156,6 +1247,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
+ spin_lock_irqsave(&m->lock, flags);
+ enable_nopath_timeout(m);
+ spin_unlock_irqrestore(&m->lock, flags);
+
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
@@ -1218,6 +1313,7 @@ static void multipath_dtr(struct dm_target *ti)
{
struct multipath *m = ti->private;
+ disable_nopath_timeout(m);
flush_multipath_work(m);
free_multipath(m);
}
@@ -1235,7 +1331,9 @@ static int fail_path(struct pgpath *pgpath)
if (!pgpath->is_active)
goto out;
- DMWARN("Failing path %s.", pgpath->path.dev->name);
+ DMWARN("%s: Failing path %s.",
+ dm_table_device_name(m->ti->table),
+ pgpath->path.dev->name);
pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
pgpath->is_active = false;
@@ -1251,6 +1349,8 @@ static int fail_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event);
+ enable_nopath_timeout(m);
+
out:
spin_unlock_irqrestore(&m->lock, flags);
@@ -1272,7 +1372,9 @@ static int reinstate_path(struct pgpath *pgpath)
if (pgpath->is_active)
goto out;
- DMWARN("Reinstating path %s.", pgpath->path.dev->name);
+ DMWARN("%s: Reinstating path %s.",
+ dm_table_device_name(m->ti->table),
+ pgpath->path.dev->name);
r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
if (r)
@@ -1301,6 +1403,9 @@ static int reinstate_path(struct pgpath *pgpath)
process_queued_io_list(m);
}
+ if (pgpath->is_active)
+ disable_nopath_timeout(m);
+
return r;
}
@@ -1454,8 +1559,8 @@ static void pg_init_done(void *data, int errors)
break;
case SCSI_DH_RETRY:
/* Wait before retrying. */
- delay_retry = 1;
- /* fall through */
+ delay_retry = true;
+ fallthrough;
case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL:
if (pg_init_limit_reached(m, pgpath))
@@ -1555,7 +1660,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
if (pgpath)
fail_path(pgpath);
- if (atomic_read(&m->nr_valid_paths) == 0 &&
+ if (!atomic_read(&m->nr_valid_paths) &&
!must_push_back_rq(m)) {
if (error == BLK_STS_IOERR)
dm_report_EIO(m);
@@ -1568,7 +1673,8 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
struct path_selector *ps = &pgpath->pg->ps;
if (ps->type->end_io)
- ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
+ ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
+ clone->io_start_time_ns);
}
return r;
@@ -1589,46 +1695,49 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
if (pgpath)
fail_path(pgpath);
- if (atomic_read(&m->nr_valid_paths) == 0 &&
- !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- if (must_push_back_bio(m)) {
- r = DM_ENDIO_REQUEUE;
- } else {
- dm_report_EIO(m);
- *error = BLK_STS_IOERR;
+ if (!atomic_read(&m->nr_valid_paths)) {
+ spin_lock_irqsave(&m->lock, flags);
+ if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ if (__must_push_back(m)) {
+ r = DM_ENDIO_REQUEUE;
+ } else {
+ dm_report_EIO(m);
+ *error = BLK_STS_IOERR;
+ }
+ spin_unlock_irqrestore(&m->lock, flags);
+ goto done;
}
- goto done;
+ spin_unlock_irqrestore(&m->lock, flags);
}
- spin_lock_irqsave(&m->lock, flags);
- bio_list_add(&m->queued_bios, clone);
- spin_unlock_irqrestore(&m->lock, flags);
- if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
- queue_work(kmultipathd, &m->process_queued_bios);
-
+ multipath_queue_bio(m, clone);
r = DM_ENDIO_INCOMPLETE;
done:
if (pgpath) {
struct path_selector *ps = &pgpath->pg->ps;
if (ps->type->end_io)
- ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
+ ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
+ dm_start_time_ns_from_clone(clone));
}
return r;
}
/*
- * Suspend can't complete until all the I/O is processed so if
- * the last path fails we must error any remaining I/O.
- * Note that if the freeze_bdev fails while suspending, the
- * queue_if_no_path state is lost - userspace should reset it.
+ * Suspend with flush can't complete until all the I/O is processed
+ * so if the last path fails we must error any remaining I/O.
+ * - Note that if the freeze_bdev fails while suspending, the
+ * queue_if_no_path state is lost - userspace should reset it.
+ * Otherwise, during noflush suspend, queue_if_no_path will not change.
*/
static void multipath_presuspend(struct dm_target *ti)
{
struct multipath *m = ti->private;
- queue_if_no_path(m, false, true);
+ /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */
+ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti))
+ queue_if_no_path(m, false, true, __func__);
}
static void multipath_postsuspend(struct dm_target *ti)
@@ -1649,8 +1758,16 @@ static void multipath_resume(struct dm_target *ti)
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
- assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags,
- test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
+ if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) {
+ set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
+ clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
+ }
+
+ DMDEBUG("%s: %s finished; QIFNP = %d; SQIFNP = %d",
+ dm_table_device_name(m->ti->table), __func__,
+ test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
+ test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
+
spin_unlock_irqrestore(&m->lock, flags);
}
@@ -1799,6 +1916,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
struct dm_dev *dev;
struct multipath *m = ti->private;
action_fn action;
+ unsigned long flags;
mutex_lock(&m->work_mutex);
@@ -1809,10 +1927,14 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
- r = queue_if_no_path(m, true, false);
+ r = queue_if_no_path(m, true, false, __func__);
+ spin_lock_irqsave(&m->lock, flags);
+ enable_nopath_timeout(m);
+ spin_unlock_irqrestore(&m->lock, flags);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
- r = queue_if_no_path(m, false, false);
+ r = queue_if_no_path(m, false, false, __func__);
+ disable_nopath_timeout(m);
goto out;
}
}
@@ -1860,16 +1982,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
struct block_device **bdev)
{
struct multipath *m = ti->private;
- struct pgpath *current_pgpath;
+ struct pgpath *pgpath;
+ unsigned long flags;
int r;
- current_pgpath = READ_ONCE(m->current_pgpath);
- if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
- current_pgpath = choose_pgpath(m, 0);
+ pgpath = READ_ONCE(m->current_pgpath);
+ if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
+ pgpath = choose_pgpath(m, 0);
- if (current_pgpath) {
- if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
- *bdev = current_pgpath->path.dev->bdev;
+ if (pgpath) {
+ if (!mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) {
+ *bdev = pgpath->path.dev->bdev;
r = 0;
} else {
/* pg_init has not started or completed */
@@ -1877,10 +2000,11 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
}
} else {
/* No path is available */
+ r = -EIO;
+ spin_lock_irqsave(&m->lock, flags);
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
r = -ENOTCONN;
- else
- r = -EIO;
+ spin_unlock_irqrestore(&m->lock, flags);
}
if (r == -ENOTCONN) {
@@ -1888,8 +2012,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
/* Path status changed, redo selection */
(void) choose_pgpath(m, 0);
}
+ spin_lock_irqsave(&m->lock, flags);
if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
- pg_init_all_paths(m);
+ (void) __pg_init_all_paths(m);
+ spin_unlock_irqrestore(&m->lock, flags);
dm_table_run_md_queue_async(m->ti->table);
process_queued_io_list(m);
}
@@ -1949,8 +2075,15 @@ static int multipath_busy(struct dm_target *ti)
return true;
/* no paths available, for blk-mq: rely on IO mapping to delay requeue */
- if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
- return (m->queue_mode != DM_TYPE_REQUEST_BASED);
+ if (!atomic_read(&m->nr_valid_paths)) {
+ unsigned long flags;
+ spin_lock_irqsave(&m->lock, flags);
+ if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ spin_unlock_irqrestore(&m->lock, flags);
+ return (m->queue_mode != DM_TYPE_REQUEST_BASED);
+ }
+ spin_unlock_irqrestore(&m->lock, flags);
+ }
/* Guess which priority_group will be used at next mapping time */
pg = READ_ONCE(m->current_pg);
@@ -2001,7 +2134,7 @@ static int multipath_busy(struct dm_target *ti)
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 13, 0},
+ .version = {1, 14, 0},
.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
@@ -2075,6 +2208,10 @@ static void __exit dm_multipath_exit(void)
module_init(dm_multipath_init);
module_exit(dm_multipath_exit);
+module_param_named(queue_if_no_path_timeout_secs,
+ queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
+
MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index b6eb536..c47bc0e 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -74,7 +74,7 @@ struct path_selector_type {
int (*start_io) (struct path_selector *ps, struct dm_path *path,
size_t nr_bytes);
int (*end_io) (struct path_selector *ps, struct dm_path *path,
- size_t nr_bytes);
+ size_t nr_bytes, u64 start_time);
};
/* Register a path selector */
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 969c4f1..5fd018d 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -227,7 +227,7 @@ static int ql_start_io(struct path_selector *ps, struct dm_path *path,
}
static int ql_end_io(struct path_selector *ps, struct dm_path *path,
- size_t nr_bytes)
+ size_t nr_bytes, u64 start_time)
{
struct path_info *pi = path->pscontext;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 5e73cc6..f5083b4 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -129,7 +129,9 @@ struct raid_dev {
CTR_FLAG_RAID10_COPIES | \
CTR_FLAG_RAID10_FORMAT | \
CTR_FLAG_DELTA_DISKS | \
- CTR_FLAG_DATA_OFFSET)
+ CTR_FLAG_DATA_OFFSET | \
+ CTR_FLAG_JOURNAL_DEV | \
+ CTR_FLAG_JOURNAL_MODE)
/* Valid options definitions per raid level... */
@@ -209,6 +211,7 @@ struct raid_dev {
#define RT_FLAG_RS_SUSPENDED 5
#define RT_FLAG_RS_IN_SYNC 6
#define RT_FLAG_RS_RESYNCING 7
+#define RT_FLAG_RS_GROW 8
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -239,7 +242,9 @@ struct raid_set {
struct mddev md;
struct raid_type *raid_type;
- struct dm_target_callbacks callbacks;
+
+ sector_t array_sectors;
+ sector_t dev_sectors;
/* Optional raid4/5/6 journal device */
struct journal_dev {
@@ -248,7 +253,7 @@ struct raid_set {
int mode;
} journal_dev;
- struct raid_dev dev[0];
+ struct raid_dev dev[];
};
static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
@@ -616,7 +621,6 @@ static int raid10_format_to_md_layout(struct raid_set *rs,
} else if (algorithm == ALGORITHM_RAID10_FAR) {
f = copies;
- r = !RAID10_OFFSET;
if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
r |= RAID10_USE_FAR_SETS;
@@ -697,7 +701,7 @@ static void rs_set_capacity(struct raid_set *rs)
struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
set_capacity(gendisk, rs->md.array_sectors);
- revalidate_disk(gendisk);
+ revalidate_disk_size(gendisk, true);
}
/*
@@ -1615,13 +1619,12 @@ static int _check_data_dev_sectors(struct raid_set *rs)
}
/* Calculate the sectors per device and per array used for @rs */
-static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
+static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev)
{
int delta_disks;
unsigned int data_stripes;
+ sector_t array_sectors = sectors, dev_sectors = sectors;
struct mddev *mddev = &rs->md;
- struct md_rdev *rdev;
- sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len;
if (use_mddev) {
delta_disks = mddev->delta_disks;
@@ -1656,12 +1659,9 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
/* Striped layouts */
array_sectors = (data_stripes + delta_disks) * dev_sectors;
- rdev_for_each(rdev, mddev)
- if (!test_bit(Journal, &rdev->flags))
- rdev->sectors = dev_sectors;
-
mddev->array_sectors = array_sectors;
mddev->dev_sectors = dev_sectors;
+ rs_set_rdev_sectors(rs);
return _check_data_dev_sectors(rs);
bad:
@@ -1670,7 +1670,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
}
/* Setup recovery on @rs */
-static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
+static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
{
/* raid0 does not recover */
if (rs_is_raid0(rs))
@@ -1691,22 +1691,6 @@ static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
? MaxSector : dev_sectors;
}
-/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */
-static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
-{
- if (!dev_sectors)
- /* New raid set or 'sync' flag provided */
- __rs_setup_recovery(rs, 0);
- else if (dev_sectors == MaxSector)
- /* Prevent recovery */
- __rs_setup_recovery(rs, MaxSector);
- else if (__rdev_sectors(rs) < dev_sectors)
- /* Grown raid set */
- __rs_setup_recovery(rs, __rdev_sectors(rs));
- else
- __rs_setup_recovery(rs, MaxSector);
-}
-
static void do_table_event(struct work_struct *ws)
{
struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
@@ -1720,13 +1704,6 @@ static void do_table_event(struct work_struct *ws)
dm_table_event(rs->ti->table);
}
-static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
-{
- struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
-
- return mddev_congested(&rs->md, bits);
-}
-
/*
* Make sure a valid takover (level switch) is being requested on @rs
*
@@ -2366,8 +2343,6 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
if (new_devs == rs->raid_disks || !rebuilds) {
/* Replace a broken device */
- if (new_devs == 1 && !rs->delta_disks)
- ;
if (new_devs == rs->raid_disks) {
DMINFO("Superblocks created for new raid set");
set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
@@ -2480,7 +2455,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
return -EINVAL;
}
- /* Enable bitmap creation for RAID levels != 0 */
+ /* Enable bitmap creation on @rs unless no metadevs or raid0 or journaled raid4/5/6 set. */
mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096);
mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
@@ -2917,7 +2892,7 @@ static int rs_setup_reshape(struct raid_set *rs)
/* Remove disk(s) */
} else if (rs->delta_disks < 0) {
- r = rs_set_dev_and_array_sectors(rs, true);
+ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, true);
mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */
/* Change layout and/or chunk size */
@@ -3028,7 +3003,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bool resize = false;
struct raid_type *rt;
unsigned int num_raid_params, num_raid_devs;
- sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors;
+ sector_t sb_array_sectors, rdev_sectors, reshape_sectors;
struct raid_set *rs = NULL;
const char *arg;
struct rs_layout rs_layout;
@@ -3038,7 +3013,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{ 1, 254, "Cannot understand number of raid devices parameters" }
};
- /* Must have <raid_type> */
arg = dm_shift_arg(&as);
if (!arg) {
ti->error = "No arguments";
@@ -3087,11 +3061,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
*
* Any existing superblock will overwrite the array and device sizes
*/
- r = rs_set_dev_and_array_sectors(rs, false);
+ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
if (r)
goto bad;
- calculated_dev_sectors = rs->md.dev_sectors;
+ /* Memorize just calculated, potentially larger sizes to grow the raid set in preresume */
+ rs->array_sectors = rs->md.array_sectors;
+ rs->dev_sectors = rs->md.dev_sectors;
/*
* Backup any new raid set level, layout, ...
@@ -3104,6 +3080,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
+ /* All in-core metadata now as of current superblocks after calling analyse_superblocks() */
+ sb_array_sectors = rs->md.array_sectors;
rdev_sectors = __rdev_sectors(rs);
if (!rdev_sectors) {
ti->error = "Invalid rdev size";
@@ -3113,8 +3091,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
reshape_sectors = _get_reshape_sectors(rs);
- if (calculated_dev_sectors != rdev_sectors)
- resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors);
+ if (rs->dev_sectors != rdev_sectors) {
+ resize = (rs->dev_sectors != rdev_sectors - reshape_sectors);
+ if (rs->dev_sectors > rdev_sectors - reshape_sectors)
+ set_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
+ }
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
@@ -3141,13 +3122,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
rs_set_new(rs);
} else if (rs_is_recovering(rs)) {
- /* Rebuild particular devices */
- if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
- set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
- rs_setup_recovery(rs, MaxSector);
- }
/* A recovering raid set may be resized */
- ; /* skip setup rs */
+ goto size_check;
} else if (rs_is_reshaping(rs)) {
/* Have to reject size change request during reshape */
if (resize) {
@@ -3191,6 +3167,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs_setup_recovery(rs, MaxSector);
rs_set_new(rs);
} else if (rs_reshape_requested(rs)) {
+ /* Only request grow on raid set size extensions, not on reshapes. */
+ clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
+
/*
* No need to check for 'ongoing' takeover here, because takeover
* is an instant operation as oposed to an ongoing reshape.
@@ -3221,13 +3200,31 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
rs_set_cur(rs);
} else {
+size_check:
/* May not set recovery when a device rebuild is requested */
if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
- rs_setup_recovery(rs, MaxSector);
+ clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
- } else
- rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ?
- 0 : (resize ? calculated_dev_sectors : MaxSector));
+ rs_setup_recovery(rs, MaxSector);
+ } else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) {
+ /*
+ * Set raid set to current size, i.e. size as of
+ * superblocks to grow to larger size in preresume.
+ */
+ r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false);
+ if (r)
+ goto bad;
+
+ rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors);
+ } else {
+ /* This is no size change or it is shrinking, update size and record in superblocks */
+ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
+ if (r)
+ goto bad;
+
+ if (sb_array_sectors > rs->array_sectors)
+ set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+ }
rs_set_cur(rs);
}
@@ -3263,9 +3260,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_md_start;
}
- rs->callbacks.congested_fn = raid_is_congested;
- dm_table_add_target_callbacks(ti->table, &rs->callbacks);
-
/* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
@@ -3325,7 +3319,6 @@ static void raid_dtr(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
- list_del_init(&rs->callbacks.list);
md_stop(&rs->md);
raid_set_free(rs);
}
@@ -3428,10 +3421,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev)
/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */
static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
- sector_t resync_max_sectors)
+ enum sync_state state, sector_t resync_max_sectors)
{
sector_t r;
- enum sync_state state;
struct mddev *mddev = &rs->md;
clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
@@ -3442,8 +3434,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
} else {
- state = decipher_sync_action(mddev, recovery);
-
if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
r = mddev->recovery_cp;
else
@@ -3461,18 +3451,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
/*
* In case we are recovering, the array is not in sync
* and health chars should show the recovering legs.
+ *
+ * Already retrieved recovery offset from curr_resync_completed above.
*/
;
- else if (state == st_resync)
+
+ else if (state == st_resync || state == st_reshape)
/*
- * If "resync" is occurring, the raid set
- * is or may be out of sync hence the health
- * characters shall be 'a'.
- */
- set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
- else if (state == st_reshape)
- /*
- * If "reshape" is occurring, the raid set
+ * If "resync/reshape" is occurring, the raid set
* is or may be out of sync hence the health
* characters shall be 'a'.
*/
@@ -3486,22 +3472,22 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
*/
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
- else {
- struct md_rdev *rdev;
-
+ else if (test_bit(MD_RECOVERY_NEEDED, &recovery))
/*
* We are idle and recovery is needed, prevent 'A' chars race
* caused by components still set to in-sync by constructor.
*/
- if (test_bit(MD_RECOVERY_NEEDED, &recovery))
- set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+ set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+ else {
/*
- * The raid set may be doing an initial sync, or it may
- * be rebuilding individual components. If all the
- * devices are In_sync, then it is the raid set that is
- * being initialized.
+ * We are idle and the raid set may be doing an initial
+ * sync, or it may be rebuilding individual components.
+ * If all the devices are In_sync, then it is the raid set
+ * that is being initialized.
*/
+ struct md_rdev *rdev;
+
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
rdev_for_each(rdev, mddev)
if (!test_bit(Journal, &rdev->flags) &&
@@ -3531,10 +3517,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
unsigned long recovery;
unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
unsigned int sz = 0;
- unsigned int rebuild_disks;
- unsigned int write_mostly_params = 0;
+ unsigned int rebuild_writemostly_count = 0;
sector_t progress, resync_max_sectors, resync_mismatches;
- const char *sync_action;
+ enum sync_state state;
struct raid_type *rt;
switch (type) {
@@ -3548,14 +3533,14 @@ static void raid_status(struct dm_target *ti, status_type_t type,
/* Access most recent mddev properties for status output */
smp_rmb();
- recovery = rs->md.recovery;
/* Get sensible max sectors even if raid set not yet started */
resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ?
mddev->resync_max_sectors : mddev->dev_sectors;
- progress = rs_get_progress(rs, recovery, resync_max_sectors);
+ recovery = rs->md.recovery;
+ state = decipher_sync_action(mddev, recovery);
+ progress = rs_get_progress(rs, recovery, state, resync_max_sectors);
resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
atomic64_read(&mddev->resync_mismatches) : 0;
- sync_action = sync_str(decipher_sync_action(&rs->md, recovery));
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
for (i = 0; i < rs->raid_disks; i++)
@@ -3583,7 +3568,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* See Documentation/admin-guide/device-mapper/dm-raid.rst for
* information on each of these states.
*/
- DMEMIT(" %s", sync_action);
+ DMEMIT(" %s", sync_str(state));
/*
* v1.5.0+:
@@ -3616,18 +3601,20 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_TABLE:
/* Report the table line string you would use to construct this raid set */
- /* Calculate raid parameter count */
- for (i = 0; i < rs->raid_disks; i++)
- if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
- write_mostly_params += 2;
- rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks));
- raid_param_cnt += rebuild_disks * 2 +
- write_mostly_params +
+ /*
+ * Count any rebuild or writemostly argument pairs and subtract the
+ * hweight count being added below of any rebuild and writemostly ctr flags.
+ */
+ for (i = 0; i < rs->raid_disks; i++) {
+ rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) +
+ (test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0);
+ }
+ rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) +
+ (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0);
+ /* Calculate raid parameter count based on ^ rebuild/writemostly argument counts and ctr flags set. */
+ raid_param_cnt += rebuild_writemostly_count +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
- hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
- (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
- (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
-
+ hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
/* Emit table line */
/* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
@@ -3635,11 +3622,10 @@ static void raid_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
- if (rebuild_disks)
+ if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
- if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
- DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
- rs->dev[i].rdev.raid_disk);
+ if (test_bit(i, (void *) rs->rebuild_disks))
+ DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i);
if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
mddev->bitmap_info.daemon_sleep);
@@ -3649,7 +3635,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
mddev->sync_speed_max);
- if (write_mostly_params)
+ if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
@@ -3977,11 +3963,22 @@ static int raid_preresume(struct dm_target *ti)
if (r)
return r;
- /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */
- if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
- mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) {
- r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors,
- to_bytes(rs->requested_bitmap_chunk_sectors), 0);
+ /* We are extending the raid set size, adjust mddev/md_rdev sizes and set capacity. */
+ if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) {
+ mddev->array_sectors = rs->array_sectors;
+ mddev->dev_sectors = rs->dev_sectors;
+ rs_set_rdev_sectors(rs);
+ rs_set_capacity(rs);
+ }
+
+ /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) or grown device size */
+ if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
+ (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) ||
+ (rs->requested_bitmap_chunk_sectors &&
+ mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
+ int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
+
+ r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0);
if (r)
DMERR("Failed to resize bitmap");
}
@@ -3990,8 +3987,10 @@ static int raid_preresume(struct dm_target *ti)
/* Be prepared for mddev_resume() in raid_resume() */
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
mddev->resync_min = mddev->recovery_cp;
+ if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags))
+ mddev->resync_max_sectors = mddev->dev_sectors;
}
/* Check for any reshape request unless new raid set */
@@ -4039,7 +4038,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 14, 0},
+ .version = {1, 15, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 089aed5..fa09bc4 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -83,7 +83,7 @@ struct mirror_set {
struct work_struct trigger_event;
unsigned nr_mirrors;
- struct mirror mirror[0];
+ struct mirror mirror[];
};
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle,
@@ -779,7 +779,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
wakeup_mirrord(ms);
} else {
map_bio(get_default_mirror(ms), bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
}
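Throughout this update, generic_make_request() is renamed to submit_bio_noacct() following the v5.9 block layer rename; the resubmission semantics for bio-based targets are unchanged. A minimal sketch of the remap-and-resubmit pattern under the new name (illustrative only, helper name made up, not part of the patch):

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Illustrative only: remap a bio to another device and resubmit it. */
static void example_remap_and_resubmit(struct bio *bio, struct block_device *bdev,
				       sector_t sector_offset)
{
	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector += sector_offset;
	submit_bio_noacct(bio);	/* was generic_make_request() before the rename */
}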
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 6bc6192..b1e867f 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -143,10 +143,6 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md)
{
- /* nudge anyone waiting on suspend queue */
- if (unlikely(wq_has_sleeper(&md->wait)))
- wake_up(&md->wait);
-
/*
* dm_put() must be at the end of this function. See the comment above
*/
@@ -179,7 +175,7 @@ static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long mse
void dm_mq_kick_requeue_list(struct mapped_device *md)
{
- __dm_mq_kick_requeue_list(dm_get_md_queue(md), 0);
+ __dm_mq_kick_requeue_list(md->queue, 0);
}
EXPORT_SYMBOL(dm_mq_kick_requeue_list);
@@ -285,7 +281,8 @@ static void dm_complete_request(struct request *rq, blk_status_t error)
struct dm_rq_target_io *tio = tio_from_request(rq);
tio->error = error;
- blk_mq_complete_request(rq);
+ if (likely(!blk_should_fake_timeout(rq->q)))
+ blk_mq_complete_request(rq);
}
/*
@@ -544,7 +541,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id;
- md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
+ md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md;
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index f006a90..9cfda66 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -309,7 +309,7 @@ static int st_start_io(struct path_selector *ps, struct dm_path *path,
}
static int st_end_io(struct path_selector *ps, struct dm_path *path,
- size_t nr_bytes)
+ size_t nr_bytes, u64 start_time)
{
struct path_info *pi = path->pscontext;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 963d377..8e329c3 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -252,7 +252,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int op,
/*
* Issue the synchronous I/O from a different thread
- * to avoid generic_make_request recursion.
+ * to avoid submit_bio_noacct recursion.
*/
INIT_WORK_ONSTACK(&req.work, do_metadata);
queue_work(ps->metadata_wq, &req.work);
@@ -284,16 +284,9 @@ static void skip_metadata(struct pstore *ps)
*/
static int area_io(struct pstore *ps, int op, int op_flags)
{
- int r;
- chunk_t chunk;
+ chunk_t chunk = area_location(ps, ps->current_area);
- chunk = area_location(ps, ps->current_area);
-
- r = chunk_io(ps, ps->area, chunk, op, op_flags, 0);
- if (r)
- return r;
-
- return 0;
+ return chunk_io(ps, ps->area, chunk, op, op_flags, 0);
}
static void zero_memory_area(struct pstore *ps)
@@ -613,7 +606,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
chunk_t old, chunk_t new),
void *callback_context)
{
- int r, uninitialized_var(new_snapshot);
+ int r, new_snapshot;
struct pstore *ps = get_info(store);
/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index e902aae..41735a2 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1066,7 +1066,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
DMERR("Read error in exception store: "
"shutting down merge");
down_write(&s->lock);
- s->merge_failed = 1;
+ s->merge_failed = true;
up_write(&s->lock);
}
goto shut;
@@ -1170,7 +1170,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
shut:
down_write(&s->lock);
- s->merge_failed = 1;
+ s->merge_failed = true;
b = __release_queued_bios_after_merge(s);
up_write(&s->lock);
error_bios(b);
@@ -1335,7 +1335,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
s->state_bits = 0;
- s->merge_failed = 0;
+ s->merge_failed = false;
s->first_merging_chunk = 0;
s->num_merging_chunks = 0;
bio_list_init(&s->bios_queued_during_merge);
@@ -1593,7 +1593,7 @@ static void flush_bios(struct bio *bio)
while (bio) {
n = bio->bi_next;
bio->bi_next = NULL;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = n;
}
}
@@ -1613,7 +1613,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
bio->bi_next = NULL;
r = do_origin(s->origin, bio, false);
if (r == DM_MAPIO_REMAPPED)
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = n;
}
}
@@ -1854,7 +1854,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
bio->bi_end_io = full_bio_end_io;
bio->bi_private = callback_data;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
static struct dm_snap_pending_exception *
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 7141704..35d368c 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -56,7 +56,7 @@ struct dm_stat {
size_t percpu_alloc_size;
size_t histogram_alloc_size;
struct dm_stat_percpu *stat_percpu[NR_CPUS];
- struct dm_stat_shared stat_shared[0];
+ struct dm_stat_shared stat_shared[];
};
#define STAT_PRECISE_TIMESTAMPS 1
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 8547d75..151d022 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -41,7 +41,7 @@ struct stripe_c {
/* Work struct used for triggering events*/
struct work_struct trigger_event;
- struct stripe stripe[0];
+ struct stripe stripe[];
};
/*
@@ -55,19 +55,6 @@ static void trigger_event(struct work_struct *work)
dm_table_event(sc->ti->table);
}
-static inline struct stripe_c *alloc_context(unsigned int stripes)
-{
- size_t len;
-
- if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
- stripes))
- return NULL;
-
- len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
-
- return kmalloc(len, GFP_KERNEL);
-}
-
/*
* Parse a single <dev> <sector> pair
*/
@@ -142,7 +129,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -EINVAL;
}
- sc = alloc_context(stripes);
+ sc = kmalloc(struct_size(sc, stripe, stripes), GFP_KERNEL);
if (!sc) {
ti->error = "Memory allocation for striped context "
"failed";
@@ -373,10 +360,32 @@ static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
+static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ int ret;
+ sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+ struct stripe_c *sc = ti->private;
+ struct dax_device *dax_dev;
+ struct block_device *bdev;
+ uint32_t stripe;
+
+ stripe_map_sector(sc, sector, &stripe, &dev_sector);
+ dev_sector += sc->stripe[stripe].physical_start;
+ dax_dev = sc->stripe[stripe].dev->dax_dev;
+ bdev = sc->stripe[stripe].dev->bdev;
+
+ ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
+ if (ret)
+ return ret;
+ return dax_zero_page_range(dax_dev, pgoff, nr_pages);
+}
+
#else
#define stripe_dax_direct_access NULL
#define stripe_dax_copy_from_iter NULL
#define stripe_dax_copy_to_iter NULL
+#define stripe_dax_zero_page_range NULL
#endif
/*
@@ -499,6 +508,7 @@ static struct target_type stripe_target = {
.direct_access = stripe_dax_direct_access,
.dax_copy_from_iter = stripe_dax_copy_from_iter,
.dax_copy_to_iter = stripe_dax_copy_to_iter,
+ .dax_zero_page_range = stripe_dax_zero_page_range,
};
int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 8a0f057..bff4c7f 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -53,7 +53,7 @@ struct switch_ctx {
/*
* Array of dm devices to switch between.
*/
- struct switch_path path_list[0];
+ struct switch_path path_list[];
};
static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 06b3823..5c59089 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -24,50 +24,10 @@
#define DM_MSG_PREFIX "table"
-#define MAX_DEPTH 16
#define NODE_SIZE L1_CACHE_BYTES
#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
-struct dm_table {
- struct mapped_device *md;
- enum dm_queue_mode type;
-
- /* btree table */
- unsigned int depth;
- unsigned int counts[MAX_DEPTH]; /* in nodes */
- sector_t *index[MAX_DEPTH];
-
- unsigned int num_targets;
- unsigned int num_allocated;
- sector_t *highs;
- struct dm_target *targets;
-
- struct target_type *immutable_target_type;
-
- bool integrity_supported:1;
- bool singleton:1;
- unsigned integrity_added:1;
-
- /*
- * Indicates the rw permissions for the new logical
- * device. This should be a combination of FMODE_READ
- * and FMODE_WRITE.
- */
- fmode_t mode;
-
- /* a list of devices used by this table */
- struct list_head devices;
-
- /* events get handed up using this callback */
- void (*event_fn)(void *);
- void *event_context;
-
- struct dm_md_mempools *mempools;
-
- struct list_head target_callbacks;
-};
-
/*
* Similar to ceiling(log_size(n))
*/
@@ -190,7 +150,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
return -ENOMEM;
INIT_LIST_HEAD(&t->devices);
- INIT_LIST_HEAD(&t->target_callbacks);
if (!num_targets)
num_targets = KEYS_PER_NODE;
@@ -279,7 +238,6 @@ static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
- struct request_queue *q;
struct queue_limits *limits = data;
struct block_device *bdev = dev->bdev;
sector_t dev_size =
@@ -288,22 +246,6 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
limits->logical_block_size >> SECTOR_SHIFT;
char b[BDEVNAME_SIZE];
- /*
- * Some devices exist without request functions,
- * such as loop devices not yet bound to backing files.
- * Forbid the use of such devices.
- */
- q = bdev_get_queue(bdev);
- if (!q || !q->make_request_fn) {
- DMWARN("%s: %s is not yet initialised: "
- "start=%llu, len=%llu, dev_size=%llu",
- dm_device_name(ti->table->md), bdevname(bdev, b),
- (unsigned long long)start,
- (unsigned long long)len,
- (unsigned long long)dev_size);
- return 1;
- }
-
if (!dev_size)
return 0;
@@ -378,7 +320,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
* This upgrades the mode on an already open dm_dev, being
* careful to leave things as they were if we fail to reopen the
* device and not to touch the existing bdev field in case
- * it is accessed concurrently inside dm_table_any_congested().
+ * it is accessed concurrently.
*/
static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
struct mapped_device *md)
@@ -487,7 +429,8 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
return 0;
}
- if (bdev_stack_limits(limits, bdev, start) < 0)
+ if (blk_stack_limits(limits, &q->limits,
+ get_start_sect(bdev) + start) < 0)
DMWARN("%s: adding target device %s caused an alignment inconsistency: "
"physical_block_size=%u, logical_block_size=%u, "
"alignment_offset=%u, start=%llu",
@@ -496,9 +439,6 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
q->limits.logical_block_size,
q->limits.alignment_offset,
(unsigned long long) start << SECTOR_SHIFT);
-
- limits->zoned = blk_queue_zoned_model(q);
-
return 0;
}
@@ -668,7 +608,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
*/
unsigned short remaining = 0;
- struct dm_target *uninitialized_var(ti);
+ struct dm_target *ti;
struct queue_limits ti_limits;
unsigned i;
@@ -872,8 +812,7 @@ EXPORT_SYMBOL(dm_consume_args);
static bool __table_type_bio_based(enum dm_queue_mode table_type)
{
return (table_type == DM_TYPE_BIO_BASED ||
- table_type == DM_TYPE_DAX_BIO_BASED ||
- table_type == DM_TYPE_NVME_BIO_BASED);
+ table_type == DM_TYPE_DAX_BIO_BASED);
}
static bool __table_type_request_based(enum dm_queue_mode table_type)
@@ -929,8 +868,6 @@ bool dm_table_supports_dax(struct dm_table *t,
return true;
}
-static bool dm_table_does_not_support_partial_completion(struct dm_table *t);
-
static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -938,7 +875,7 @@ static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
struct request_queue *q = bdev_get_queue(bdev);
/* request-based cannot stack on partitions! */
- if (bdev != bdev->bd_contains)
+ if (bdev_is_partition(bdev))
return false;
return queue_is_mq(q);
@@ -960,7 +897,6 @@ static int dm_table_determine_type(struct dm_table *t)
goto verify_bio_based;
}
BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
- BUG_ON(t->type == DM_TYPE_NVME_BIO_BASED);
goto verify_rq_based;
}
@@ -999,15 +935,6 @@ static int dm_table_determine_type(struct dm_table *t)
if (dm_table_supports_dax(t, device_not_dax_capable, &page_size) ||
(list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
t->type = DM_TYPE_DAX_BIO_BASED;
- } else {
- /* Check if upgrading to NVMe bio-based is valid or required */
- tgt = dm_table_get_immutable_target(t);
- if (tgt && !tgt->max_io_len && dm_table_does_not_support_partial_completion(t)) {
- t->type = DM_TYPE_NVME_BIO_BASED;
- goto verify_rq_based; /* must be stacked directly on NVMe (blk-mq) */
- } else if (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED) {
- t->type = DM_TYPE_NVME_BIO_BASED;
- }
}
return 0;
}
@@ -1024,8 +951,7 @@ static int dm_table_determine_type(struct dm_table *t)
* (e.g. request completion process for partial completion.)
*/
if (t->num_targets > 1) {
- DMERR("%s DM doesn't support multiple targets",
- t->type == DM_TYPE_NVME_BIO_BASED ? "nvme bio-based" : "request-based");
+ DMERR("request-based DM doesn't support multiple targets");
return -EINVAL;
}
@@ -1461,6 +1387,13 @@ static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
return !q || blk_queue_zoned_model(q) != *zoned_model;
}
+/*
+ * Check the device zoned model based on the target feature flag. If the target
+ * has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are
+ * also accepted but all devices must have the same zoned model. If the target
+ * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any
+ * zoned model with all zoned devices having the same zone size.
+ */
static bool dm_table_supports_zoned_model(struct dm_table *t,
enum blk_zoned_model zoned_model)
{
@@ -1470,13 +1403,15 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
- if (zoned_model == BLK_ZONED_HM &&
- !dm_target_supports_zoned_hm(ti->type))
- return false;
-
- if (!ti->type->iterate_devices ||
- ti->type->iterate_devices(ti, device_not_zoned_model, &zoned_model))
- return false;
+ if (dm_target_supports_zoned_hm(ti->type)) {
+ if (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti, device_not_zoned_model,
+ &zoned_model))
+ return false;
+ } else if (!dm_target_supports_mixed_zoned_model(ti->type)) {
+ if (zoned_model == BLK_ZONED_HM)
+ return false;
+ }
}
return true;
@@ -1488,9 +1423,17 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *
struct request_queue *q = bdev_get_queue(dev->bdev);
unsigned int *zone_sectors = data;
+ if (!blk_queue_is_zoned(q))
+ return 0;
+
return !q || blk_queue_zone_sectors(q) != *zone_sectors;
}
+/*
+ * Check consistency of zoned model and zone sectors across all targets. For
+ * zone sectors, if the destination device is a zoned block device, it shall
+ * have the specified zone_sectors.
+ */
static int validate_hardware_zoned_model(struct dm_table *table,
enum blk_zoned_model zoned_model,
unsigned int zone_sectors)
@@ -1509,7 +1452,7 @@ static int validate_hardware_zoned_model(struct dm_table *table,
return -EINVAL;
if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) {
- DMERR("%s: zone sectors is not consistent across all devices",
+ DMERR("%s: zone sectors is not consistent across all zoned devices",
dm_device_name(table->md));
return -EINVAL;
}
@@ -1578,22 +1521,6 @@ int dm_calculate_queue_limits(struct dm_table *table,
dm_device_name(table->md),
(unsigned long long) ti->begin,
(unsigned long long) ti->len);
-
- /*
- * FIXME: this should likely be moved to blk_stack_limits(), would
- * also eliminate limits->zoned stacking hack in dm_set_device_limits()
- */
- if (limits->zoned == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
- /*
- * By default, the stacked limits zoned model is set to
- * BLK_ZONED_NONE in blk_set_stacking_limits(). Update
- * this model using the first target model reported
- * that is not BLK_ZONED_NONE. This will be either the
- * first target device zoned model or the model reported
- * by the target .io_hints.
- */
- limits->zoned = ti_limits.zoned;
- }
}
/*
@@ -1714,20 +1641,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
return q && !blk_queue_add_random(q);
}
-static int device_is_partial_completion(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
-{
- char b[BDEVNAME_SIZE];
-
- /* For now, NVMe devices are the only devices of this class */
- return (strncmp(bdevname(dev->bdev, b), "nvme", 4) != 0);
-}
-
-static bool dm_table_does_not_support_partial_completion(struct dm_table *t)
-{
- return !dm_table_any_dev_attr(t, device_is_partial_completion, NULL);
-}
-
static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1782,6 +1695,33 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t)
return true;
}
+static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !blk_queue_nowait(q);
+}
+
+static bool dm_table_supports_nowait(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!dm_target_supports_nowait(ti->type))
+ return false;
+
+ if (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti, device_not_nowait_capable, NULL))
+ return false;
+ }
+
+ return true;
+}
+
static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1849,7 +1789,7 @@ static int device_requires_stable_pages(struct dm_target *ti,
{
struct request_queue *q = bdev_get_queue(dev->bdev);
- return q && bdi_cap_stable_pages_required(q->backing_dev_info);
+ return q && blk_queue_stable_writes(q);
}
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -1863,6 +1803,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
*/
q->limits = *limits;
+ if (dm_table_supports_nowait(t))
+ blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
+
if (!dm_table_supports_discards(t)) {
blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
/* Must also clear discard limits... */
@@ -1916,9 +1861,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
* don't want error, zero, etc to require stable pages.
*/
if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL))
- q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
else
- q->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
/*
* Determine whether or not this queue's I/O timings contribute
@@ -1933,15 +1878,16 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
/*
* For a zoned target, the number of zones should be updated for the
* correct value to be exposed in sysfs queue/nr_zones. For a BIO based
- * target, this is all that is needed. For a request based target, the
- * queue zone bitmaps must also be updated.
- * Use blk_revalidate_disk_zones() to handle this.
+ * target, this is all that is needed.
*/
- if (blk_queue_is_zoned(q))
- blk_revalidate_disk_zones(t->md->disk);
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (blk_queue_is_zoned(q)) {
+ WARN_ON_ONCE(queue_is_mq(q));
+ q->nr_zones = blkdev_nr_zones(t->md->disk);
+ }
+#endif
- /* Allow reads to exceed readahead limits */
- q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9);
+ blk_queue_update_readahead(q);
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -2046,38 +1992,6 @@ int dm_table_resume_targets(struct dm_table *t)
return 0;
}
-void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
-{
- list_add(&cb->list, &t->target_callbacks);
-}
-EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
-
-int dm_table_any_congested(struct dm_table *t, int bdi_bits)
-{
- struct dm_dev_internal *dd;
- struct list_head *devices = dm_table_get_devices(t);
- struct dm_target_callbacks *cb;
- int r = 0;
-
- list_for_each_entry(dd, devices, list) {
- struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
- char b[BDEVNAME_SIZE];
-
- if (likely(q))
- r |= bdi_congested(q->backing_dev_info, bdi_bits);
- else
- DMWARN_LIMIT("%s: any_congested: nonexistent device %s",
- dm_device_name(t->md),
- bdevname(dd->dm_dev->bdev, b));
- }
-
- list_for_each_entry(cb, &t->target_callbacks, list)
- if (cb->congested_fn)
- r |= cb->congested_fn(cb, bdi_bits);
-
- return r;
-}
-
struct mapped_device *dm_table_get_md(struct dm_table *t)
{
return t->md;
@@ -2092,16 +2006,11 @@ EXPORT_SYMBOL_GPL(dm_table_device_name);
void dm_table_run_md_queue_async(struct dm_table *t)
{
- struct mapped_device *md;
- struct request_queue *queue;
-
if (!dm_table_request_based(t))
return;
- md = dm_table_get_md(t);
- queue = dm_get_md_queue(md);
- if (queue)
- blk_mq_run_hw_queues(queue, true);
+ if (t->md->queue)
+ blk_mq_run_hw_queues(t->md->queue, true);
}
EXPORT_SYMBOL(dm_table_run_md_queue_async);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index a5ed59e..6ebb212 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -28,7 +28,7 @@
*
* - A hierarchical btree, with 2 levels which effectively maps (thin
* dev id, virtual block) -> block_time. Block time is a 64-bit
- * field holding the time in the low 24 bits, and block in the top 48
+ * field holding the time in the low 24 bits, and block in the top 40
* bits.
*
* BTrees consist solely of btree_nodes, that fill a block. Some are
@@ -814,7 +814,7 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
return r;
if (td->open_count)
- td->changed = 0;
+ td->changed = false;
else {
list_del(&td->list);
kfree(td);
@@ -1051,12 +1051,11 @@ static int __create_thin(struct dm_pool_metadata *pmd,
int r;
dm_block_t dev_root;
uint64_t key = dev;
- struct disk_device_details details_le;
struct dm_thin_device *td;
__le64 value;
r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
- &key, &details_le);
+ &key, NULL);
if (!r)
return -EEXIST;
@@ -1112,7 +1111,7 @@ static int __set_snapshot_details(struct dm_pool_metadata *pmd,
if (r)
return r;
- td->changed = 1;
+ td->changed = true;
td->snapshotted_time = time;
snap->mapped_blocks = td->mapped_blocks;
@@ -1129,12 +1128,11 @@ static int __create_snap(struct dm_pool_metadata *pmd,
dm_block_t origin_root;
uint64_t key = origin, dev_key = dev;
struct dm_thin_device *td;
- struct disk_device_details details_le;
__le64 value;
/* check this device is unused */
r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
- &dev_key, &details_le);
+ &dev_key, NULL);
if (!r)
return -EEXIST;
@@ -1624,7 +1622,7 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
if (r)
return r;
- td->changed = 1;
+ td->changed = true;
if (inserted)
td->mapped_blocks++;
@@ -1655,7 +1653,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
return r;
td->mapped_blocks--;
- td->changed = 1;
+ td->changed = true;
return 0;
}
@@ -1709,7 +1707,7 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_
}
td->mapped_blocks -= total_count;
- td->changed = 1;
+ td->changed = true;
/*
* Reinsert the mapping tree.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 1b2c98b..fff4c50 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -282,6 +282,8 @@ struct pool {
struct dm_bio_prison_cell **cell_sort_array;
mempool_t mapping_pool;
+
+ struct bio flush_bio;
};
static void metadata_operation_failed(struct pool *pool, const char *op, int r);
@@ -324,12 +326,10 @@ struct pool_c {
struct pool *pool;
struct dm_dev *data_dev;
struct dm_dev *metadata_dev;
- struct dm_target_callbacks callbacks;
dm_block_t low_water_blocks;
struct pool_features requested_pf; /* Features requested during table load */
struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
- struct bio flush_bio;
};
/*
@@ -611,13 +611,12 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
blk_status_t error)
{
struct bio_list bios;
- unsigned long flags;
bio_list_init(&bios);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
__merge_bio_list(&bios, master);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
error_bio_list(&bios, error);
}
@@ -625,15 +624,14 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
static void requeue_deferred_cells(struct thin_c *tc)
{
struct pool *pool = tc->pool;
- unsigned long flags;
struct list_head cells;
struct dm_bio_prison_cell *cell, *tmp;
INIT_LIST_HEAD(&cells);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
list_splice_init(&tc->deferred_cells, &cells);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
list_for_each_entry_safe(cell, tmp, &cells, user_list)
cell_requeue(pool, cell);
@@ -642,14 +640,13 @@ static void requeue_deferred_cells(struct thin_c *tc)
static void requeue_io(struct thin_c *tc)
{
struct bio_list bios;
- unsigned long flags;
bio_list_init(&bios);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
__merge_bio_list(&bios, &tc->deferred_bio_list);
__merge_bio_list(&bios, &tc->retry_on_resume_list);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
error_bio_list(&bios, BLK_STS_DM_REQUEUE);
requeue_deferred_cells(tc);
@@ -758,10 +755,9 @@ static void inc_all_io_entry(struct pool *pool, struct bio *bio)
static void issue(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
- unsigned long flags;
if (!bio_triggers_commit(tc, bio)) {
- generic_make_request(bio);
+ submit_bio_noacct(bio);
return;
}
@@ -779,9 +775,9 @@ static void issue(struct thin_c *tc, struct bio *bio)
* Batch together any bios that trigger commits and then issue a
* single commit for them in process_deferred_bios().
*/
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
bio_list_add(&pool->deferred_flush_bios, bio);
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
}
static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
@@ -888,12 +884,15 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
{
struct pool *pool = tc->pool;
unsigned long flags;
+ int has_work;
spin_lock_irqsave(&tc->lock, flags);
cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
+ has_work = !bio_list_empty(&tc->deferred_bio_list);
spin_unlock_irqrestore(&tc->lock, flags);
- wake_worker(pool);
+ if (has_work)
+ wake_worker(pool);
}
static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
@@ -962,7 +961,6 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
- unsigned long flags;
/*
* If the bio has the REQ_FUA flag set we must commit the metadata
@@ -987,9 +985,9 @@ static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
* Batch together any bios that trigger commits and then issue a
* single commit for them in process_deferred_bios().
*/
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
bio_list_add(&pool->deferred_flush_completions, bio);
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
}
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
@@ -1228,14 +1226,13 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
static void process_prepared(struct pool *pool, struct list_head *head,
process_mapping_fn *fn)
{
- unsigned long flags;
struct list_head maps;
struct dm_thin_new_mapping *m, *tmp;
INIT_LIST_HEAD(&maps);
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
list_splice_init(head, &maps);
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
list_for_each_entry_safe(m, tmp, &maps, list)
(*fn)(m);
@@ -1512,14 +1509,12 @@ static int commit(struct pool *pool)
static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
- unsigned long flags;
-
if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
DMWARN("%s: reached low water mark for data device: sending event.",
dm_device_name(pool->pool_md));
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
pool->low_water_triggered = true;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
dm_table_event(pool->ti->table);
}
}
@@ -1595,11 +1590,10 @@ static void retry_on_resume(struct bio *bio)
{
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct thin_c *tc = h->tc;
- unsigned long flags;
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
bio_list_add(&tc->retry_on_resume_list, bio);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
}
static blk_status_t should_error_unserviceable_bio(struct pool *pool)
@@ -2172,7 +2166,6 @@ static void __sort_thin_deferred_bios(struct thin_c *tc)
static void process_thin_deferred_bios(struct thin_c *tc)
{
struct pool *pool = tc->pool;
- unsigned long flags;
struct bio *bio;
struct bio_list bios;
struct blk_plug plug;
@@ -2186,10 +2179,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
bio_list_init(&bios);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
if (bio_list_empty(&tc->deferred_bio_list)) {
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
return;
}
@@ -2198,7 +2191,7 @@ static void process_thin_deferred_bios(struct thin_c *tc)
bio_list_merge(&bios, &tc->deferred_bio_list);
bio_list_init(&tc->deferred_bio_list);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bios))) {
@@ -2208,10 +2201,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
* prepared mappings to process.
*/
if (ensure_next_mapping(pool)) {
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
bio_list_add(&tc->deferred_bio_list, bio);
bio_list_merge(&tc->deferred_bio_list, &bios);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
break;
}
@@ -2266,16 +2259,15 @@ static unsigned sort_cells(struct pool *pool, struct list_head *cells)
static void process_thin_deferred_cells(struct thin_c *tc)
{
struct pool *pool = tc->pool;
- unsigned long flags;
struct list_head cells;
struct dm_bio_prison_cell *cell;
unsigned i, j, count;
INIT_LIST_HEAD(&cells);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
list_splice_init(&tc->deferred_cells, &cells);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
if (list_empty(&cells))
return;
@@ -2296,9 +2288,9 @@ static void process_thin_deferred_cells(struct thin_c *tc)
for (j = i; j < count; j++)
list_add(&pool->cell_sort_array[j]->user_list, &cells);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
list_splice(&cells, &tc->deferred_cells);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
return;
}
@@ -2351,7 +2343,6 @@ static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
static void process_deferred_bios(struct pool *pool)
{
- unsigned long flags;
struct bio *bio;
struct bio_list bios, bio_completions;
struct thin_c *tc;
@@ -2370,13 +2361,13 @@ static void process_deferred_bios(struct pool *pool)
bio_list_init(&bios);
bio_list_init(&bio_completions);
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
bio_list_merge(&bios, &pool->deferred_flush_bios);
bio_list_init(&pool->deferred_flush_bios);
bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
bio_list_init(&pool->deferred_flush_completions);
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
@@ -2402,7 +2393,7 @@ static void process_deferred_bios(struct pool *pool)
if (bio->bi_opf & REQ_PREFLUSH)
bio_endio(bio);
else
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
@@ -2667,12 +2658,11 @@ static void metadata_operation_failed(struct pool *pool, const char *op, int r)
*/
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
- unsigned long flags;
struct pool *pool = tc->pool;
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
bio_list_add(&tc->deferred_bio_list, bio);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
wake_worker(pool);
}
@@ -2688,13 +2678,12 @@ static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
- unsigned long flags;
struct pool *pool = tc->pool;
throttle_lock(&pool->throttle);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
list_add_tail(&cell->user_list, &tc->deferred_cells);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
throttle_unlock(&pool->throttle);
wake_worker(pool);
@@ -2806,29 +2795,16 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
}
}
-static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
-{
- struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
- struct request_queue *q;
-
- if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
- return 1;
-
- q = bdev_get_queue(pt->data_dev->bdev);
- return bdi_congested(q->backing_dev_info, bdi_bits);
-}
-
static void requeue_bios(struct pool *pool)
{
- unsigned long flags;
struct thin_c *tc;
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list) {
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
bio_list_init(&tc->retry_on_resume_list);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
}
rcu_read_unlock();
}
@@ -2937,6 +2913,7 @@ static void __pool_destroy(struct pool *pool)
if (pool->next_mapping)
mempool_free(pool->next_mapping, &pool->mapping_pool);
mempool_exit(&pool->mapping_pool);
+ bio_uninit(&pool->flush_bio);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool);
@@ -3017,6 +2994,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->low_water_triggered = false;
pool->suspended = true;
pool->out_of_data_space = false;
+ bio_init(&pool->flush_bio, NULL, 0);
pool->shared_read_ds = dm_deferred_set_create();
if (!pool->shared_read_ds) {
@@ -3144,7 +3122,6 @@ static void pool_dtr(struct dm_target *ti)
__pool_dec(pt->pool);
dm_put_device(ti, pt->metadata_dev);
dm_put_device(ti, pt->data_dev);
- bio_uninit(&pt->flush_bio);
kfree(pt);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3223,11 +3200,11 @@ static void metadata_low_callback(void *context)
*/
static int metadata_pre_commit_callback(void *context)
{
- struct pool_c *pt = context;
- struct bio *flush_bio = &pt->flush_bio;
+ struct pool *pool = context;
+ struct bio *flush_bio = &pool->flush_bio;
bio_reset(flush_bio);
- bio_set_dev(flush_bio, pt->data_dev->bdev);
+ bio_set_dev(flush_bio, pool->data_dev);
flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
return submit_bio_wait(flush_bio);
@@ -3401,7 +3378,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
pt->adjusted_pf = pt->requested_pf = pf;
- bio_init(&pt->flush_bio, NULL, 0);
ti->num_flush_bios = 1;
/*
@@ -3428,8 +3404,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto out_flags_changed;
- pt->callbacks.congested_fn = pool_is_congested;
- dm_table_add_target_callbacks(ti->table, &pt->callbacks);
+ dm_pool_register_pre_commit_callback(pool->pmd,
+ metadata_pre_commit_callback, pool);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3454,15 +3430,14 @@ static int pool_map(struct dm_target *ti, struct bio *bio)
int r;
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- unsigned long flags;
/*
* As this is a singleton target, ti->begin is always zero.
*/
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
bio_set_dev(bio, pt->data_dev->bdev);
r = DM_MAPIO_REMAPPED;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
return r;
}
@@ -3591,9 +3566,6 @@ static int pool_preresume(struct dm_target *ti)
if (r)
return r;
- dm_pool_register_pre_commit_callback(pool->pmd,
- metadata_pre_commit_callback, pt);
-
r = maybe_resize_data_dev(ti, &need_commit1);
if (r)
return r;
@@ -3636,7 +3608,6 @@ static void pool_resume(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- unsigned long flags;
/*
* Must requeue active_thins' bios and then resume
@@ -3645,10 +3616,10 @@ static void pool_resume(struct dm_target *ti)
requeue_bios(pool);
pool_resume_active_thins(pool);
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
pool->low_water_triggered = false;
pool->suspended = false;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
do_waker(&pool->waker.work);
}
@@ -3657,11 +3628,10 @@ static void pool_presuspend(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- unsigned long flags;
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
pool->suspended = true;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
pool_suspend_active_thins(pool);
}
@@ -3670,13 +3640,12 @@ static void pool_presuspend_undo(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- unsigned long flags;
pool_resume_active_thins(pool);
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
pool->suspended = false;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
}
static void pool_postsuspend(struct dm_target *ti)
@@ -4155,11 +4124,10 @@ static void thin_put(struct thin_c *tc)
static void thin_dtr(struct dm_target *ti)
{
struct thin_c *tc = ti->private;
- unsigned long flags;
- spin_lock_irqsave(&tc->pool->lock, flags);
+ spin_lock_irq(&tc->pool->lock);
list_del_rcu(&tc->list);
- spin_unlock_irqrestore(&tc->pool->lock, flags);
+ spin_unlock_irq(&tc->pool->lock);
synchronize_rcu();
thin_put(tc);
@@ -4195,7 +4163,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct thin_c *tc;
struct dm_dev *pool_dev, *origin_dev;
struct mapped_device *pool_md;
- unsigned long flags;
mutex_lock(&dm_thin_pool_table.mutex);
@@ -4289,9 +4256,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
mutex_unlock(&dm_thin_pool_table.mutex);
- spin_lock_irqsave(&tc->pool->lock, flags);
+ spin_lock_irq(&tc->pool->lock);
if (tc->pool->suspended) {
- spin_unlock_irqrestore(&tc->pool->lock, flags);
+ spin_unlock_irq(&tc->pool->lock);
mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
ti->error = "Unable to activate thin device while pool is suspended";
r = -EINVAL;
@@ -4300,7 +4267,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
refcount_set(&tc->refcount, 1);
init_completion(&tc->can_destroy);
list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
- spin_unlock_irqrestore(&tc->pool->lock, flags);
+ spin_unlock_irq(&tc->pool->lock);
/*
* This synchronize_rcu() call is needed here otherwise we risk a
* wake_worker() call finding no bios to process (because the newly
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 711f101..808a98e 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -30,6 +30,7 @@
#define DM_VERITY_OPT_LOGGING "ignore_corruption"
#define DM_VERITY_OPT_RESTART "restart_on_corruption"
+#define DM_VERITY_OPT_PANIC "panic_on_corruption"
#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
@@ -254,6 +255,9 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
if (v->mode == DM_VERITY_MODE_RESTART)
kernel_restart("dm-verity device corrupted");
+ if (v->mode == DM_VERITY_MODE_PANIC)
+ panic("dm-verity device corrupted");
+
return 1;
}
@@ -621,8 +625,22 @@ static void verity_prefetch_io(struct work_struct *work)
static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
{
+ sector_t block = io->block;
+ unsigned int n_blocks = io->n_blocks;
struct dm_verity_prefetch_work *pw;
+ if (v->validated_blocks) {
+ while (n_blocks && test_bit(block, v->validated_blocks)) {
+ block++;
+ n_blocks--;
+ }
+ while (n_blocks && test_bit(block + n_blocks - 1,
+ v->validated_blocks))
+ n_blocks--;
+ if (!n_blocks)
+ return;
+ }
+
pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -631,8 +649,8 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
INIT_WORK(&pw->work, verity_prefetch_io);
pw->v = v;
- pw->block = io->block;
- pw->n_blocks = io->n_blocks;
+ pw->block = block;
+ pw->n_blocks = n_blocks;
queue_work(v->verify_wq, &pw->work);
}
@@ -677,7 +695,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
verity_submit_prefetch(v, io);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
return DM_MAPIO_SUBMITTED;
}
@@ -738,6 +756,9 @@ static void verity_status(struct dm_target *ti, status_type_t type,
case DM_VERITY_MODE_RESTART:
DMEMIT(DM_VERITY_OPT_RESTART);
break;
+ case DM_VERITY_MODE_PANIC:
+ DMEMIT(DM_VERITY_OPT_PANIC);
+ break;
default:
BUG();
}
@@ -903,6 +924,10 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
v->mode = DM_VERITY_MODE_RESTART;
continue;
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_PANIC)) {
+ v->mode = DM_VERITY_MODE_PANIC;
+ continue;
+
} else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) {
r = verity_alloc_zero_digest(v);
if (r) {
@@ -1217,7 +1242,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 5, 0},
+ .version = {1, 7, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h
index 19b1547..3987c71 100644
--- a/drivers/md/dm-verity-verify-sig.h
+++ b/drivers/md/dm-verity-verify-sig.h
@@ -34,25 +34,25 @@ void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts);
#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0
-int verity_verify_root_hash(const void *data, size_t data_len,
- const void *sig_data, size_t sig_len)
+static inline int verity_verify_root_hash(const void *data, size_t data_len,
+ const void *sig_data, size_t sig_len)
{
return 0;
}
-bool verity_verify_is_sig_opt_arg(const char *arg_name)
+static inline bool verity_verify_is_sig_opt_arg(const char *arg_name)
{
return false;
}
-int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
- struct dm_verity_sig_opts *sig_opts,
- unsigned int *argc, const char *arg_name)
+static inline int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
+ struct dm_verity *v, struct dm_verity_sig_opts *sig_opts,
+ unsigned int *argc, const char *arg_name)
{
return -EINVAL;
}
-void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
+static inline void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
{
}
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 641b9e3..4e769d1 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -20,7 +20,8 @@
enum verity_mode {
DM_VERITY_MODE_EIO,
DM_VERITY_MODE_LOGGING,
- DM_VERITY_MODE_RESTART
+ DM_VERITY_MODE_RESTART,
+ DM_VERITY_MODE_PANIC
};
enum verity_block_type {
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index ec10fda..9d6ae3e 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -26,6 +26,8 @@
#define AUTOCOMMIT_BLOCKS_SSD 65536
#define AUTOCOMMIT_BLOCKS_PMEM 64
#define AUTOCOMMIT_MSEC 1000
+#define MAX_AGE_DIV 16
+#define MAX_AGE_UNSPECIFIED -1UL
#define BITMAP_GRANULARITY 65536
#if BITMAP_GRANULARITY < PAGE_SIZE
@@ -47,7 +49,7 @@ do { \
#define pmem_assign(dest, src) ((dest) = (src))
#endif
-#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
+#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif
@@ -88,6 +90,7 @@ struct wc_entry {
:47
#endif
;
+ unsigned long age;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
uint64_t original_sector;
uint64_t seq_count;
@@ -119,6 +122,7 @@ struct dm_writecache {
size_t writeback_size;
size_t freelist_high_watermark;
size_t freelist_low_watermark;
+ unsigned long max_age;
unsigned uncommitted_blocks;
unsigned autocommit_blocks;
@@ -130,6 +134,8 @@ struct dm_writecache {
struct timer_list autocommit_timer;
struct wait_queue_head freelist_wait;
+ struct timer_list max_age_timer;
+
atomic_t bio_in_progress[2];
struct wait_queue_head bio_in_progress_wait[2];
@@ -160,12 +166,16 @@ struct dm_writecache {
bool max_writeback_jobs_set:1;
bool autocommit_blocks_set:1;
bool autocommit_time_set:1;
+ bool max_age_set:1;
bool writeback_fua_set:1;
bool flush_on_suspend:1;
+ bool cleaner:1;
+ bool cleaner_set:1;
unsigned high_wm_percent_value;
unsigned low_wm_percent_value;
unsigned autocommit_time_value;
+ unsigned max_age_value;
unsigned writeback_all;
struct workqueue_struct *writeback_wq;
@@ -234,10 +244,6 @@ static int persistent_memory_claim(struct dm_writecache *wc)
wc->memory_vmapped = false;
- if (!wc->ssd_dev->dax_dev) {
- r = -EOPNOTSUPP;
- goto err1;
- }
s = wc->memory_map_size;
p = s >> PAGE_SHIFT;
if (!p) {
@@ -518,10 +524,38 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
+static void ssd_commit_superblock(struct dm_writecache *wc)
+{
+ int r;
+ struct dm_io_region region;
+ struct dm_io_request req;
+
+ region.bdev = wc->ssd_dev->bdev;
+ region.sector = 0;
+ region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
+
+ if (unlikely(region.sector + region.count > wc->metadata_sectors))
+ region.count = wc->metadata_sectors - region.sector;
+
+ region.sector += wc->start_sector;
+
+ req.bi_op = REQ_OP_WRITE;
+ req.bi_op_flags = REQ_SYNC | REQ_FUA;
+ req.mem.type = DM_IO_VMA;
+ req.mem.ptr.vma = (char *)wc->memory_map;
+ req.client = wc->dm_io;
+ req.notify.fn = NULL;
+ req.notify.context = NULL;
+
+ r = dm_io(&req, 1, &region, NULL);
+ if (unlikely(r))
+ writecache_error(wc, r, "error writing superblock");
+}
+
static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
if (WC_MODE_PMEM(wc))
- wmb();
+ pmem_wmb();
else
ssd_commit_flushed(wc, wait_for_ios);
}
@@ -612,6 +646,7 @@ static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *i
rb_link_node(&ins->rb_node, parent, node);
rb_insert_color(&ins->rb_node, &wc->tree);
list_add(&ins->lru, &wc->lru);
+ ins->age = jiffies;
}
static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
@@ -647,7 +682,17 @@ static inline void writecache_verify_watermark(struct dm_writecache *wc)
queue_work(wc->writeback_wq, &wc->writeback_work);
}
-static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+static void writecache_max_age_timer(struct timer_list *t)
+{
+ struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
+
+ if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
+ queue_work(wc->writeback_wq, &wc->writeback_work);
+ mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
+ }
+}
+
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
struct wc_entry *e;
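dm-writecache gains a max_age timer that periodically queues the writeback work so dirty blocks are not held in the cache indefinitely, re-arming itself every max_age/MAX_AGE_DIV jiffies. A minimal sketch of the from_timer()/mod_timer() pattern with made-up names (not part of the patch):

#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

struct example_cache {
	struct timer_list age_timer;
	unsigned long max_age;			/* in jiffies */
	struct workqueue_struct *wq;
	struct work_struct writeback_work;
};

static void example_age_timer_fn(struct timer_list *t)
{
	/* from_timer() recovers the containing structure from the timer_list */
	struct example_cache *c = from_timer(c, t, age_timer);

	queue_work(c->wq, &c->writeback_work);
	mod_timer(&c->age_timer, jiffies + c->max_age / 16);
}

static void example_arm(struct example_cache *c)
{
	timer_setup(&c->age_timer, example_age_timer_fn, 0);
	mod_timer(&c->age_timer, jiffies + c->max_age / 16);
}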
@@ -656,6 +701,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(!wc->current_free))
return NULL;
e = wc->current_free;
+ if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+ return NULL;
next = rb_next(&e->rb_node);
rb_erase(&e->rb_node, &wc->freetree);
if (unlikely(!next))
@@ -665,6 +712,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(list_empty(&wc->freelist)))
return NULL;
e = container_of(wc->freelist.next, struct wc_entry, lru);
+ if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+ return NULL;
list_del(&e->lru);
}
wc->freelist_size--;
@@ -753,8 +802,10 @@ static void writecache_flush(struct dm_writecache *wc)
wc->seq_count++;
pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
- writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
- writecache_commit_flushed(wc, false);
+ if (WC_MODE_PMEM(wc))
+ writecache_commit_flushed(wc, false);
+ else
+ ssd_commit_superblock(wc);
wc->overwrote_committed = false;
@@ -817,8 +868,10 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
if (likely(!e->write_in_progress)) {
if (!discarded_something) {
- writecache_wait_for_ios(wc, READ);
- writecache_wait_for_ios(wc, WRITE);
+ if (!WC_MODE_PMEM(wc)) {
+ writecache_wait_for_ios(wc, READ);
+ writecache_wait_for_ios(wc, WRITE);
+ }
discarded_something = true;
}
if (!writecache_entry_is_committed(wc, e))
@@ -851,6 +904,7 @@ static void writecache_suspend(struct dm_target *ti)
bool flush_on_suspend;
del_timer_sync(&wc->autocommit_timer);
+ del_timer_sync(&wc->max_age_timer);
wc_lock(wc);
writecache_flush(wc);
@@ -949,7 +1003,8 @@ static void writecache_resume(struct dm_target *ti)
}
wc->freelist_size = 0;
- r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
+ r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
+ sizeof(uint64_t));
if (r) {
writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
sb_seq_count = cpu_to_le64(0);
@@ -965,7 +1020,8 @@ static void writecache_resume(struct dm_target *ti)
e->seq_count = -1;
continue;
}
- r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+ r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
+ sizeof(struct wc_memory_entry));
if (r) {
writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
(unsigned long)b, r);
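The memcpy_mcsafe() calls become copy_mc_to_kernel(), the machine-check-safe copy helper introduced for v5.10, which returns the number of bytes left uncopied (zero on success). A minimal usage sketch with made-up names (not part of the patch):

#include <linux/uaccess.h>
#include <linux/errno.h>

static int example_read_pmem(void *dst, const void *pmem_src, size_t len)
{
	/* copy_mc_to_kernel() returns the number of bytes it could not copy */
	unsigned long uncopied = copy_mc_to_kernel(dst, pmem_src, len);

	return uncopied ? -EIO : 0;
}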
@@ -1019,6 +1075,9 @@ static void writecache_resume(struct dm_target *ti)
writecache_verify_watermark(wc);
+ if (wc->max_age != MAX_AGE_UNSPECIFIED)
+ mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
+
wc_unlock(wc);
}
@@ -1067,6 +1126,28 @@ static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_w
return 0;
}
+static void activate_cleaner(struct dm_writecache *wc)
+{
+ wc->flush_on_suspend = true;
+ wc->cleaner = true;
+ wc->freelist_high_watermark = wc->n_blocks;
+ wc->freelist_low_watermark = wc->n_blocks;
+}
+
+static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+ if (argc != 1)
+ return -EINVAL;
+
+ wc_lock(wc);
+ activate_cleaner(wc);
+ if (!dm_suspended(wc->ti))
+ writecache_verify_watermark(wc);
+ wc_unlock(wc);
+
+ return 0;
+}
+
static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
char *result, unsigned maxlen)
{
@@ -1077,12 +1158,50 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
r = process_flush_mesg(argc, argv, wc);
else if (!strcasecmp(argv[0], "flush_on_suspend"))
r = process_flush_on_suspend_mesg(argc, argv, wc);
+ else if (!strcasecmp(argv[0], "cleaner"))
+ r = process_cleaner_mesg(argc, argv, wc);
else
DMERR("unrecognised message received: %s", argv[0]);
return r;
}
+static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
+{
+ /*
+ * clflushopt performs better with block size 1024, 2048, 4096
+ * non-temporal stores perform better with block size 512
+ *
+ * block size 512 1024 2048 4096
+ * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s
+ * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s
+ *
+ * We see that movnti performs better for 512-byte blocks, and
+ * clflushopt performs better for 1024-byte and larger blocks. So, we
+ * prefer clflushopt for sizes >= 768.
+ *
+ * NOTE: this happens to be the case now (with dm-writecache's single
+ * threaded model) but re-evaluate this once memcpy_flushcache() is
+ * enabled to use movdir64b which might invalidate this performance
+ * advantage seen with cache-allocating-writes plus flushing.
+ */
+#ifdef CONFIG_X86
+ if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
+ likely(boot_cpu_data.x86_clflush_size == 64) &&
+ likely(size >= 768)) {
+ do {
+ memcpy((void *)dest, (void *)source, 64);
+ clflushopt((void *)dest);
+ dest += 64;
+ source += 64;
+ size -= 64;
+ } while (size >= 64);
+ return;
+ }
+#endif
+ memcpy_flushcache(dest, source, size);
+}
+
static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
{
void *buf;
@@ -1100,7 +1219,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data
if (rw == READ) {
int r;
- r = memcpy_mcsafe(buf, data, size);
+ r = copy_mc_to_kernel(buf, data, size);
flush_dcache_page(bio_page(bio));
if (unlikely(r)) {
writecache_error(wc, r, "hardware memory error when reading data: %d", r);
@@ -1108,7 +1227,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data
}
} else {
flush_dcache_page(bio_page(bio));
- memcpy_flushcache(data, buf, size);
+ memcpy_flushcache_optimized(data, buf, size);
}
bvec_kunmap_irq(buf, &flags);
@@ -1146,7 +1265,7 @@ static int writecache_flush_thread(void *data)
bio_end_sector(bio));
wc_unlock(wc);
bio_set_dev(bio, wc->dev->bdev);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
} else {
writecache_flush(wc);
wc_unlock(wc);
@@ -1182,8 +1301,12 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
writecache_flush(wc);
if (writecache_has_error(wc))
goto unlock_error;
+ if (unlikely(wc->cleaner))
+ goto unlock_remap_origin;
goto unlock_submit;
} else {
+ if (dm_bio_get_target_bio_nr(bio))
+ goto unlock_remap_origin;
writecache_offload_bio(wc, bio);
goto unlock_return;
}
@@ -1240,19 +1363,40 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
}
} else {
do {
+ bool found_entry = false;
+ bool search_used = false;
if (writecache_has_error(wc))
goto unlock_error;
e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
if (e) {
- if (!writecache_entry_is_committed(wc, e))
- goto bio_copy;
- if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
- wc->overwrote_committed = true;
+ if (!writecache_entry_is_committed(wc, e)) {
+ search_used = true;
goto bio_copy;
}
+ if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
+ wc->overwrote_committed = true;
+ search_used = true;
+ goto bio_copy;
+ }
+ found_entry = true;
+ } else {
+ if (unlikely(wc->cleaner))
+ goto direct_write;
}
- e = writecache_pop_from_freelist(wc);
+ e = writecache_pop_from_freelist(wc, (sector_t)-1);
if (unlikely(!e)) {
+ if (!WC_MODE_PMEM(wc) && !found_entry) {
+direct_write:
+ e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+ if (e) {
+ sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+ BUG_ON(!next_boundary);
+ if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
+ dm_accept_partial_bio(bio, next_boundary);
+ }
+ }
+ goto unlock_remap_origin;
+ }
writecache_wait_on_freelist(wc);
continue;
}
@@ -1263,9 +1407,44 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
if (WC_MODE_PMEM(wc)) {
bio_copy_block(wc, bio, memory_data(wc, e));
} else {
- dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+ unsigned bio_size = wc->block_size;
+ sector_t start_cache_sec = cache_sector(wc, e);
+ sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
+
+ while (bio_size < bio->bi_iter.bi_size) {
+ if (!search_used) {
+ struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+ if (!f)
+ break;
+ write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+ (bio_size >> SECTOR_SHIFT), wc->seq_count);
+ writecache_insert_entry(wc, f);
+ wc->uncommitted_blocks++;
+ } else {
+ struct wc_entry *f;
+ struct rb_node *next = rb_next(&e->rb_node);
+ if (!next)
+ break;
+ f = container_of(next, struct wc_entry, rb_node);
+ if (f != e + 1)
+ break;
+ if (read_original_sector(wc, f) !=
+ read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
+ break;
+ if (unlikely(f->write_in_progress))
+ break;
+ if (writecache_entry_is_committed(wc, f))
+ wc->overwrote_committed = true;
+ e = f;
+ }
+ bio_size += wc->block_size;
+ current_cache_sec += wc->block_size >> SECTOR_SHIFT;
+ }
+
bio_set_dev(bio, wc->ssd_dev->bdev);
- bio->bi_iter.bi_sector = cache_sector(wc, e);
+ bio->bi_iter.bi_sector = start_cache_sec;
+ dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
wc->uncommitted_blocks = 0;
queue_work(wc->writeback_wq, &wc->flush_work);
@@ -1663,7 +1842,9 @@ static void writecache_writeback(struct work_struct *work)
wbl.size = 0;
while (!list_empty(&wc->lru) &&
(wc->writeback_all ||
- wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
+ wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
+ (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
+ wc->max_age - wc->max_age / MAX_AGE_DIV))) {
n_walked++;
if (unlikely(n_walked > WRITEBACK_LATENCY) &&
@@ -1928,9 +2109,11 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
wc->ti = ti;
mutex_init(&wc->lock);
+ wc->max_age = MAX_AGE_UNSPECIFIED;
writecache_poison_lists(wc);
init_waitqueue_head(&wc->freelist_wait);
timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
+ timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
for (i = 0; i < 2; i++) {
atomic_set(&wc->bio_in_progress[i], 0);
@@ -2114,6 +2297,19 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
wc->autocommit_time_value = autocommit_msecs;
wc->autocommit_time_set = true;
+ } else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
+ unsigned max_age_msecs;
+ string = dm_shift_arg(&as), opt_params--;
+ if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
+ goto invalid_optional;
+ if (max_age_msecs > 86400000)
+ goto invalid_optional;
+ wc->max_age = msecs_to_jiffies(max_age_msecs);
+ wc->max_age_set = true;
+ wc->max_age_value = max_age_msecs;
+ } else if (!strcasecmp(string, "cleaner")) {
+ wc->cleaner_set = true;
+ wc->cleaner = true;
} else if (!strcasecmp(string, "fua")) {
if (WC_MODE_PMEM(wc)) {
wc->writeback_fua = true;
@@ -2214,7 +2410,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
}
- r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+ r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
if (r) {
ti->error = "Hardware memory error when reading superblock";
goto bad;
@@ -2225,7 +2421,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->error = "Unable to initialize device";
goto bad;
}
- r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+ r = copy_mc_to_kernel(&s, sb(wc),
+ sizeof(struct wc_memory_superblock));
if (r) {
ti->error = "Hardware memory error when reading superblock";
goto bad;
@@ -2285,13 +2482,16 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
do_div(x, 100);
wc->freelist_low_watermark = x;
+ if (wc->cleaner)
+ activate_cleaner(wc);
+
r = writecache_alloc_entries(wc);
if (r) {
ti->error = "Cannot allocate memory";
goto bad;
}
- ti->num_flush_bios = 1;
+ ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
ti->flush_supported = true;
ti->num_discard_bios = 1;
@@ -2337,6 +2537,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
extra_args += 2;
if (wc->autocommit_time_set)
extra_args += 2;
+ if (wc->max_age_set)
+ extra_args += 2;
+ if (wc->cleaner_set)
+ extra_args++;
if (wc->writeback_fua_set)
extra_args++;
@@ -2353,6 +2557,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
if (wc->autocommit_time_set)
DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
+ if (wc->max_age_set)
+ DMEMIT(" max_age %u", wc->max_age_value);
+ if (wc->cleaner_set)
+ DMEMIT(" cleaner");
if (wc->writeback_fua_set)
DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
break;
@@ -2361,7 +2569,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
static struct target_type writecache_target = {
.name = "writecache",
- .version = {1, 1, 1},
+ .version = {1, 4, 0},
.module = THIS_MODULE,
.ctr = writecache_ctr,
.dtr = writecache_dtr,
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index e6b0039..5100907 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -16,7 +16,7 @@
/*
* Metadata version.
*/
-#define DMZ_META_VER 1
+#define DMZ_META_VER 2
/*
* On-disk super block magic.
@@ -69,8 +69,17 @@ struct dmz_super {
/* Checksum */
__le32 crc; /* 48 */
+ /* DM-Zoned label */
+ u8 dmz_label[32]; /* 80 */
+
+ /* DM-Zoned UUID */
+ u8 dmz_uuid[16]; /* 96 */
+
+ /* Device UUID */
+ u8 dev_uuid[16]; /* 112 */
+
/* Padding to full 512B sector */
- u8 reserved[464]; /* 512 */
+ u8 reserved[400]; /* 512 */
};
/*
@@ -122,8 +131,10 @@ enum {
*/
struct dmz_sb {
sector_t block;
+ struct dmz_dev *dev;
struct dmz_mblock *mblk;
struct dmz_super *sb;
+ struct dm_zone *zone;
};
/*
@@ -131,28 +142,41 @@ struct dmz_sb {
*/
struct dmz_metadata {
struct dmz_dev *dev;
+ unsigned int nr_devs;
+
+ char devname[BDEVNAME_SIZE];
+ char label[BDEVNAME_SIZE];
+ uuid_t uuid;
sector_t zone_bitmap_size;
unsigned int zone_nr_bitmap_blocks;
unsigned int zone_bits_per_mblk;
+ sector_t zone_nr_blocks;
+ sector_t zone_nr_blocks_shift;
+
+ sector_t zone_nr_sectors;
+ sector_t zone_nr_sectors_shift;
+
unsigned int nr_bitmap_blocks;
unsigned int nr_map_blocks;
+ unsigned int nr_zones;
unsigned int nr_useable_zones;
unsigned int nr_meta_blocks;
unsigned int nr_meta_zones;
unsigned int nr_data_zones;
+ unsigned int nr_cache_zones;
unsigned int nr_rnd_zones;
unsigned int nr_reserved_seq;
unsigned int nr_chunks;
/* Zone information array */
- struct dm_zone *zones;
+ struct xarray zones;
- struct dm_zone *sb_zone;
struct dmz_sb sb[2];
unsigned int mblk_primary;
+ unsigned int sb_version;
u64 sb_gen;
unsigned int min_nr_mblks;
unsigned int max_nr_mblks;
@@ -168,15 +192,11 @@ struct dmz_metadata {
/* Zone allocation management */
struct mutex map_lock;
struct dmz_mblock **map_mblk;
- unsigned int nr_rnd;
- atomic_t unmap_nr_rnd;
- struct list_head unmap_rnd_list;
- struct list_head map_rnd_list;
- unsigned int nr_seq;
- atomic_t unmap_nr_seq;
- struct list_head unmap_seq_list;
- struct list_head map_seq_list;
+ unsigned int nr_cache;
+ atomic_t unmap_nr_cache;
+ struct list_head unmap_cache_list;
+ struct list_head map_cache_list;
atomic_t nr_reserved_seq_zones;
struct list_head reserved_seq_zones_list;
@@ -184,22 +204,65 @@ struct dmz_metadata {
wait_queue_head_t free_wq;
};
+#define dmz_zmd_info(zmd, format, args...) \
+ DMINFO("(%s): " format, (zmd)->label, ## args)
+
+#define dmz_zmd_err(zmd, format, args...) \
+ DMERR("(%s): " format, (zmd)->label, ## args)
+
+#define dmz_zmd_warn(zmd, format, args...) \
+ DMWARN("(%s): " format, (zmd)->label, ## args)
+
+#define dmz_zmd_debug(zmd, format, args...) \
+ DMDEBUG("(%s): " format, (zmd)->label, ## args)
/*
* Various accessors
*/
-unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
+static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone)
{
- return ((unsigned int)(zone - zmd->zones));
+ if (WARN_ON(!zone))
+ return 0;
+
+ return zone->id - zone->dev->zone_offset;
}
sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
{
- return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift;
+ unsigned int zone_id = dmz_dev_zone_id(zmd, zone);
+
+ return (sector_t)zone_id << zmd->zone_nr_sectors_shift;
}
sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
{
- return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift;
+ unsigned int zone_id = dmz_dev_zone_id(zmd, zone);
+
+ return (sector_t)zone_id << zmd->zone_nr_blocks_shift;
+}
+
+unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd)
+{
+ return zmd->zone_nr_blocks;
+}
+
+unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd)
+{
+ return zmd->zone_nr_blocks_shift;
+}
+
+unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd)
+{
+ return zmd->zone_nr_sectors;
+}
+
+unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd)
+{
+ return zmd->zone_nr_sectors_shift;
+}
+
+unsigned int dmz_nr_zones(struct dmz_metadata *zmd)
+{
+ return zmd->nr_zones;
}
unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
@@ -207,14 +270,88 @@ unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
return zmd->nr_chunks;
}
-unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd)
+unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx)
{
- return zmd->nr_rnd;
+ return zmd->dev[idx].nr_rnd;
}
-unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd)
+unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx)
{
- return atomic_read(&zmd->unmap_nr_rnd);
+ return atomic_read(&zmd->dev[idx].unmap_nr_rnd);
+}
+
+unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd)
+{
+ return zmd->nr_cache;
+}
+
+unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd)
+{
+ return atomic_read(&zmd->unmap_nr_cache);
+}
+
+unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx)
+{
+ return zmd->dev[idx].nr_seq;
+}
+
+unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx)
+{
+ return atomic_read(&zmd->dev[idx].unmap_nr_seq);
+}
+
+static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
+{
+ return xa_load(&zmd->zones, zone_id);
+}
+
+static struct dm_zone *dmz_insert(struct dmz_metadata *zmd,
+ unsigned int zone_id, struct dmz_dev *dev)
+{
+ struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL);
+
+ if (!zone)
+ return ERR_PTR(-ENOMEM);
+
+ if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) {
+ kfree(zone);
+ return ERR_PTR(-EBUSY);
+ }
+
+ INIT_LIST_HEAD(&zone->link);
+ atomic_set(&zone->refcount, 0);
+ zone->id = zone_id;
+ zone->chunk = DMZ_MAP_UNMAPPED;
+ zone->dev = dev;
+
+ return zone;
+}
+
+const char *dmz_metadata_label(struct dmz_metadata *zmd)
+{
+ return (const char *)zmd->label;
+}
+
+bool dmz_check_dev(struct dmz_metadata *zmd)
+{
+ unsigned int i;
+
+ for (i = 0; i < zmd->nr_devs; i++) {
+ if (!dmz_check_bdev(&zmd->dev[i]))
+ return false;
+ }
+ return true;
+}
+
+bool dmz_dev_is_dying(struct dmz_metadata *zmd)
+{
+ unsigned int i;
+
+ for (i = 0; i < zmd->nr_devs; i++) {
+ if (dmz_bdev_is_dying(&zmd->dev[i]))
+ return true;
+ }
+ return false;
}
/*
@@ -402,9 +539,10 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
{
struct dmz_mblock *mblk, *m;
sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
+ struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
struct bio *bio;
- if (dmz_bdev_is_dying(zmd->dev))
+ if (dmz_bdev_is_dying(dev))
return ERR_PTR(-EIO);
/* Get a new block and a BIO to read it */
@@ -440,7 +578,7 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
/* Submit read BIO */
bio->bi_iter.bi_sector = dmz_blk2sect(block);
- bio_set_dev(bio, zmd->dev->bdev);
+ bio_set_dev(bio, dev->bdev);
bio->bi_private = mblk;
bio->bi_end_io = dmz_mblock_bio_end_io;
bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
@@ -537,6 +675,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
sector_t mblk_no)
{
struct dmz_mblock *mblk;
+ struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
/* Check rbtree */
spin_lock(&zmd->mblk_lock);
@@ -555,7 +694,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
dmz_release_mblock(zmd, mblk);
- dmz_check_bdev(zmd->dev);
+ dmz_check_bdev(dev);
return ERR_PTR(-EIO);
}
@@ -579,10 +718,11 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
unsigned int set)
{
+ struct dmz_dev *dev = zmd->sb[set].dev;
sector_t block = zmd->sb[set].block + mblk->no;
struct bio *bio;
- if (dmz_bdev_is_dying(zmd->dev))
+ if (dmz_bdev_is_dying(dev))
return -EIO;
bio = bio_alloc(GFP_NOIO, 1);
@@ -594,7 +734,7 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
set_bit(DMZ_META_WRITING, &mblk->state);
bio->bi_iter.bi_sector = dmz_blk2sect(block);
- bio_set_dev(bio, zmd->dev->bdev);
+ bio_set_dev(bio, dev->bdev);
bio->bi_private = mblk;
bio->bi_end_io = dmz_mblock_bio_end_io;
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
@@ -607,13 +747,16 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
/*
* Read/write a metadata block.
*/
-static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
- struct page *page)
+static int dmz_rdwr_block(struct dmz_dev *dev, int op,
+ sector_t block, struct page *page)
{
struct bio *bio;
int ret;
- if (dmz_bdev_is_dying(zmd->dev))
+ if (WARN_ON(!dev))
+ return -EIO;
+
+ if (dmz_bdev_is_dying(dev))
return -EIO;
bio = bio_alloc(GFP_NOIO, 1);
@@ -621,14 +764,14 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
return -ENOMEM;
bio->bi_iter.bi_sector = dmz_blk2sect(block);
- bio_set_dev(bio, zmd->dev->bdev);
+ bio_set_dev(bio, dev->bdev);
bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
ret = submit_bio_wait(bio);
bio_put(bio);
if (ret)
- dmz_check_bdev(zmd->dev);
+ dmz_check_bdev(dev);
return ret;
}
@@ -637,18 +780,32 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
*/
static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
{
- sector_t block = zmd->sb[set].block;
struct dmz_mblock *mblk = zmd->sb[set].mblk;
struct dmz_super *sb = zmd->sb[set].sb;
+ struct dmz_dev *dev = zmd->sb[set].dev;
+ sector_t sb_block;
u64 sb_gen = zmd->sb_gen + 1;
int ret;
sb->magic = cpu_to_le32(DMZ_MAGIC);
- sb->version = cpu_to_le32(DMZ_META_VER);
+
+ sb->version = cpu_to_le32(zmd->sb_version);
+ if (zmd->sb_version > 1) {
+ BUILD_BUG_ON(UUID_SIZE != 16);
+ export_uuid(sb->dmz_uuid, &zmd->uuid);
+ memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE);
+ export_uuid(sb->dev_uuid, &dev->uuid);
+ }
sb->gen = cpu_to_le64(sb_gen);
- sb->sb_block = cpu_to_le64(block);
+ /*
+ * The metadata always references the absolute block address,
+ * i.e. relative to the entire block range, not the per-device
+ * block address.
+ */
+ sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift;
+ sb->sb_block = cpu_to_le64(sb_block);
sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);
@@ -659,9 +816,10 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
sb->crc = 0;
sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));
- ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
+ ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
+ mblk->page);
if (ret == 0)
- ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+ ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
return ret;
}
@@ -674,6 +832,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
unsigned int set)
{
struct dmz_mblock *mblk;
+ struct dmz_dev *dev = zmd->sb[set].dev;
struct blk_plug plug;
int ret = 0, nr_mblks_submitted = 0;
@@ -695,7 +854,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
clear_bit(DMZ_META_ERROR, &mblk->state);
- dmz_check_bdev(zmd->dev);
+ dmz_check_bdev(dev);
ret = -EIO;
}
nr_mblks_submitted--;
@@ -703,7 +862,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
/* Flush drive cache (this will also sync data) */
if (ret == 0)
- ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+ ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
return ret;
}
@@ -740,6 +899,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
{
struct dmz_mblock *mblk;
struct list_head write_list;
+ struct dmz_dev *dev;
int ret;
if (WARN_ON(!zmd))
@@ -753,6 +913,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
* from modifying metadata.
*/
down_write(&zmd->mblk_sem);
+ dev = zmd->sb[zmd->mblk_primary].dev;
/*
* This is called from the target flush work and reclaim work.
@@ -760,7 +921,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
dmz_lock_flush(zmd);
- if (dmz_bdev_is_dying(zmd->dev)) {
+ if (dmz_bdev_is_dying(dev)) {
ret = -EIO;
goto out;
}
@@ -772,7 +933,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
/* If there are no dirty metadata blocks, just flush the device cache */
if (list_empty(&write_list)) {
- ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+ ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
goto err;
}
@@ -821,7 +982,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
list_splice(&write_list, &zmd->mblk_dirty_list);
spin_unlock(&zmd->mblk_lock);
}
- if (!dmz_check_bdev(zmd->dev))
+ if (!dmz_check_bdev(dev))
ret = -EIO;
goto out;
}
@@ -829,12 +990,31 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
/*
* Check super block.
*/
-static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
+static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb,
+ bool tertiary)
{
+ struct dmz_super *sb = dsb->sb;
+ struct dmz_dev *dev = dsb->dev;
unsigned int nr_meta_zones, nr_data_zones;
- struct dmz_dev *dev = zmd->dev;
u32 crc, stored_crc;
- u64 gen;
+ u64 gen, sb_block;
+
+ if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
+ dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
+ DMZ_MAGIC, le32_to_cpu(sb->magic));
+ return -ENXIO;
+ }
+
+ zmd->sb_version = le32_to_cpu(sb->version);
+ if (zmd->sb_version > DMZ_META_VER) {
+ dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
+ DMZ_META_VER, zmd->sb_version);
+ return -EINVAL;
+ }
+ if (zmd->sb_version < 2 && tertiary) {
+ dmz_dev_err(dev, "Tertiary superblocks are not supported");
+ return -EINVAL;
+ }
gen = le64_to_cpu(sb->gen);
stored_crc = le32_to_cpu(sb->crc);
@@ -846,22 +1026,60 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
return -ENXIO;
}
- if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
- dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
- DMZ_MAGIC, le32_to_cpu(sb->magic));
- return -ENXIO;
+ sb_block = le64_to_cpu(sb->sb_block);
+ if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift ) {
+ dmz_dev_err(dev, "Invalid superblock position "
+ "(is %llu expected %llu)",
+ sb_block,
+ (u64)dsb->zone->id << zmd->zone_nr_blocks_shift);
+ return -EINVAL;
+ }
+ if (zmd->sb_version > 1) {
+ uuid_t sb_uuid;
+
+ import_uuid(&sb_uuid, sb->dmz_uuid);
+ if (uuid_is_null(&sb_uuid)) {
+ dmz_dev_err(dev, "NULL DM-Zoned uuid");
+ return -ENXIO;
+ } else if (uuid_is_null(&zmd->uuid)) {
+ uuid_copy(&zmd->uuid, &sb_uuid);
+ } else if (!uuid_equal(&zmd->uuid, &sb_uuid)) {
+ dmz_dev_err(dev, "mismatching DM-Zoned uuid, "
+ "is %pUl expected %pUl",
+ &sb_uuid, &zmd->uuid);
+ return -ENXIO;
+ }
+ if (!strlen(zmd->label))
+ memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE);
+ else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) {
+ dmz_dev_err(dev, "mismatching DM-Zoned label, "
+ "is %s expected %s",
+ sb->dmz_label, zmd->label);
+ return -ENXIO;
+ }
+ import_uuid(&dev->uuid, sb->dev_uuid);
+ if (uuid_is_null(&dev->uuid)) {
+ dmz_dev_err(dev, "NULL device uuid");
+ return -ENXIO;
+ }
+
+ if (tertiary) {
+ /*
+ * Generation number should be 0, but it doesn't
+ * really matter if it isn't.
+ */
+ if (gen != 0)
+ dmz_dev_warn(dev, "Invalid generation %llu",
+ gen);
+ return 0;
+ }
}
- if (le32_to_cpu(sb->version) != DMZ_META_VER) {
- dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
- DMZ_META_VER, le32_to_cpu(sb->version));
- return -ENXIO;
- }
-
- nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1)
- >> dev->zone_nr_blocks_shift;
+ nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1)
+ >> zmd->zone_nr_blocks_shift;
if (!nr_meta_zones ||
- nr_meta_zones >= zmd->nr_rnd_zones) {
+ (zmd->nr_devs <= 1 && nr_meta_zones >= zmd->nr_rnd_zones) ||
+ (zmd->nr_devs > 1 && nr_meta_zones >= zmd->nr_cache_zones)) {
dmz_dev_err(dev, "Invalid number of metadata blocks");
return -ENXIO;
}
@@ -895,10 +1113,13 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
/*
* Read the first or second super block from disk.
*/
-static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set)
+static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
- return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block,
- zmd->sb[set].mblk->page);
+ dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu",
+ set, sb->dev->name, sb->block);
+
+ return dmz_rdwr_block(sb->dev, REQ_OP_READ,
+ sb->block, sb->mblk->page);
}
/*
@@ -908,8 +1129,9 @@ static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set)
*/
static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
{
- unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
+ unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
struct dmz_mblock *mblk;
+ unsigned int zone_id = zmd->sb[0].zone->id;
int i;
/* Allocate a block */
@@ -922,24 +1144,29 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
/* Bad first super block: search for the second one */
zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
- for (i = 0; i < zmd->nr_rnd_zones - 1; i++) {
- if (dmz_read_sb(zmd, 1) != 0)
+ zmd->sb[1].zone = dmz_get(zmd, zone_id + 1);
+ zmd->sb[1].dev = zmd->sb[0].dev;
+ for (i = 1; i < zmd->nr_rnd_zones; i++) {
+ if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0)
break;
if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
return 0;
zmd->sb[1].block += zone_nr_blocks;
+ zmd->sb[1].zone = dmz_get(zmd, zone_id + i);
}
dmz_free_mblock(zmd, mblk);
zmd->sb[1].mblk = NULL;
+ zmd->sb[1].zone = NULL;
+ zmd->sb[1].dev = NULL;
return -EIO;
}
/*
- * Read the first or second super block from disk.
+ * Read a super block from disk.
*/
-static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set)
+static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
struct dmz_mblock *mblk;
int ret;
@@ -949,14 +1176,14 @@ static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set)
if (!mblk)
return -ENOMEM;
- zmd->sb[set].mblk = mblk;
- zmd->sb[set].sb = mblk->data;
+ sb->mblk = mblk;
+ sb->sb = mblk->data;
/* Read super block */
- ret = dmz_read_sb(zmd, set);
+ ret = dmz_read_sb(zmd, sb, set);
if (ret) {
dmz_free_mblock(zmd, mblk);
- zmd->sb[set].mblk = NULL;
+ sb->mblk = NULL;
return ret;
}
@@ -972,14 +1199,13 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
struct page *page;
int i, ret;
- dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set);
+ dmz_dev_warn(zmd->sb[dst_set].dev,
+ "Metadata set %u invalid: recovering", dst_set);
if (dst_set == 0)
- zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
- else {
- zmd->sb[1].block = zmd->sb[0].block +
- (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
- }
+ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
+ else
+ zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
page = alloc_page(GFP_NOIO);
if (!page)
@@ -987,11 +1213,11 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
/* Copy metadata blocks */
for (i = 1; i < zmd->nr_meta_blocks; i++) {
- ret = dmz_rdwr_block(zmd, REQ_OP_READ,
+ ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ,
zmd->sb[src_set].block + i, page);
if (ret)
goto out;
- ret = dmz_rdwr_block(zmd, REQ_OP_WRITE,
+ ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE,
zmd->sb[dst_set].block + i, page);
if (ret)
goto out;
@@ -1023,53 +1249,73 @@ static int dmz_load_sb(struct dmz_metadata *zmd)
u64 sb_gen[2] = {0, 0};
int ret;
+ if (!zmd->sb[0].zone) {
+ dmz_zmd_err(zmd, "Primary super block zone not set");
+ return -ENXIO;
+ }
+
/* Read and check the primary super block */
- zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
- ret = dmz_get_sb(zmd, 0);
+ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
+ zmd->sb[0].dev = zmd->sb[0].zone->dev;
+ ret = dmz_get_sb(zmd, &zmd->sb[0], 0);
if (ret) {
- dmz_dev_err(zmd->dev, "Read primary super block failed");
+ dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed");
return ret;
}
- ret = dmz_check_sb(zmd, zmd->sb[0].sb);
+ ret = dmz_check_sb(zmd, &zmd->sb[0], false);
/* Read and check secondary super block */
if (ret == 0) {
sb_good[0] = true;
- zmd->sb[1].block = zmd->sb[0].block +
- (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
- ret = dmz_get_sb(zmd, 1);
+ if (!zmd->sb[1].zone) {
+ unsigned int zone_id =
+ zmd->sb[0].zone->id + zmd->nr_meta_zones;
+
+ zmd->sb[1].zone = dmz_get(zmd, zone_id);
+ }
+ zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
+ zmd->sb[1].dev = zmd->sb[0].dev;
+ ret = dmz_get_sb(zmd, &zmd->sb[1], 1);
} else
ret = dmz_lookup_secondary_sb(zmd);
if (ret) {
- dmz_dev_err(zmd->dev, "Read secondary super block failed");
+ dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed");
return ret;
}
- ret = dmz_check_sb(zmd, zmd->sb[1].sb);
+ ret = dmz_check_sb(zmd, &zmd->sb[1], false);
if (ret == 0)
sb_good[1] = true;
/* Use highest generation sb first */
if (!sb_good[0] && !sb_good[1]) {
- dmz_dev_err(zmd->dev, "No valid super block found");
+ dmz_zmd_err(zmd, "No valid super block found");
return -EIO;
}
if (sb_good[0])
sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
- else
+ else {
ret = dmz_recover_mblocks(zmd, 0);
+ if (ret) {
+ dmz_dev_err(zmd->sb[0].dev,
+ "Recovery of superblock 0 failed");
+ return -EIO;
+ }
+ }
if (sb_good[1])
sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
- else
+ else {
ret = dmz_recover_mblocks(zmd, 1);
- if (ret) {
- dmz_dev_err(zmd->dev, "Recovery failed");
- return -EIO;
+ if (ret) {
+ dmz_dev_err(zmd->sb[1].dev,
+ "Recovery of superblock 1 failed");
+ return -EIO;
+ }
}
if (sb_gen[0] >= sb_gen[1]) {
@@ -1080,60 +1326,141 @@ static int dmz_load_sb(struct dmz_metadata *zmd)
zmd->mblk_primary = 1;
}
- dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)",
+ dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev,
+ "Using super block %u (gen %llu)",
zmd->mblk_primary, zmd->sb_gen);
- return 0;
+ if (zmd->sb_version > 1) {
+ int i;
+ struct dmz_sb *sb;
+
+ sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL);
+ if (!sb)
+ return -ENOMEM;
+ for (i = 1; i < zmd->nr_devs; i++) {
+ sb->block = 0;
+ sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset);
+ sb->dev = &zmd->dev[i];
+ if (!dmz_is_meta(sb->zone)) {
+ dmz_dev_err(sb->dev,
+ "Tertiary super block zone %u not marked as metadata zone",
+ sb->zone->id);
+ ret = -EINVAL;
+ goto out_kfree;
+ }
+ ret = dmz_get_sb(zmd, sb, i + 1);
+ if (ret) {
+ dmz_dev_err(sb->dev,
+ "Read tertiary super block failed");
+ dmz_free_mblock(zmd, sb->mblk);
+ goto out_kfree;
+ }
+ ret = dmz_check_sb(zmd, sb, true);
+ dmz_free_mblock(zmd, sb->mblk);
+ if (ret == -EINVAL)
+ goto out_kfree;
+ }
+ out_kfree:
+ kfree(sb);
+ }
+ return ret;
}
/*
* Initialize a zone descriptor.
*/
-static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
- struct blk_zone *blkz)
+static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
{
- struct dmz_dev *dev = zmd->dev;
+ struct dmz_dev *dev = data;
+ struct dmz_metadata *zmd = dev->metadata;
+ int idx = num + dev->zone_offset;
+ struct dm_zone *zone;
- /* Ignore the eventual last runt (smaller) zone */
- if (blkz->len != dev->zone_nr_sectors) {
- if (blkz->start + blkz->len == dev->capacity)
+ zone = dmz_insert(zmd, idx, dev);
+ if (IS_ERR(zone))
+ return PTR_ERR(zone);
+
+ if (blkz->len != zmd->zone_nr_sectors) {
+ if (zmd->sb_version > 1) {
+ /* Ignore the eventual runt (smaller) zone */
+ set_bit(DMZ_OFFLINE, &zone->flags);
+ return 0;
+ } else if (blkz->start + blkz->len == dev->capacity)
return 0;
return -ENXIO;
}
- INIT_LIST_HEAD(&zone->link);
- atomic_set(&zone->refcount, 0);
- zone->chunk = DMZ_MAP_UNMAPPED;
-
- if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
- set_bit(DMZ_RND, &zone->flags);
- } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ ||
- blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
- set_bit(DMZ_SEQ, &zone->flags);
- } else
+ /*
+ * Devices that have zones with a capacity smaller than the zone size
+ * (e.g. NVMe zoned namespaces) are not supported.
+ */
+ if (blkz->capacity != blkz->len)
return -ENXIO;
- if (blkz->cond == BLK_ZONE_COND_OFFLINE)
- set_bit(DMZ_OFFLINE, &zone->flags);
- else if (blkz->cond == BLK_ZONE_COND_READONLY)
- set_bit(DMZ_READ_ONLY, &zone->flags);
+ switch (blkz->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ set_bit(DMZ_RND, &zone->flags);
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ set_bit(DMZ_SEQ, &zone->flags);
+ break;
+ default:
+ return -ENXIO;
+ }
if (dmz_is_rnd(zone))
zone->wp_block = 0;
else
zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
- if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) {
+ if (blkz->cond == BLK_ZONE_COND_OFFLINE)
+ set_bit(DMZ_OFFLINE, &zone->flags);
+ else if (blkz->cond == BLK_ZONE_COND_READONLY)
+ set_bit(DMZ_READ_ONLY, &zone->flags);
+ else {
zmd->nr_useable_zones++;
if (dmz_is_rnd(zone)) {
zmd->nr_rnd_zones++;
- if (!zmd->sb_zone) {
- /* Super block zone */
- zmd->sb_zone = zone;
+ if (zmd->nr_devs == 1 && !zmd->sb[0].zone) {
+ /* Primary super block zone */
+ zmd->sb[0].zone = zone;
}
}
+ if (zmd->nr_devs > 1 && num == 0) {
+ /*
+ * Tertiary superblock zones are always at the
+ * start of the zoned devices, so mark them
+ * as metadata zone.
+ */
+ set_bit(DMZ_META, &zone->flags);
+ }
}
+ return 0;
+}
+static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev)
+{
+ int idx;
+ sector_t zone_offset = 0;
+
+ for(idx = 0; idx < dev->nr_zones; idx++) {
+ struct dm_zone *zone;
+
+ zone = dmz_insert(zmd, idx, dev);
+ if (IS_ERR(zone))
+ return PTR_ERR(zone);
+ set_bit(DMZ_CACHE, &zone->flags);
+ zone->wp_block = 0;
+ zmd->nr_cache_zones++;
+ zmd->nr_useable_zones++;
+ if (dev->capacity - zone_offset < zmd->zone_nr_sectors) {
+ /* Disable runt zone */
+ set_bit(DMZ_OFFLINE, &zone->flags);
+ break;
+ }
+ zone_offset += zmd->zone_nr_sectors;
+ }
return 0;
}
@@ -1142,15 +1469,16 @@ static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
*/
static void dmz_drop_zones(struct dmz_metadata *zmd)
{
- kfree(zmd->zones);
- zmd->zones = NULL;
-}
+ int idx;
-/*
- * The size of a zone report in number of zones.
- * This results in 4096*64B=256KB report zones commands.
- */
-#define DMZ_REPORT_NR_ZONES 4096
+ for(idx = 0; idx < zmd->nr_zones; idx++) {
+ struct dm_zone *zone = xa_load(&zmd->zones, idx);
+
+ kfree(zone);
+ xa_erase(&zmd->zones, idx);
+ }
+ xa_destroy(&zmd->zones);
+}
/*
* Allocate and initialize zone descriptors using the zone
@@ -1158,76 +1486,111 @@ static void dmz_drop_zones(struct dmz_metadata *zmd)
*/
static int dmz_init_zones(struct dmz_metadata *zmd)
{
- struct dmz_dev *dev = zmd->dev;
- struct dm_zone *zone;
- struct blk_zone *blkz;
- unsigned int nr_blkz;
- sector_t sector = 0;
- int i, ret = 0;
+ int i, ret;
+ struct dmz_dev *zoned_dev = &zmd->dev[0];
/* Init */
- zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
+ zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors;
+ zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors);
+ zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors);
+ zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks);
+ zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3;
zmd->zone_nr_bitmap_blocks =
max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
- zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks,
+ zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks,
DMZ_BLOCK_SIZE_BITS);
/* Allocate zone array */
- zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
- if (!zmd->zones)
- return -ENOMEM;
+ zmd->nr_zones = 0;
+ for (i = 0; i < zmd->nr_devs; i++) {
+ struct dmz_dev *dev = &zmd->dev[i];
- dmz_dev_info(dev, "Using %zu B for zone information",
- sizeof(struct dm_zone) * dev->nr_zones);
+ dev->metadata = zmd;
+ zmd->nr_zones += dev->nr_zones;
- /* Get zone information */
- nr_blkz = DMZ_REPORT_NR_ZONES;
- blkz = kcalloc(nr_blkz, sizeof(struct blk_zone), GFP_KERNEL);
- if (!blkz) {
- ret = -ENOMEM;
- goto out;
+ atomic_set(&dev->unmap_nr_rnd, 0);
+ INIT_LIST_HEAD(&dev->unmap_rnd_list);
+ INIT_LIST_HEAD(&dev->map_rnd_list);
+
+ atomic_set(&dev->unmap_nr_seq, 0);
+ INIT_LIST_HEAD(&dev->unmap_seq_list);
+ INIT_LIST_HEAD(&dev->map_seq_list);
+ }
+
+ if (!zmd->nr_zones) {
+ DMERR("(%s): No zones found", zmd->devname);
+ return -ENXIO;
+ }
+ xa_init(&zmd->zones);
+
+ DMDEBUG("(%s): Using %zu B for zone information",
+ zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones);
+
+ if (zmd->nr_devs > 1) {
+ ret = dmz_emulate_zones(zmd, &zmd->dev[0]);
+ if (ret < 0) {
+ DMDEBUG("(%s): Failed to emulate zones, error %d",
+ zmd->devname, ret);
+ dmz_drop_zones(zmd);
+ return ret;
+ }
+
+ /*
+ * Primary superblock zone is always at zone 0 when multiple
+ * drives are present.
+ */
+ zmd->sb[0].zone = dmz_get(zmd, 0);
+
+ for (i = 1; i < zmd->nr_devs; i++) {
+ zoned_dev = &zmd->dev[i];
+
+ ret = blkdev_report_zones(zoned_dev->bdev, 0,
+ BLK_ALL_ZONES,
+ dmz_init_zone, zoned_dev);
+ if (ret < 0) {
+ DMDEBUG("(%s): Failed to report zones, error %d",
+ zmd->devname, ret);
+ dmz_drop_zones(zmd);
+ return ret;
+ }
+ }
+ return 0;
}
/*
- * Get zone information and initialize zone descriptors.
- * At the same time, determine where the super block
- * should be: first block of the first randomly writable
- * zone.
+ * Get zone information and initialize zone descriptors. At the same
+ * time, determine where the super block should be: first block of the
+ * first randomly writable zone.
*/
- zone = zmd->zones;
- while (sector < dev->capacity) {
- /* Get zone information */
- nr_blkz = DMZ_REPORT_NR_ZONES;
- ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz);
- if (ret) {
- dmz_dev_err(dev, "Report zones failed %d", ret);
- goto out;
- }
-
- if (!nr_blkz)
- break;
-
- /* Process report */
- for (i = 0; i < nr_blkz; i++) {
- ret = dmz_init_zone(zmd, zone, &blkz[i]);
- if (ret)
- goto out;
- sector += dev->zone_nr_sectors;
- zone++;
- }
- }
-
- /* The entire zone configuration of the disk should now be known */
- if (sector < dev->capacity) {
- dmz_dev_err(dev, "Failed to get correct zone information");
- ret = -ENXIO;
- }
-out:
- kfree(blkz);
- if (ret)
+ ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES,
+ dmz_init_zone, zoned_dev);
+ if (ret < 0) {
+ DMDEBUG("(%s): Failed to report zones, error %d",
+ zmd->devname, ret);
dmz_drop_zones(zmd);
+ return ret;
+ }
- return ret;
+ return 0;
+}
+
+static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx,
+ void *data)
+{
+ struct dm_zone *zone = data;
+
+ clear_bit(DMZ_OFFLINE, &zone->flags);
+ clear_bit(DMZ_READ_ONLY, &zone->flags);
+ if (blkz->cond == BLK_ZONE_COND_OFFLINE)
+ set_bit(DMZ_OFFLINE, &zone->flags);
+ else if (blkz->cond == BLK_ZONE_COND_READONLY)
+ set_bit(DMZ_READ_ONLY, &zone->flags);
+
+ if (dmz_is_seq(zone))
+ zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
+ else
+ zone->wp_block = 0;
+ return 0;
}
/*
@@ -1235,11 +1598,13 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
*/
static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
- unsigned int nr_blkz = 1;
+ struct dmz_dev *dev = zone->dev;
unsigned int noio_flag;
- struct blk_zone blkz;
int ret;
+ if (dev->flags & DMZ_BDEV_REGULAR)
+ return 0;
+
/*
* Get zone information from disk. Since blkdev_report_zones() uses
* GFP_KERNEL by default for memory allocations, set the per-task
@@ -1247,30 +1612,19 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
* GFP_NOIO was specified.
*/
noio_flag = memalloc_noio_save();
- ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
- &blkz, &nr_blkz);
+ ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1,
+ dmz_update_zone_cb, zone);
memalloc_noio_restore(noio_flag);
- if (!nr_blkz)
+
+ if (ret == 0)
ret = -EIO;
- if (ret) {
- dmz_dev_err(zmd->dev, "Get zone %u report failed",
- dmz_id(zmd, zone));
- dmz_check_bdev(zmd->dev);
+ if (ret < 0) {
+ dmz_dev_err(dev, "Get zone %u report failed",
+ zone->id);
+ dmz_check_bdev(dev);
return ret;
}
- clear_bit(DMZ_OFFLINE, &zone->flags);
- clear_bit(DMZ_READ_ONLY, &zone->flags);
- if (blkz.cond == BLK_ZONE_COND_OFFLINE)
- set_bit(DMZ_OFFLINE, &zone->flags);
- else if (blkz.cond == BLK_ZONE_COND_READONLY)
- set_bit(DMZ_READ_ONLY, &zone->flags);
-
- if (dmz_is_seq(zone))
- zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start);
- else
- zone->wp_block = 0;
-
return 0;
}
@@ -1281,6 +1635,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
struct dm_zone *zone)
{
+ struct dmz_dev *dev = zone->dev;
unsigned int wp = 0;
int ret;
@@ -1289,8 +1644,8 @@ static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
if (ret)
return ret;
- dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)",
- dmz_id(zmd, zone), zone->wp_block, wp);
+ dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)",
+ zone->id, zone->wp_block, wp);
if (zone->wp_block < wp) {
dmz_invalidate_blocks(zmd, zone, zone->wp_block,
@@ -1300,11 +1655,6 @@ static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
return 0;
}
-static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
-{
- return &zmd->zones[zone_id];
-}
-
/*
* Reset a zone write pointer.
*/
@@ -1322,14 +1672,14 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
return 0;
if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
- struct dmz_dev *dev = zmd->dev;
+ struct dmz_dev *dev = zone->dev;
- ret = blkdev_reset_zones(dev->bdev,
- dmz_start_sect(zmd, zone),
- dev->zone_nr_sectors, GFP_NOIO);
+ ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
+ dmz_start_sect(zmd, zone),
+ zmd->zone_nr_sectors, GFP_NOIO);
if (ret) {
dmz_dev_err(dev, "Reset zone %u failed %d",
- dmz_id(zmd, zone), ret);
+ zone->id, ret);
return ret;
}
}
@@ -1348,7 +1698,6 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
*/
static int dmz_load_mapping(struct dmz_metadata *zmd)
{
- struct dmz_dev *dev = zmd->dev;
struct dm_zone *dzone, *bzone;
struct dmz_mblock *dmap_mblk = NULL;
struct dmz_map *dmap;
@@ -1380,36 +1729,48 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
if (dzone_id == DMZ_MAP_UNMAPPED)
goto next;
- if (dzone_id >= dev->nr_zones) {
- dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u",
+ if (dzone_id >= zmd->nr_zones) {
+ dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u",
chunk, dzone_id);
return -EIO;
}
dzone = dmz_get(zmd, dzone_id);
+ if (!dzone) {
+ dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present",
+ chunk, dzone_id);
+ return -EIO;
+ }
set_bit(DMZ_DATA, &dzone->flags);
dzone->chunk = chunk;
dmz_get_zone_weight(zmd, dzone);
- if (dmz_is_rnd(dzone))
- list_add_tail(&dzone->link, &zmd->map_rnd_list);
+ if (dmz_is_cache(dzone))
+ list_add_tail(&dzone->link, &zmd->map_cache_list);
+ else if (dmz_is_rnd(dzone))
+ list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
else
- list_add_tail(&dzone->link, &zmd->map_seq_list);
+ list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
/* Check buffer zone */
bzone_id = le32_to_cpu(dmap[e].bzone_id);
if (bzone_id == DMZ_MAP_UNMAPPED)
goto next;
- if (bzone_id >= dev->nr_zones) {
- dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u",
+ if (bzone_id >= zmd->nr_zones) {
+ dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u",
chunk, bzone_id);
return -EIO;
}
bzone = dmz_get(zmd, bzone_id);
- if (!dmz_is_rnd(bzone)) {
- dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u",
+ if (!bzone) {
+ dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present",
+ chunk, bzone_id);
+ return -EIO;
+ }
+ if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) {
+ dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u",
chunk, bzone_id);
return -EIO;
}
@@ -1420,7 +1781,10 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
bzone->bzone = dzone;
dzone->bzone = bzone;
dmz_get_zone_weight(zmd, bzone);
- list_add_tail(&bzone->link, &zmd->map_rnd_list);
+ if (dmz_is_cache(bzone))
+ list_add_tail(&bzone->link, &zmd->map_cache_list);
+ else
+ list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
next:
chunk++;
e++;
@@ -1433,15 +1797,21 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
* fully initialized. All remaining zones are unmapped data
* zones. Finish initializing those here.
*/
- for (i = 0; i < dev->nr_zones; i++) {
+ for (i = 0; i < zmd->nr_zones; i++) {
dzone = dmz_get(zmd, i);
+ if (!dzone)
+ continue;
if (dmz_is_meta(dzone))
continue;
+ if (dmz_is_offline(dzone))
+ continue;
- if (dmz_is_rnd(dzone))
- zmd->nr_rnd++;
+ if (dmz_is_cache(dzone))
+ zmd->nr_cache++;
+ else if (dmz_is_rnd(dzone))
+ dzone->dev->nr_rnd++;
else
- zmd->nr_seq++;
+ dzone->dev->nr_seq++;
if (dmz_is_data(dzone)) {
/* Already initialized */
@@ -1451,16 +1821,22 @@ static int dmz_load_mapping(struct dmz_metadata *zmd)
/* Unmapped data zone */
set_bit(DMZ_DATA, &dzone->flags);
dzone->chunk = DMZ_MAP_UNMAPPED;
- if (dmz_is_rnd(dzone)) {
- list_add_tail(&dzone->link, &zmd->unmap_rnd_list);
- atomic_inc(&zmd->unmap_nr_rnd);
+ if (dmz_is_cache(dzone)) {
+ list_add_tail(&dzone->link, &zmd->unmap_cache_list);
+ atomic_inc(&zmd->unmap_nr_cache);
+ } else if (dmz_is_rnd(dzone)) {
+ list_add_tail(&dzone->link,
+ &dzone->dev->unmap_rnd_list);
+ atomic_inc(&dzone->dev->unmap_nr_rnd);
} else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
+ set_bit(DMZ_RESERVED, &dzone->flags);
atomic_inc(&zmd->nr_reserved_seq_zones);
- zmd->nr_seq--;
+ dzone->dev->nr_seq--;
} else {
- list_add_tail(&dzone->link, &zmd->unmap_seq_list);
- atomic_inc(&zmd->unmap_nr_seq);
+ list_add_tail(&dzone->link,
+ &dzone->dev->unmap_seq_list);
+ atomic_inc(&dzone->dev->unmap_nr_seq);
}
}
@@ -1494,10 +1870,13 @@ static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
list_del_init(&zone->link);
if (dmz_is_seq(zone)) {
/* LRU rotate sequential zone */
- list_add_tail(&zone->link, &zmd->map_seq_list);
+ list_add_tail(&zone->link, &zone->dev->map_seq_list);
+ } else if (dmz_is_cache(zone)) {
+ /* LRU rotate cache zone */
+ list_add_tail(&zone->link, &zmd->map_cache_list);
} else {
/* LRU rotate random zone */
- list_add_tail(&zone->link, &zmd->map_rnd_list);
+ list_add_tail(&zone->link, &zone->dev->map_rnd_list);
}
}
@@ -1564,26 +1943,64 @@ static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
{
dmz_unlock_map(zmd);
dmz_unlock_metadata(zmd);
+ set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
+ clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
dmz_lock_metadata(zmd);
dmz_lock_map(zmd);
}
/*
- * Select a random write zone for reclaim.
+ * Select a cache or random write zone for reclaim.
*/
-static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
+static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd,
+ unsigned int idx, bool idle)
{
struct dm_zone *dzone = NULL;
- struct dm_zone *zone;
+ struct dm_zone *zone, *maxw_z = NULL;
+ struct list_head *zone_list;
- if (list_empty(&zmd->map_rnd_list))
- return ERR_PTR(-EBUSY);
+ /* If we have cache zones select from the cache zone list */
+ if (zmd->nr_cache) {
+ zone_list = &zmd->map_cache_list;
+ /* Try to reclaim random zones, too, when idle */
+ if (idle && list_empty(zone_list))
+ zone_list = &zmd->dev[idx].map_rnd_list;
+ } else
+ zone_list = &zmd->dev[idx].map_rnd_list;
- list_for_each_entry(zone, &zmd->map_rnd_list, link) {
- if (dmz_is_buf(zone))
+ /*
+ * Find the buffer zone with the heaviest weight or the first (oldest)
+ * data zone that can be reclaimed.
+ */
+ list_for_each_entry(zone, zone_list, link) {
+ if (dmz_is_buf(zone)) {
dzone = zone->bzone;
- else
+ if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
+ continue;
+ if (!maxw_z || maxw_z->weight < dzone->weight)
+ maxw_z = dzone;
+ } else {
+ dzone = zone;
+ if (dmz_lock_zone_reclaim(dzone))
+ return dzone;
+ }
+ }
+
+ if (maxw_z && dmz_lock_zone_reclaim(maxw_z))
+ return maxw_z;
+
+ /*
+ * If we come here, none of the zones inspected could be locked for
+ * reclaim. Try again, being more aggressive, that is, find the
+ * first zone that can be reclaimed regardless of its weight.
+ */
+ list_for_each_entry(zone, zone_list, link) {
+ if (dmz_is_buf(zone)) {
+ dzone = zone->bzone;
+ if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
+ continue;
+ } else
dzone = zone;
if (dmz_lock_zone_reclaim(dzone))
return dzone;
@@ -1595,14 +2012,12 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
/*
* Select a buffered sequential zone for reclaim.
*/
-static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
+static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd,
+ unsigned int idx)
{
struct dm_zone *zone;
- if (list_empty(&zmd->map_seq_list))
- return ERR_PTR(-EBUSY);
-
- list_for_each_entry(zone, &zmd->map_seq_list, link) {
+ list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) {
if (!zone->bzone)
continue;
if (dmz_lock_zone_reclaim(zone))
@@ -1615,9 +2030,10 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
/*
* Select a zone for reclaim.
*/
-struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
+struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
+ unsigned int dev_idx, bool idle)
{
- struct dm_zone *zone;
+ struct dm_zone *zone = NULL;
/*
* Search for a zone candidate to reclaim: 2 cases are possible.
@@ -1629,9 +2045,9 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
*/
dmz_lock_map(zmd);
if (list_empty(&zmd->reserved_seq_zones_list))
- zone = dmz_get_seq_zone_for_reclaim(zmd);
- else
- zone = dmz_get_rnd_zone_for_reclaim(zmd);
+ zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx);
+ if (!zone)
+ zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle);
dmz_unlock_map(zmd);
return zone;
@@ -1651,6 +2067,7 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu
unsigned int dzone_id;
struct dm_zone *dzone = NULL;
int ret = 0;
+ int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
dmz_lock_map(zmd);
again:
@@ -1665,9 +2082,9 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu
goto out;
/* Allocate a random zone */
- dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
+ dzone = dmz_alloc_zone(zmd, 0, alloc_flags);
if (!dzone) {
- if (dmz_bdev_is_dying(zmd->dev)) {
+ if (dmz_dev_is_dying(zmd)) {
dzone = ERR_PTR(-EIO);
goto out;
}
@@ -1680,6 +2097,10 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu
} else {
/* The chunk is already mapped: get the mapping zone */
dzone = dmz_get(zmd, dzone_id);
+ if (!dzone) {
+ dzone = ERR_PTR(-EIO);
+ goto out;
+ }
if (dzone->chunk != chunk) {
dzone = ERR_PTR(-EIO);
goto out;
@@ -1758,6 +2179,7 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
struct dm_zone *dzone)
{
struct dm_zone *bzone;
+ int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
dmz_lock_map(zmd);
again:
@@ -1766,9 +2188,9 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
goto out;
/* Allocate a random zone */
- bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
+ bzone = dmz_alloc_zone(zmd, 0, alloc_flags);
if (!bzone) {
- if (dmz_bdev_is_dying(zmd->dev)) {
+ if (dmz_dev_is_dying(zmd)) {
bzone = ERR_PTR(-EIO);
goto out;
}
@@ -1777,14 +2199,16 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
}
/* Update the chunk mapping */
- dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone),
- dmz_id(zmd, bzone));
+ dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id);
set_bit(DMZ_BUF, &bzone->flags);
bzone->chunk = dzone->chunk;
bzone->bzone = dzone;
dzone->bzone = bzone;
- list_add_tail(&bzone->link, &zmd->map_rnd_list);
+ if (dmz_is_cache(bzone))
+ list_add_tail(&bzone->link, &zmd->map_cache_list);
+ else
+ list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
out:
dmz_unlock_map(zmd);
@@ -1795,46 +2219,75 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
* Get an unmapped (free) zone.
* This must be called with the mapping lock held.
*/
-struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags)
+struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx,
+ unsigned long flags)
{
struct list_head *list;
struct dm_zone *zone;
+ int i;
- if (flags & DMZ_ALLOC_RND)
- list = &zmd->unmap_rnd_list;
- else
- list = &zmd->unmap_seq_list;
+ /* Schedule reclaim to ensure free zones are available */
+ if (!(flags & DMZ_ALLOC_RECLAIM)) {
+ for (i = 0; i < zmd->nr_devs; i++)
+ dmz_schedule_reclaim(zmd->dev[i].reclaim);
+ }
+
+ i = 0;
again:
+ if (flags & DMZ_ALLOC_CACHE)
+ list = &zmd->unmap_cache_list;
+ else if (flags & DMZ_ALLOC_RND)
+ list = &zmd->dev[dev_idx].unmap_rnd_list;
+ else
+ list = &zmd->dev[dev_idx].unmap_seq_list;
+
if (list_empty(list)) {
/*
- * No free zone: if this is for reclaim, allow using the
- * reserved sequential zones.
+ * No free zone: return NULL if this is for not reclaim.
*/
- if (!(flags & DMZ_ALLOC_RECLAIM) ||
- list_empty(&zmd->reserved_seq_zones_list))
+ if (!(flags & DMZ_ALLOC_RECLAIM))
return NULL;
+ /*
+ * Try to allocate from other devices
+ */
+ if (i < zmd->nr_devs) {
+ dev_idx = (dev_idx + 1) % zmd->nr_devs;
+ i++;
+ goto again;
+ }
- zone = list_first_entry(&zmd->reserved_seq_zones_list,
- struct dm_zone, link);
- list_del_init(&zone->link);
- atomic_dec(&zmd->nr_reserved_seq_zones);
+ /*
+ * Fallback to the reserved sequential zones
+ */
+ zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list,
+ struct dm_zone, link);
+ if (zone) {
+ list_del_init(&zone->link);
+ atomic_dec(&zmd->nr_reserved_seq_zones);
+ }
return zone;
}
zone = list_first_entry(list, struct dm_zone, link);
list_del_init(&zone->link);
- if (dmz_is_rnd(zone))
- atomic_dec(&zmd->unmap_nr_rnd);
+ if (dmz_is_cache(zone))
+ atomic_dec(&zmd->unmap_nr_cache);
+ else if (dmz_is_rnd(zone))
+ atomic_dec(&zone->dev->unmap_nr_rnd);
else
- atomic_dec(&zmd->unmap_nr_seq);
+ atomic_dec(&zone->dev->unmap_nr_seq);
if (dmz_is_offline(zone)) {
- dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone));
+ dmz_zmd_warn(zmd, "Zone %u is offline", zone->id);
zone = NULL;
goto again;
}
-
+ if (dmz_is_meta(zone)) {
+ dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id);
+ zone = NULL;
+ goto again;
+ }
return zone;
}
@@ -1849,16 +2302,18 @@ void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
dmz_reset_zone(zmd, zone);
/* Return the zone to its type unmap list */
- if (dmz_is_rnd(zone)) {
- list_add_tail(&zone->link, &zmd->unmap_rnd_list);
- atomic_inc(&zmd->unmap_nr_rnd);
- } else if (atomic_read(&zmd->nr_reserved_seq_zones) <
- zmd->nr_reserved_seq) {
+ if (dmz_is_cache(zone)) {
+ list_add_tail(&zone->link, &zmd->unmap_cache_list);
+ atomic_inc(&zmd->unmap_nr_cache);
+ } else if (dmz_is_rnd(zone)) {
+ list_add_tail(&zone->link, &zone->dev->unmap_rnd_list);
+ atomic_inc(&zone->dev->unmap_nr_rnd);
+ } else if (dmz_is_reserved(zone)) {
list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
atomic_inc(&zmd->nr_reserved_seq_zones);
} else {
- list_add_tail(&zone->link, &zmd->unmap_seq_list);
- atomic_inc(&zmd->unmap_nr_seq);
+ list_add_tail(&zone->link, &zone->dev->unmap_seq_list);
+ atomic_inc(&zone->dev->unmap_nr_seq);
}
wake_up_all(&zmd->free_wq);
@@ -1872,13 +2327,15 @@ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
unsigned int chunk)
{
/* Set the chunk mapping */
- dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone),
+ dmz_set_chunk_mapping(zmd, chunk, dzone->id,
DMZ_MAP_UNMAPPED);
dzone->chunk = chunk;
- if (dmz_is_rnd(dzone))
- list_add_tail(&dzone->link, &zmd->map_rnd_list);
+ if (dmz_is_cache(dzone))
+ list_add_tail(&dzone->link, &zmd->map_cache_list);
+ else if (dmz_is_rnd(dzone))
+ list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
else
- list_add_tail(&dzone->link, &zmd->map_seq_list);
+ list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
}
/*
@@ -1900,7 +2357,7 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
* Unmapping the chunk buffer zone: clear only
* the chunk buffer mapping
*/
- dzone_id = dmz_id(zmd, zone->bzone);
+ dzone_id = zone->bzone->id;
zone->bzone->bzone = NULL;
zone->bzone = NULL;
@@ -1962,7 +2419,7 @@ static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
sector_t chunk_block)
{
sector_t bitmap_block = 1 + zmd->nr_map_blocks +
- (sector_t)(dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks) +
+ (sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) +
(chunk_block >> DMZ_BLOCK_SHIFT_BITS);
return dmz_get_mblock(zmd, bitmap_block);
@@ -1978,7 +2435,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
sector_t chunk_block = 0;
/* Get the zones bitmap blocks */
- while (chunk_block < zmd->dev->zone_nr_blocks) {
+ while (chunk_block < zmd->zone_nr_blocks) {
from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
if (IS_ERR(from_mblk))
return PTR_ERR(from_mblk);
@@ -2013,7 +2470,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
int ret;
/* Get the zones bitmap blocks */
- while (chunk_block < zmd->dev->zone_nr_blocks) {
+ while (chunk_block < zmd->zone_nr_blocks) {
/* Get a valid region from the source zone */
ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
if (ret <= 0)
@@ -2037,12 +2494,12 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
sector_t chunk_block, unsigned int nr_blocks)
{
unsigned int count, bit, nr_bits;
- unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
+ unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
struct dmz_mblock *mblk;
unsigned int n = 0;
- dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks",
- dmz_id(zmd, zone), (unsigned long long)chunk_block,
+ dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks",
+ zone->id, (unsigned long long)chunk_block,
nr_blocks);
WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);
@@ -2071,8 +2528,8 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
if (likely(zone->weight + n <= zone_nr_blocks))
zone->weight += n;
else {
- dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u",
- dmz_id(zmd, zone), zone->weight,
+ dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u",
+ zone->id, zone->weight,
zone_nr_blocks - n);
zone->weight = zone_nr_blocks;
}
@@ -2121,10 +2578,10 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
struct dmz_mblock *mblk;
unsigned int n = 0;
- dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks",
- dmz_id(zmd, zone), (u64)chunk_block, nr_blocks);
+ dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks",
+ zone->id, (u64)chunk_block, nr_blocks);
- WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
+ WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);
while (nr_blocks) {
/* Get bitmap block */
@@ -2151,8 +2608,8 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
if (zone->weight >= n)
zone->weight -= n;
else {
- dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u",
- dmz_id(zmd, zone), zone->weight, n);
+ dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u",
+ zone->id, zone->weight, n);
zone->weight = 0;
}
@@ -2168,7 +2625,7 @@ static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
struct dmz_mblock *mblk;
int ret;
- WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks);
+ WARN_ON(chunk_block >= zmd->zone_nr_blocks);
/* Get bitmap block */
mblk = dmz_get_bitmap(zmd, zone, chunk_block);
@@ -2198,7 +2655,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
unsigned long *bitmap;
int n = 0;
- WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
+ WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);
while (nr_blocks) {
/* Get bitmap block */
@@ -2242,7 +2699,7 @@ int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
/* The block is valid: get the number of valid blocks from block */
return dmz_to_next_set_block(zmd, zone, chunk_block,
- zmd->dev->zone_nr_blocks - chunk_block, 0);
+ zmd->zone_nr_blocks - chunk_block, 0);
}
/*
@@ -2258,7 +2715,7 @@ int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
int ret;
ret = dmz_to_next_set_block(zmd, zone, start_block,
- zmd->dev->zone_nr_blocks - start_block, 1);
+ zmd->zone_nr_blocks - start_block, 1);
if (ret < 0)
return ret;
@@ -2266,7 +2723,7 @@ int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
*chunk_block = start_block;
return dmz_to_next_set_block(zmd, zone, start_block,
- zmd->dev->zone_nr_blocks - start_block, 0);
+ zmd->zone_nr_blocks - start_block, 0);
}
/*
@@ -2305,7 +2762,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
struct dmz_mblock *mblk;
sector_t chunk_block = 0;
unsigned int bit, nr_bits;
- unsigned int nr_blocks = zmd->dev->zone_nr_blocks;
+ unsigned int nr_blocks = zmd->zone_nr_blocks;
void *bitmap;
int n = 0;
@@ -2361,7 +2818,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
while (!list_empty(&zmd->mblk_dirty_list)) {
mblk = list_first_entry(&zmd->mblk_dirty_list,
struct dmz_mblock, link);
- dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)",
+ dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)",
(u64)mblk->no, mblk->ref);
list_del_init(&mblk->link);
rb_erase(&mblk->node, &zmd->mblk_rbtree);
@@ -2379,7 +2836,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
/* Sanity checks: the mblock rbtree should now be empty */
root = &zmd->mblk_rbtree;
rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
- dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree",
+ dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree",
(u64)mblk->no, mblk->ref);
mblk->ref = 0;
dmz_free_mblock(zmd, mblk);
@@ -2392,13 +2849,42 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
mutex_destroy(&zmd->map_lock);
}
+static void dmz_print_dev(struct dmz_metadata *zmd, int num)
+{
+ struct dmz_dev *dev = &zmd->dev[num];
+
+ if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
+ dmz_dev_info(dev, "Regular block device");
+ else
+ dmz_dev_info(dev, "Host-%s zoned block device",
+ bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
+ "aware" : "managed");
+ if (zmd->sb_version > 1) {
+ sector_t sector_offset =
+ dev->zone_offset << zmd->zone_nr_sectors_shift;
+
+ dmz_dev_info(dev, " %llu 512-byte logical sectors (offset %llu)",
+ (u64)dev->capacity, (u64)sector_offset);
+ dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors (offset %llu)",
+ dev->nr_zones, (u64)zmd->zone_nr_sectors,
+ (u64)dev->zone_offset);
+ } else {
+ dmz_dev_info(dev, " %llu 512-byte logical sectors",
+ (u64)dev->capacity);
+ dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors",
+ dev->nr_zones, (u64)zmd->zone_nr_sectors);
+ }
+}
+
/*
* Initialize the zoned metadata.
*/
-int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
+int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
+ struct dmz_metadata **metadata,
+ const char *devname)
{
struct dmz_metadata *zmd;
- unsigned int i, zid;
+ unsigned int i;
struct dm_zone *zone;
int ret;
@@ -2406,7 +2892,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
if (!zmd)
return -ENOMEM;
+ strcpy(zmd->devname, devname);
zmd->dev = dev;
+ zmd->nr_devs = num_dev;
zmd->mblk_rbtree = RB_ROOT;
init_rwsem(&zmd->mblk_sem);
mutex_init(&zmd->mblk_flush_lock);
@@ -2415,13 +2903,10 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
INIT_LIST_HEAD(&zmd->mblk_dirty_list);
mutex_init(&zmd->map_lock);
- atomic_set(&zmd->unmap_nr_rnd, 0);
- INIT_LIST_HEAD(&zmd->unmap_rnd_list);
- INIT_LIST_HEAD(&zmd->map_rnd_list);
- atomic_set(&zmd->unmap_nr_seq, 0);
- INIT_LIST_HEAD(&zmd->unmap_seq_list);
- INIT_LIST_HEAD(&zmd->map_seq_list);
+ atomic_set(&zmd->unmap_nr_cache, 0);
+ INIT_LIST_HEAD(&zmd->unmap_cache_list);
+ INIT_LIST_HEAD(&zmd->map_cache_list);
atomic_set(&zmd->nr_reserved_seq_zones, 0);
INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);
@@ -2439,14 +2924,22 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
goto err;
/* Set metadata zones starting from sb_zone */
- zid = dmz_id(zmd, zmd->sb_zone);
for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
- zone = dmz_get(zmd, zid + i);
- if (!dmz_is_rnd(zone))
+ zone = dmz_get(zmd, zmd->sb[0].zone->id + i);
+ if (!zone) {
+ dmz_zmd_err(zmd,
+ "metadata zone %u not present", i);
+ ret = -ENXIO;
goto err;
+ }
+ if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) {
+ dmz_zmd_err(zmd,
+ "metadata zone %d is not random", i);
+ ret = -ENXIO;
+ goto err;
+ }
set_bit(DMZ_META, &zone->flags);
}
-
/* Load mapping table */
ret = dmz_load_mapping(zmd);
if (ret)
@@ -2467,34 +2960,38 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
/* Metadata cache shrinker */
ret = register_shrinker(&zmd->mblk_shrinker);
if (ret) {
- dmz_dev_err(dev, "Register metadata cache shrinker failed");
+ dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
goto err;
}
- dmz_dev_info(dev, "Host-%s zoned block device",
- bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
- "aware" : "managed");
- dmz_dev_info(dev, " %llu 512-byte logical sectors",
- (u64)dev->capacity);
- dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors",
- dev->nr_zones, (u64)dev->zone_nr_sectors);
- dmz_dev_info(dev, " %u metadata zones",
- zmd->nr_meta_zones * 2);
- dmz_dev_info(dev, " %u data zones for %u chunks",
- zmd->nr_data_zones, zmd->nr_chunks);
- dmz_dev_info(dev, " %u random zones (%u unmapped)",
- zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd));
- dmz_dev_info(dev, " %u sequential zones (%u unmapped)",
- zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq));
- dmz_dev_info(dev, " %u reserved sequential data zones",
- zmd->nr_reserved_seq);
+ dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version);
+ for (i = 0; i < zmd->nr_devs; i++)
+ dmz_print_dev(zmd, i);
- dmz_dev_debug(dev, "Format:");
- dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)",
+ dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors",
+ zmd->nr_zones, (u64)zmd->zone_nr_sectors);
+ dmz_zmd_debug(zmd, " %u metadata zones",
+ zmd->nr_meta_zones * 2);
+ dmz_zmd_debug(zmd, " %u data zones for %u chunks",
+ zmd->nr_data_zones, zmd->nr_chunks);
+ dmz_zmd_debug(zmd, " %u cache zones (%u unmapped)",
+ zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache));
+ for (i = 0; i < zmd->nr_devs; i++) {
+ dmz_zmd_debug(zmd, " %u random zones (%u unmapped)",
+ dmz_nr_rnd_zones(zmd, i),
+ dmz_nr_unmap_rnd_zones(zmd, i));
+ dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)",
+ dmz_nr_seq_zones(zmd, i),
+ dmz_nr_unmap_seq_zones(zmd, i));
+ }
+ dmz_zmd_debug(zmd, " %u reserved sequential data zones",
+ zmd->nr_reserved_seq);
+ dmz_zmd_debug(zmd, "Format:");
+ dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)",
zmd->nr_meta_blocks, zmd->max_nr_mblks);
- dmz_dev_debug(dev, " %u data zone mapping blocks",
+ dmz_zmd_debug(zmd, " %u data zone mapping blocks",
zmd->nr_map_blocks);
- dmz_dev_debug(dev, " %u bitmap blocks",
+ dmz_zmd_debug(zmd, " %u bitmap blocks",
zmd->nr_bitmap_blocks);
*metadata = zmd;
@@ -2523,30 +3020,28 @@ void dmz_dtr_metadata(struct dmz_metadata *zmd)
*/
int dmz_resume_metadata(struct dmz_metadata *zmd)
{
- struct dmz_dev *dev = zmd->dev;
struct dm_zone *zone;
sector_t wp_block;
unsigned int i;
int ret;
/* Check zones */
- for (i = 0; i < dev->nr_zones; i++) {
+ for (i = 0; i < zmd->nr_zones; i++) {
zone = dmz_get(zmd, i);
if (!zone) {
- dmz_dev_err(dev, "Unable to get zone %u", i);
+ dmz_zmd_err(zmd, "Unable to get zone %u", i);
return -EIO;
}
-
wp_block = zone->wp_block;
ret = dmz_update_zone(zmd, zone);
if (ret) {
- dmz_dev_err(dev, "Broken zone %u", i);
+ dmz_zmd_err(zmd, "Broken zone %u", i);
return ret;
}
if (dmz_is_offline(zone)) {
- dmz_dev_warn(dev, "Zone %u is offline", i);
+ dmz_zmd_warn(zmd, "Zone %u is offline", i);
continue;
}
@@ -2554,11 +3049,11 @@ int dmz_resume_metadata(struct dmz_metadata *zmd)
if (!dmz_is_seq(zone))
zone->wp_block = 0;
else if (zone->wp_block != wp_block) {
- dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)",
+ dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)",
i, (u64)zone->wp_block, (u64)wp_block);
zone->wp_block = wp_block;
dmz_invalidate_blocks(zmd, zone, zone->wp_block,
- dev->zone_nr_blocks - zone->wp_block);
+ zmd->zone_nr_blocks - zone->wp_block);
}
}
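
The dmz_free_zone() rework above sends a freed zone back to a per-type unmap list in a fixed order: the global cache pool first, then the owning device's random list, then the shared reserved-sequential pool, and finally the device's sequential list. The following is a minimal standalone sketch of that selection order only; struct zone_desc and free_list_for() are simplified stand-ins for the real dm_zone flags and list heads, not driver code.

#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical, simplified zone descriptor: the real driver keys these
 * decisions off bits in dm_zone->flags (DMZ_CACHE, DMZ_RND, DMZ_RESERVED).
 */
struct zone_desc {
	bool is_cache;
	bool is_rnd;
	bool is_reserved;
};

/* Mirror the branch order used by dmz_free_zone() in the hunk above. */
static const char *free_list_for(const struct zone_desc *z)
{
	if (z->is_cache)
		return "zmd->unmap_cache_list";        /* global cache pool */
	if (z->is_rnd)
		return "zone->dev->unmap_rnd_list";    /* per-device random */
	if (z->is_reserved)
		return "zmd->reserved_seq_zones_list"; /* shared reclaim pool */
	return "zone->dev->unmap_seq_list";            /* per-device sequential */
}

int main(void)
{
	struct zone_desc cache = { .is_cache = true };
	struct zone_desc seq = { 0 };

	printf("%s\n", free_list_for(&cache));
	printf("%s\n", free_list_for(&seq));
	return 0;
}
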
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index d508173..9c0ecc9 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -13,7 +13,6 @@
struct dmz_reclaim {
struct dmz_metadata *metadata;
- struct dmz_dev *dev;
struct delayed_work work;
struct workqueue_struct *wq;
@@ -22,6 +21,8 @@ struct dmz_reclaim {
struct dm_kcopyd_throttle kc_throttle;
int kc_err;
+ int dev_idx;
+
unsigned long flags;
/* Last target access time */
@@ -44,13 +45,13 @@ enum {
* Percentage of unmapped (free) random zones below which reclaim starts
* even if the target is busy.
*/
-#define DMZ_RECLAIM_LOW_UNMAP_RND 30
+#define DMZ_RECLAIM_LOW_UNMAP_ZONES 30
/*
* Percentage of unmapped (free) random zones above which reclaim will
* stop if the target is busy.
*/
-#define DMZ_RECLAIM_HIGH_UNMAP_RND 50
+#define DMZ_RECLAIM_HIGH_UNMAP_ZONES 50
/*
* Align a sequential zone write pointer to chunk_block.
@@ -59,6 +60,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
sector_t block)
{
struct dmz_metadata *zmd = zrc->metadata;
+ struct dmz_dev *dev = zone->dev;
sector_t wp_block = zone->wp_block;
unsigned int nr_blocks;
int ret;
@@ -74,15 +76,15 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
* pointer and the requested position.
*/
nr_blocks = block - wp_block;
- ret = blkdev_issue_zeroout(zrc->dev->bdev,
+ ret = blkdev_issue_zeroout(dev->bdev,
dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
if (ret) {
- dmz_dev_err(zrc->dev,
+ dmz_dev_err(dev,
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
- dmz_id(zmd, zone), (unsigned long long)wp_block,
+ zone->id, (unsigned long long)wp_block,
(unsigned long long)block, nr_blocks, ret);
- dmz_check_bdev(zrc->dev);
+ dmz_check_bdev(dev);
return ret;
}
@@ -116,7 +118,6 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
struct dm_zone *src_zone, struct dm_zone *dst_zone)
{
struct dmz_metadata *zmd = zrc->metadata;
- struct dmz_dev *dev = zrc->dev;
struct dm_io_region src, dst;
sector_t block = 0, end_block;
sector_t nr_blocks;
@@ -128,7 +129,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
if (dmz_is_seq(src_zone))
end_block = src_zone->wp_block;
else
- end_block = dev->zone_nr_blocks;
+ end_block = dmz_zone_nr_blocks(zmd);
src_zone_block = dmz_start_block(zmd, src_zone);
dst_zone_block = dmz_start_block(zmd, dst_zone);
@@ -136,8 +137,13 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
while (block < end_block) {
- if (dev->flags & DMZ_BDEV_DYING)
+ if (src_zone->dev->flags & DMZ_BDEV_DYING)
return -EIO;
+ if (dst_zone->dev->flags & DMZ_BDEV_DYING)
+ return -EIO;
+
+ if (dmz_reclaim_should_terminate(src_zone))
+ return -EINTR;
/* Get a valid region from the source zone */
ret = dmz_first_valid_block(zmd, src_zone, &block);
@@ -156,11 +162,11 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
return ret;
}
- src.bdev = dev->bdev;
+ src.bdev = src_zone->dev->bdev;
src.sector = dmz_blk2sect(src_zone_block + block);
src.count = dmz_blk2sect(nr_blocks);
- dst.bdev = dev->bdev;
+ dst.bdev = dst_zone->dev->bdev;
dst.sector = dmz_blk2sect(dst_zone_block + block);
dst.count = src.count;
@@ -194,10 +200,10 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
struct dmz_metadata *zmd = zrc->metadata;
int ret;
- dmz_dev_debug(zrc->dev,
- "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
- dzone->chunk, dmz_id(zmd, bzone), dmz_weight(bzone),
- dmz_id(zmd, dzone), dmz_weight(dzone));
+ DMDEBUG("(%s/%u): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
+ dmz_metadata_label(zmd), zrc->dev_idx,
+ dzone->chunk, bzone->id, dmz_weight(bzone),
+ dzone->id, dmz_weight(dzone));
/* Flush data zone into the buffer zone */
ret = dmz_reclaim_copy(zrc, bzone, dzone);
@@ -210,7 +216,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block);
if (ret == 0) {
/* Free the buffer zone */
- dmz_invalidate_blocks(zmd, bzone, 0, zrc->dev->zone_nr_blocks);
+ dmz_invalidate_blocks(zmd, bzone, 0, dmz_zone_nr_blocks(zmd));
dmz_lock_map(zmd);
dmz_unmap_zone(zmd, bzone);
dmz_unlock_zone_reclaim(dzone);
@@ -233,10 +239,10 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
struct dmz_metadata *zmd = zrc->metadata;
int ret = 0;
- dmz_dev_debug(zrc->dev,
- "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
- chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
- dmz_id(zmd, bzone), dmz_weight(bzone));
+ DMDEBUG("(%s/%u): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
+ dmz_metadata_label(zmd), zrc->dev_idx,
+ chunk, dzone->id, dmz_weight(dzone),
+ bzone->id, dmz_weight(bzone));
/* Flush data zone into the buffer zone */
ret = dmz_reclaim_copy(zrc, dzone, bzone);
@@ -252,7 +258,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
* Free the data zone and remap the chunk to
* the buffer zone.
*/
- dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
+ dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd));
dmz_lock_map(zmd);
dmz_unmap_zone(zmd, bzone);
dmz_unmap_zone(zmd, dzone);
@@ -277,18 +283,26 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
struct dm_zone *szone = NULL;
struct dmz_metadata *zmd = zrc->metadata;
int ret;
+ int alloc_flags = DMZ_ALLOC_SEQ;
- /* Get a free sequential zone */
+ /* Get a free random or sequential zone */
dmz_lock_map(zmd);
- szone = dmz_alloc_zone(zmd, DMZ_ALLOC_RECLAIM);
+again:
+ szone = dmz_alloc_zone(zmd, zrc->dev_idx,
+ alloc_flags | DMZ_ALLOC_RECLAIM);
+ if (!szone && alloc_flags == DMZ_ALLOC_SEQ && dmz_nr_cache_zones(zmd)) {
+ alloc_flags = DMZ_ALLOC_RND;
+ goto again;
+ }
dmz_unlock_map(zmd);
if (!szone)
return -ENOSPC;
- dmz_dev_debug(zrc->dev,
- "Chunk %u, move rnd zone %u (weight %u) to seq zone %u",
- chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
- dmz_id(zmd, szone));
+ DMDEBUG("(%s/%u): Chunk %u, move %s zone %u (weight %u) to %s zone %u",
+ dmz_metadata_label(zmd), zrc->dev_idx, chunk,
+ dmz_is_cache(dzone) ? "cache" : "rnd",
+ dzone->id, dmz_weight(dzone),
+ dmz_is_rnd(szone) ? "rnd" : "seq", szone->id);
/* Flush the random data zone into the sequential zone */
ret = dmz_reclaim_copy(zrc, dzone, szone);
@@ -306,7 +320,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_map(zmd);
} else {
/* Free the data zone and remap the chunk */
- dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
+ dmz_invalidate_blocks(zmd, dzone, 0, dmz_zone_nr_blocks(zmd));
dmz_lock_map(zmd);
dmz_unmap_zone(zmd, dzone);
dmz_unlock_zone_reclaim(dzone);
@@ -337,6 +351,14 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
}
/*
+ * Test if the target device is idle.
+ */
+static inline int dmz_target_idle(struct dmz_reclaim *zrc)
+{
+ return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD);
+}
+
+/*
* Find a candidate zone for reclaim and process it.
*/
static int dmz_do_reclaim(struct dmz_reclaim *zrc)
@@ -348,13 +370,17 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc)
int ret;
/* Get a data zone */
- dzone = dmz_get_zone_for_reclaim(zmd);
- if (!dzone)
+ dzone = dmz_get_zone_for_reclaim(zmd, zrc->dev_idx,
+ dmz_target_idle(zrc));
+ if (!dzone) {
+ DMDEBUG("(%s/%u): No zone found to reclaim",
+ dmz_metadata_label(zmd), zrc->dev_idx);
return -EBUSY;
+ }
+ rzone = dzone;
start = jiffies;
-
- if (dmz_is_rnd(dzone)) {
+ if (dmz_is_cache(dzone) || dmz_is_rnd(dzone)) {
if (!dmz_weight(dzone)) {
/* Empty zone */
dmz_reclaim_empty(zrc, dzone);
@@ -366,8 +392,6 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc)
*/
ret = dmz_reclaim_rnd_data(zrc, dzone);
}
- rzone = dzone;
-
} else {
struct dm_zone *bzone = dzone->bzone;
sector_t chunk_block = 0;
@@ -390,59 +414,86 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc)
* be later reclaimed.
*/
ret = dmz_reclaim_seq_data(zrc, dzone);
- rzone = dzone;
}
}
out:
if (ret) {
+ if (ret == -EINTR)
+ DMDEBUG("(%s/%u): reclaim zone %u interrupted",
+ dmz_metadata_label(zmd), zrc->dev_idx,
+ rzone->id);
+ else
+ DMDEBUG("(%s/%u): Failed to reclaim zone %u, err %d",
+ dmz_metadata_label(zmd), zrc->dev_idx,
+ rzone->id, ret);
dmz_unlock_zone_reclaim(dzone);
return ret;
}
ret = dmz_flush_metadata(zrc->metadata);
if (ret) {
- dmz_dev_debug(zrc->dev,
- "Metadata flush for zone %u failed, err %d\n",
- dmz_id(zmd, rzone), ret);
+ DMDEBUG("(%s/%u): Metadata flush for zone %u failed, err %d",
+ dmz_metadata_label(zmd), zrc->dev_idx, rzone->id, ret);
return ret;
}
- dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
- dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
+ DMDEBUG("(%s/%u): Reclaimed zone %u in %u ms",
+ dmz_metadata_label(zmd), zrc->dev_idx,
+ rzone->id, jiffies_to_msecs(jiffies - start));
return 0;
}
-/*
- * Test if the target device is idle.
- */
-static inline int dmz_target_idle(struct dmz_reclaim *zrc)
+static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc)
{
- return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD);
+ struct dmz_metadata *zmd = zrc->metadata;
+ unsigned int nr_cache = dmz_nr_cache_zones(zmd);
+ unsigned int nr_unmap, nr_zones;
+
+ if (nr_cache) {
+ nr_zones = nr_cache;
+ nr_unmap = dmz_nr_unmap_cache_zones(zmd);
+ } else {
+ nr_zones = dmz_nr_rnd_zones(zmd, zrc->dev_idx);
+ nr_unmap = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx);
+ }
+ if (nr_unmap <= 1)
+ return 0;
+ return nr_unmap * 100 / nr_zones;
}
/*
* Test if reclaim is necessary.
*/
-static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
+static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap)
{
- struct dmz_metadata *zmd = zrc->metadata;
- unsigned int nr_rnd = dmz_nr_rnd_zones(zmd);
- unsigned int nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
- unsigned int p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
+ unsigned int nr_reclaim;
+
+ nr_reclaim = dmz_nr_rnd_zones(zrc->metadata, zrc->dev_idx);
+
+ if (dmz_nr_cache_zones(zrc->metadata)) {
+ /*
+ * The first device in a multi-device
+ * setup only contains cache zones, so
+ * never start reclaim there.
+ */
+ if (zrc->dev_idx == 0)
+ return false;
+ nr_reclaim += dmz_nr_cache_zones(zrc->metadata);
+ }
/* Reclaim when idle */
- if (dmz_target_idle(zrc) && nr_unmap_rnd < nr_rnd)
+ if (dmz_target_idle(zrc) && nr_reclaim)
return true;
- /* If there are still plenty of random zones, do not reclaim */
- if (p_unmap_rnd >= DMZ_RECLAIM_HIGH_UNMAP_RND)
+ /* If there are still plenty of cache zones, do not reclaim */
+ if (p_unmap >= DMZ_RECLAIM_HIGH_UNMAP_ZONES)
return false;
/*
- * If the percentage of unmapped random zones is low,
+ * If the percentage of unmapped cache zones is low,
* reclaim even if the target is busy.
*/
- return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND;
+ return p_unmap <= DMZ_RECLAIM_LOW_UNMAP_ZONES;
}
/*
@@ -452,14 +503,14 @@ static void dmz_reclaim_work(struct work_struct *work)
{
struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
struct dmz_metadata *zmd = zrc->metadata;
- unsigned int nr_rnd, nr_unmap_rnd;
- unsigned int p_unmap_rnd;
+ unsigned int p_unmap;
int ret;
- if (dmz_bdev_is_dying(zrc->dev))
+ if (dmz_dev_is_dying(zmd))
return;
- if (!dmz_should_reclaim(zrc)) {
+ p_unmap = dmz_reclaim_percentage(zrc);
+ if (!dmz_should_reclaim(zrc, p_unmap)) {
mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
return;
}
@@ -470,27 +521,26 @@ static void dmz_reclaim_work(struct work_struct *work)
* and slower if there are still some free random zones to avoid
* as much as possible to negatively impact the user workload.
*/
- nr_rnd = dmz_nr_rnd_zones(zmd);
- nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
- p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
- if (dmz_target_idle(zrc) || p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) {
+ if (dmz_target_idle(zrc) || p_unmap < DMZ_RECLAIM_LOW_UNMAP_ZONES / 2) {
/* Idle or very low percentage: go fast */
zrc->kc_throttle.throttle = 100;
} else {
/* Busy but we still have some random zone: throttle */
- zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2);
+ zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2);
}
- dmz_dev_debug(zrc->dev,
- "Reclaim (%u): %s, %u%% free rnd zones (%u/%u)",
- zrc->kc_throttle.throttle,
- (dmz_target_idle(zrc) ? "Idle" : "Busy"),
- p_unmap_rnd, nr_unmap_rnd, nr_rnd);
+ DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)",
+ dmz_metadata_label(zmd), zrc->dev_idx,
+ zrc->kc_throttle.throttle,
+ (dmz_target_idle(zrc) ? "Idle" : "Busy"),
+ p_unmap, dmz_nr_unmap_cache_zones(zmd),
+ dmz_nr_cache_zones(zmd),
+ dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx),
+ dmz_nr_rnd_zones(zmd, zrc->dev_idx));
ret = dmz_do_reclaim(zrc);
- if (ret) {
- dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
- if (!dmz_check_bdev(zrc->dev))
+ if (ret && ret != -EINTR) {
+ if (!dmz_check_dev(zmd))
return;
}
@@ -500,8 +550,8 @@ static void dmz_reclaim_work(struct work_struct *work)
/*
* Initialize reclaim.
*/
-int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
- struct dmz_reclaim **reclaim)
+int dmz_ctr_reclaim(struct dmz_metadata *zmd,
+ struct dmz_reclaim **reclaim, int idx)
{
struct dmz_reclaim *zrc;
int ret;
@@ -510,9 +560,9 @@ int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
if (!zrc)
return -ENOMEM;
- zrc->dev = dev;
zrc->metadata = zmd;
zrc->atime = jiffies;
+ zrc->dev_idx = idx;
/* Reclaim kcopyd client */
zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle);
@@ -524,8 +574,8 @@ int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
/* Reclaim work */
INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work);
- zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM,
- dev->name);
+ zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s_%d", WQ_MEM_RECLAIM,
+ dmz_metadata_label(zmd), idx);
if (!zrc->wq) {
ret = -ENOMEM;
goto err;
@@ -583,7 +633,8 @@ void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc)
*/
void dmz_schedule_reclaim(struct dmz_reclaim *zrc)
{
- if (dmz_should_reclaim(zrc))
+ unsigned int p_unmap = dmz_reclaim_percentage(zrc);
+
+ if (dmz_should_reclaim(zrc, p_unmap))
mod_delayed_work(zrc->wq, &zrc->work, 0);
}
-
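
The reclaim rework above collapses the old random-zone heuristics into one free-zone percentage (cache zones when present, otherwise the device's random zones) checked against the 30%/50% low/high watermarks, and derives the kcopyd throttle from the same number. Below is a short self-contained sketch of that arithmetic, with the metadata lookups replaced by plain integers; the counts in main() are made up for illustration.

#include <stdio.h>

/* Watermarks as renamed above (DMZ_RECLAIM_LOW/HIGH_UNMAP_ZONES). */
#define RECLAIM_LOW_UNMAP_ZONES		30
#define RECLAIM_HIGH_UNMAP_ZONES	50

/* Same formula as dmz_reclaim_percentage(): free zones as a percentage. */
static unsigned int reclaim_percentage(unsigned int nr_unmap,
				       unsigned int nr_zones)
{
	if (nr_unmap <= 1)
		return 0;
	return nr_unmap * 100 / nr_zones;
}

int main(void)
{
	unsigned int p_unmap = reclaim_percentage(12, 64);	/* 18% free */
	int idle = 0;
	unsigned int throttle;

	/*
	 * Roughly the dmz_should_reclaim() decision: an idle target always
	 * reclaims; a busy one skips reclaim above the high watermark and
	 * only starts once free zones drop to the low watermark.
	 */
	if (!idle && p_unmap >= RECLAIM_HIGH_UNMAP_ZONES) {
		printf("plenty of free zones, no reclaim\n");
		return 0;
	}
	if (idle || p_unmap <= RECLAIM_LOW_UNMAP_ZONES) {
		/* Throttle formula from dmz_reclaim_work(). */
		if (idle || p_unmap < RECLAIM_LOW_UNMAP_ZONES / 2)
			throttle = 100;
		else
			throttle = (75U < 100U - p_unmap / 2) ?
				75U : 100U - p_unmap / 2;
		printf("reclaim at %u%% free, throttle %u%%\n",
		       p_unmap, throttle);
	}
	return 0;
}
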
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 6e4f3ef..7e88df6 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -17,7 +17,7 @@
* Zone BIO context.
*/
struct dmz_bioctx {
- struct dmz_target *target;
+ struct dmz_dev *dev;
struct dm_zone *zone;
struct bio *bio;
refcount_t ref;
@@ -38,9 +38,10 @@ struct dm_chunk_work {
* Target descriptor.
*/
struct dmz_target {
- struct dm_dev *ddev;
+ struct dm_dev **ddev;
+ unsigned int nr_ddevs;
- unsigned long flags;
+ unsigned int flags;
/* Zoned block device information */
struct dmz_dev *dev;
@@ -48,9 +49,6 @@ struct dmz_target {
/* For metadata handling */
struct dmz_metadata *metadata;
- /* For reclaim */
- struct dmz_reclaim *reclaim;
-
/* For chunk work */
struct radix_tree_root chunk_rxtree;
struct workqueue_struct *chunk_wq;
@@ -76,12 +74,13 @@ struct dmz_target {
*/
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
- struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+ struct dmz_bioctx *bioctx =
+ dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
bio->bi_status = status;
- if (bio->bi_status != BLK_STS_OK)
- bioctx->target->dev->flags |= DMZ_CHECK_BDEV;
+ if (bioctx->dev && bio->bi_status != BLK_STS_OK)
+ bioctx->dev->flags |= DMZ_CHECK_BDEV;
if (refcount_dec_and_test(&bioctx->ref)) {
struct dm_zone *zone = bioctx->zone;
@@ -118,14 +117,20 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
struct bio *bio, sector_t chunk_block,
unsigned int nr_blocks)
{
- struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+ struct dmz_bioctx *bioctx =
+ dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+ struct dmz_dev *dev = zone->dev;
struct bio *clone;
+ if (dev->flags & DMZ_BDEV_DYING)
+ return -EIO;
+
clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
if (!clone)
return -ENOMEM;
- bio_set_dev(clone, dmz->dev->bdev);
+ bio_set_dev(clone, dev->bdev);
+ bioctx->dev = dev;
clone->bi_iter.bi_sector =
dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
@@ -135,7 +140,7 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
bio_advance(bio, clone->bi_iter.bi_size);
refcount_inc(&bioctx->ref);
- generic_make_request(clone);
+ submit_bio_noacct(clone);
if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
zone->wp_block += nr_blocks;
@@ -165,7 +170,8 @@ static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
struct bio *bio)
{
- sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+ struct dmz_metadata *zmd = dmz->metadata;
+ sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
unsigned int nr_blocks = dmz_bio_blocks(bio);
sector_t end_block = chunk_block + nr_blocks;
struct dm_zone *rzone, *bzone;
@@ -177,19 +183,22 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
return 0;
}
- dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks",
- (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
- (dmz_is_rnd(zone) ? "RND" : "SEQ"),
- dmz_id(dmz->metadata, zone),
- (unsigned long long)chunk_block, nr_blocks);
+ DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks",
+ dmz_metadata_label(zmd),
+ (unsigned long long)dmz_bio_chunk(zmd, bio),
+ (dmz_is_rnd(zone) ? "RND" :
+ (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
+ zone->id,
+ (unsigned long long)chunk_block, nr_blocks);
/* Check block validity to determine the read location */
bzone = zone->bzone;
while (chunk_block < end_block) {
nr_blocks = 0;
- if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
+ if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
+ chunk_block < zone->wp_block) {
/* Test block validity in the data zone */
- ret = dmz_block_valid(dmz->metadata, zone, chunk_block);
+ ret = dmz_block_valid(zmd, zone, chunk_block);
if (ret < 0)
return ret;
if (ret > 0) {
@@ -204,7 +213,7 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
* Check the buffer zone, if there is one.
*/
if (!nr_blocks && bzone) {
- ret = dmz_block_valid(dmz->metadata, bzone, chunk_block);
+ ret = dmz_block_valid(zmd, bzone, chunk_block);
if (ret < 0)
return ret;
if (ret > 0) {
@@ -216,8 +225,10 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
if (nr_blocks) {
/* Valid blocks found: read them */
- nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block);
- ret = dmz_submit_bio(dmz, rzone, bio, chunk_block, nr_blocks);
+ nr_blocks = min_t(unsigned int, nr_blocks,
+ end_block - chunk_block);
+ ret = dmz_submit_bio(dmz, rzone, bio,
+ chunk_block, nr_blocks);
if (ret)
return ret;
chunk_block += nr_blocks;
@@ -308,25 +319,30 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
struct bio *bio)
{
- sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
+ struct dmz_metadata *zmd = dmz->metadata;
+ sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
unsigned int nr_blocks = dmz_bio_blocks(bio);
if (!zone)
return -ENOSPC;
- dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
- (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
- (dmz_is_rnd(zone) ? "RND" : "SEQ"),
- dmz_id(dmz->metadata, zone),
- (unsigned long long)chunk_block, nr_blocks);
+ DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
+ dmz_metadata_label(zmd),
+ (unsigned long long)dmz_bio_chunk(zmd, bio),
+ (dmz_is_rnd(zone) ? "RND" :
+ (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
+ zone->id,
+ (unsigned long long)chunk_block, nr_blocks);
- if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) {
+ if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
+ chunk_block == zone->wp_block) {
/*
* zone is a random zone or it is a sequential zone
* and the BIO is aligned to the zone write pointer:
* direct write the zone.
*/
- return dmz_handle_direct_write(dmz, zone, bio, chunk_block, nr_blocks);
+ return dmz_handle_direct_write(dmz, zone, bio,
+ chunk_block, nr_blocks);
}
/*
@@ -345,7 +361,7 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
struct dmz_metadata *zmd = dmz->metadata;
sector_t block = dmz_bio_block(bio);
unsigned int nr_blocks = dmz_bio_blocks(bio);
- sector_t chunk_block = dmz_chunk_block(dmz->dev, block);
+ sector_t chunk_block = dmz_chunk_block(zmd, block);
int ret = 0;
/* For unmapped chunks, there is nothing to do */
@@ -355,16 +371,18 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
if (dmz_is_readonly(zone))
return -EROFS;
- dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
- (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
- dmz_id(zmd, zone),
- (unsigned long long)chunk_block, nr_blocks);
+ DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
+ dmz_metadata_label(dmz->metadata),
+ (unsigned long long)dmz_bio_chunk(zmd, bio),
+ zone->id,
+ (unsigned long long)chunk_block, nr_blocks);
/*
* Invalidate blocks in the data zone and its
* buffer zone if one is mapped.
*/
- if (dmz_is_rnd(zone) || chunk_block < zone->wp_block)
+ if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
+ chunk_block < zone->wp_block)
ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
if (ret == 0 && zone->bzone)
ret = dmz_invalidate_blocks(zmd, zone->bzone,
@@ -378,31 +396,20 @@ static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
struct bio *bio)
{
- struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
+ struct dmz_bioctx *bioctx =
+ dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
struct dmz_metadata *zmd = dmz->metadata;
struct dm_zone *zone;
int ret;
- /*
- * Write may trigger a zone allocation. So make sure the
- * allocation can succeed.
- */
- if (bio_op(bio) == REQ_OP_WRITE)
- dmz_schedule_reclaim(dmz->reclaim);
-
dmz_lock_metadata(zmd);
- if (dmz->dev->flags & DMZ_BDEV_DYING) {
- ret = -EIO;
- goto out;
- }
-
/*
* Get the data zone mapping the chunk. There may be no
* mapping for read and discard. If a mapping is obtained,
* the zone returned will be set to active state.
*/
- zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(dmz->dev, bio),
+ zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
bio_op(bio));
if (IS_ERR(zone)) {
ret = PTR_ERR(zone);
@@ -413,6 +420,7 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
if (zone) {
dmz_activate_zone(zone);
bioctx->zone = zone;
+ dmz_reclaim_bio_acc(zone->dev->reclaim);
}
switch (bio_op(bio)) {
@@ -427,8 +435,8 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
ret = dmz_handle_discard(dmz, zone, bio);
break;
default:
- dmz_dev_err(dmz->dev, "Unsupported BIO operation 0x%x",
- bio_op(bio));
+ DMERR("(%s): Unsupported BIO operation 0x%x",
+ dmz_metadata_label(dmz->metadata), bio_op(bio));
ret = -EIO;
}
@@ -502,7 +510,8 @@ static void dmz_flush_work(struct work_struct *work)
/* Flush dirty metadata blocks */
ret = dmz_flush_metadata(dmz->metadata);
if (ret)
- dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
+ DMDEBUG("(%s): Metadata flush failed, rc=%d",
+ dmz_metadata_label(dmz->metadata), ret);
/* Process queued flush requests */
while (1) {
@@ -525,7 +534,7 @@ static void dmz_flush_work(struct work_struct *work)
*/
static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
- unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
+ unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
struct dm_chunk_work *cw;
int ret = 0;
@@ -558,7 +567,6 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
bio_list_add(&cw->bio_list, bio);
- dmz_reclaim_bio_acc(dmz->reclaim);
if (queue_work(dmz->chunk_wq, &cw->work))
dmz_get_chunk_work(cw);
out:
@@ -618,23 +626,22 @@ bool dmz_check_bdev(struct dmz_dev *dmz_dev)
static int dmz_map(struct dm_target *ti, struct bio *bio)
{
struct dmz_target *dmz = ti->private;
- struct dmz_dev *dev = dmz->dev;
+ struct dmz_metadata *zmd = dmz->metadata;
struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
sector_t sector = bio->bi_iter.bi_sector;
unsigned int nr_sectors = bio_sectors(bio);
sector_t chunk_sector;
int ret;
- if (dmz_bdev_is_dying(dmz->dev))
+ if (dmz_dev_is_dying(zmd))
return DM_MAPIO_KILL;
- dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
- bio_op(bio), (unsigned long long)sector, nr_sectors,
- (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
- (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)),
- (unsigned int)dmz_bio_blocks(bio));
-
- bio_set_dev(bio, dev->bdev);
+ DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
+ dmz_metadata_label(zmd),
+ bio_op(bio), (unsigned long long)sector, nr_sectors,
+ (unsigned long long)dmz_bio_chunk(zmd, bio),
+ (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)),
+ (unsigned int)dmz_bio_blocks(bio));
if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
return DM_MAPIO_REMAPPED;
@@ -644,7 +651,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_KILL;
/* Initialize the BIO context */
- bioctx->target = dmz;
+ bioctx->dev = NULL;
bioctx->zone = NULL;
bioctx->bio = bio;
refcount_set(&bioctx->ref, 1);
@@ -659,17 +666,17 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
}
/* Split zone BIOs to fit entirely into a zone */
- chunk_sector = sector & (dev->zone_nr_sectors - 1);
- if (chunk_sector + nr_sectors > dev->zone_nr_sectors)
- dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
+ chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
+ if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
+ dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);
/* Now ready to handle this BIO */
ret = dmz_queue_chunk_work(dmz, bio);
if (ret) {
- dmz_dev_debug(dmz->dev,
- "BIO op %d, can't process chunk %llu, err %i\n",
- bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio),
- ret);
+ DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
+ dmz_metadata_label(zmd),
+ bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
+ ret);
return DM_MAPIO_REQUEUE;
}
@@ -679,64 +686,65 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
/*
* Get zoned device information.
*/
-static int dmz_get_zoned_device(struct dm_target *ti, char *path)
+static int dmz_get_zoned_device(struct dm_target *ti, char *path,
+ int idx, int nr_devs)
{
struct dmz_target *dmz = ti->private;
- struct request_queue *q;
+ struct dm_dev *ddev;
struct dmz_dev *dev;
- sector_t aligned_capacity;
int ret;
+ struct block_device *bdev;
/* Get the target device */
- ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev);
+ ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
if (ret) {
ti->error = "Get target device failed";
- dmz->ddev = NULL;
return ret;
}
- dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL);
- if (!dev) {
- ret = -ENOMEM;
- goto err;
+ bdev = ddev->bdev;
+ if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
+ if (nr_devs == 1) {
+ ti->error = "Invalid regular device";
+ goto err;
+ }
+ if (idx != 0) {
+ ti->error = "First device must be a regular device";
+ goto err;
+ }
+ if (dmz->ddev[0]) {
+ ti->error = "Too many regular devices";
+ goto err;
+ }
+ dev = &dmz->dev[idx];
+ dev->flags = DMZ_BDEV_REGULAR;
+ } else {
+ if (dmz->ddev[idx]) {
+ ti->error = "Too many zoned devices";
+ goto err;
+ }
+ if (nr_devs > 1 && idx == 0) {
+ ti->error = "First device must be a regular device";
+ goto err;
+ }
+ dev = &dmz->dev[idx];
}
-
- dev->bdev = dmz->ddev->bdev;
+ dev->bdev = bdev;
+ dev->dev_idx = idx;
(void)bdevname(dev->bdev, dev->name);
- if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) {
- ti->error = "Not a zoned block device";
- ret = -EINVAL;
+ dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+ if (ti->begin) {
+ ti->error = "Partial mapping is not supported";
goto err;
}
- q = bdev_get_queue(dev->bdev);
- dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
- aligned_capacity = dev->capacity &
- ~((sector_t)blk_queue_zone_sectors(q) - 1);
- if (ti->begin ||
- ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
- ti->error = "Partial mapping not supported";
- ret = -EINVAL;
- goto err;
- }
-
- dev->zone_nr_sectors = blk_queue_zone_sectors(q);
- dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors);
-
- dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
- dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
-
- dev->nr_zones = blkdev_nr_zones(dev->bdev);
-
- dmz->dev = dev;
+ dmz->ddev[idx] = ddev;
return 0;
err:
- dm_put_device(ti, dmz->ddev);
- kfree(dev);
-
- return ret;
+ dm_put_device(ti, ddev);
+ return -EINVAL;
}
/*
@@ -745,10 +753,78 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
static void dmz_put_zoned_device(struct dm_target *ti)
{
struct dmz_target *dmz = ti->private;
+ int i;
- dm_put_device(ti, dmz->ddev);
- kfree(dmz->dev);
- dmz->dev = NULL;
+ for (i = 0; i < dmz->nr_ddevs; i++) {
+ if (dmz->ddev[i]) {
+ dm_put_device(ti, dmz->ddev[i]);
+ dmz->ddev[i] = NULL;
+ }
+ }
+}
+
+static int dmz_fixup_devices(struct dm_target *ti)
+{
+ struct dmz_target *dmz = ti->private;
+ struct dmz_dev *reg_dev, *zoned_dev;
+ struct request_queue *q;
+ sector_t zone_nr_sectors = 0;
+ int i;
+
+ /*
+ * When we have more than on devices, the first one must be a
+ * regular block device and the others zoned block devices.
+ */
+ if (dmz->nr_ddevs > 1) {
+ reg_dev = &dmz->dev[0];
+ if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
+ ti->error = "Primary disk is not a regular device";
+ return -EINVAL;
+ }
+ for (i = 1; i < dmz->nr_ddevs; i++) {
+ zoned_dev = &dmz->dev[i];
+ if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
+ ti->error = "Secondary disk is not a zoned device";
+ return -EINVAL;
+ }
+ q = bdev_get_queue(zoned_dev->bdev);
+ if (zone_nr_sectors &&
+ zone_nr_sectors != blk_queue_zone_sectors(q)) {
+ ti->error = "Zone nr sectors mismatch";
+ return -EINVAL;
+ }
+ zone_nr_sectors = blk_queue_zone_sectors(q);
+ zoned_dev->zone_nr_sectors = zone_nr_sectors;
+ zoned_dev->nr_zones =
+ blkdev_nr_zones(zoned_dev->bdev->bd_disk);
+ }
+ } else {
+ reg_dev = NULL;
+ zoned_dev = &dmz->dev[0];
+ if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
+ ti->error = "Disk is not a zoned device";
+ return -EINVAL;
+ }
+ q = bdev_get_queue(zoned_dev->bdev);
+ zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
+ zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
+ }
+
+ if (reg_dev) {
+ sector_t zone_offset;
+
+ reg_dev->zone_nr_sectors = zone_nr_sectors;
+ reg_dev->nr_zones =
+ DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
+ reg_dev->zone_nr_sectors);
+ reg_dev->zone_offset = 0;
+ zone_offset = reg_dev->nr_zones;
+ for (i = 1; i < dmz->nr_ddevs; i++) {
+ dmz->dev[i].zone_offset = zone_offset;
+ zone_offset += dmz->dev[i].nr_zones;
+ }
+ }
+ return 0;
}
/*
@@ -757,11 +833,10 @@ static void dmz_put_zoned_device(struct dm_target *ti)
static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct dmz_target *dmz;
- struct dmz_dev *dev;
- int ret;
+ int ret, i;
/* Check arguments */
- if (argc != 1) {
+ if (argc < 1) {
ti->error = "Invalid argument count";
return -EINVAL;
}
@@ -772,25 +847,42 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->error = "Unable to allocate the zoned target descriptor";
return -ENOMEM;
}
+ dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
+ if (!dmz->dev) {
+ ti->error = "Unable to allocate the zoned device descriptors";
+ kfree(dmz);
+ return -ENOMEM;
+ }
+ dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
+ if (!dmz->ddev) {
+ ti->error = "Unable to allocate the dm device descriptors";
+ ret = -ENOMEM;
+ goto err;
+ }
+ dmz->nr_ddevs = argc;
+
ti->private = dmz;
/* Get the target zoned block device */
- ret = dmz_get_zoned_device(ti, argv[0]);
- if (ret) {
- dmz->ddev = NULL;
- goto err;
+ for (i = 0; i < argc; i++) {
+ ret = dmz_get_zoned_device(ti, argv[i], i, argc);
+ if (ret)
+ goto err_dev;
}
+ ret = dmz_fixup_devices(ti);
+ if (ret)
+ goto err_dev;
/* Initialize metadata */
- dev = dmz->dev;
- ret = dmz_ctr_metadata(dev, &dmz->metadata);
+ ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
+ dm_table_device_name(ti->table));
if (ret) {
ti->error = "Metadata initialization failed";
goto err_dev;
}
/* Set target (no write same support) */
- ti->max_io_len = dev->zone_nr_sectors;
+ ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata);
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_zeroes_bios = 1;
@@ -799,7 +891,8 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->discards_supported = true;
/* The exposed capacity is the number of chunks that can be mapped */
- ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
+ ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
+ dmz_zone_nr_sectors_shift(dmz->metadata);
/* Zone BIO */
ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
@@ -811,8 +904,9 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
/* Chunk BIO work */
mutex_init(&dmz->chunk_lock);
INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
- dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
- 0, dev->name);
+ dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
+ dmz_metadata_label(dmz->metadata));
if (!dmz->chunk_wq) {
ti->error = "Create chunk workqueue failed";
ret = -ENOMEM;
@@ -824,7 +918,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bio_list_init(&dmz->flush_list);
INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
- dev->name);
+ dmz_metadata_label(dmz->metadata));
if (!dmz->flush_wq) {
ti->error = "Create flush workqueue failed";
ret = -ENOMEM;
@@ -833,15 +927,18 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
/* Initialize reclaim */
- ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
- if (ret) {
- ti->error = "Zone reclaim initialization failed";
- goto err_fwq;
+ for (i = 0; i < dmz->nr_ddevs; i++) {
+ ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i);
+ if (ret) {
+ ti->error = "Zone reclaim initialization failed";
+ goto err_fwq;
+ }
}
- dmz_dev_info(dev, "Target device: %llu 512-byte logical sectors (%llu blocks)",
- (unsigned long long)ti->len,
- (unsigned long long)dmz_sect2blk(ti->len));
+ DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)",
+ dmz_metadata_label(dmz->metadata),
+ (unsigned long long)ti->len,
+ (unsigned long long)dmz_sect2blk(ti->len));
return 0;
err_fwq:
@@ -856,6 +953,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
err_dev:
dmz_put_zoned_device(ti);
err:
+ kfree(dmz->dev);
kfree(dmz);
return ret;
@@ -867,11 +965,13 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
static void dmz_dtr(struct dm_target *ti)
{
struct dmz_target *dmz = ti->private;
+ int i;
flush_workqueue(dmz->chunk_wq);
destroy_workqueue(dmz->chunk_wq);
- dmz_dtr_reclaim(dmz->reclaim);
+ for (i = 0; i < dmz->nr_ddevs; i++)
+ dmz_dtr_reclaim(dmz->dev[i].reclaim);
cancel_delayed_work_sync(&dmz->flush_work);
destroy_workqueue(dmz->flush_wq);
@@ -886,6 +986,7 @@ static void dmz_dtr(struct dm_target *ti)
mutex_destroy(&dmz->chunk_lock);
+ kfree(dmz->dev);
kfree(dmz);
}
@@ -895,7 +996,7 @@ static void dmz_dtr(struct dm_target *ti)
static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct dmz_target *dmz = ti->private;
- unsigned int chunk_sectors = dmz->dev->zone_nr_sectors;
+ unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata);
limits->logical_block_size = DMZ_BLOCK_SIZE;
limits->physical_block_size = DMZ_BLOCK_SIZE;
@@ -923,11 +1024,12 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
struct dmz_target *dmz = ti->private;
+ struct dmz_dev *dev = &dmz->dev[0];
- if (!dmz_check_bdev(dmz->dev))
+ if (!dmz_check_bdev(dev))
return -EIO;
- *bdev = dmz->dev->bdev;
+ *bdev = dev->bdev;
return 0;
}
@@ -938,9 +1040,11 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
static void dmz_suspend(struct dm_target *ti)
{
struct dmz_target *dmz = ti->private;
+ int i;
flush_workqueue(dmz->chunk_wq);
- dmz_suspend_reclaim(dmz->reclaim);
+ for (i = 0; i < dmz->nr_ddevs; i++)
+ dmz_suspend_reclaim(dmz->dev[i].reclaim);
cancel_delayed_work_sync(&dmz->flush_work);
}
@@ -950,25 +1054,96 @@ static void dmz_suspend(struct dm_target *ti)
static void dmz_resume(struct dm_target *ti)
{
struct dmz_target *dmz = ti->private;
+ int i;
queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
- dmz_resume_reclaim(dmz->reclaim);
+ for (i = 0; i < dmz->nr_ddevs; i++)
+ dmz_resume_reclaim(dmz->dev[i].reclaim);
}
static int dmz_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct dmz_target *dmz = ti->private;
- struct dmz_dev *dev = dmz->dev;
- sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1);
+ unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
+ sector_t capacity;
+ int i, r;
- return fn(ti, dmz->ddev, 0, capacity, data);
+ for (i = 0; i < dmz->nr_ddevs; i++) {
+ capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
+ r = fn(ti, dmz->ddev[i], 0, capacity, data);
+ if (r)
+ break;
+ }
+ return r;
+}
+
+static void dmz_status(struct dm_target *ti, status_type_t type,
+ unsigned int status_flags, char *result,
+ unsigned int maxlen)
+{
+ struct dmz_target *dmz = ti->private;
+ ssize_t sz = 0;
+ char buf[BDEVNAME_SIZE];
+ struct dmz_dev *dev;
+ int i;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%u zones %u/%u cache",
+ dmz_nr_zones(dmz->metadata),
+ dmz_nr_unmap_cache_zones(dmz->metadata),
+ dmz_nr_cache_zones(dmz->metadata));
+ for (i = 0; i < dmz->nr_ddevs; i++) {
+ /*
+ * For a multi-device setup the first device
+ * contains only cache zones.
+ */
+ if ((i == 0) &&
+ (dmz_nr_cache_zones(dmz->metadata) > 0))
+ continue;
+ DMEMIT(" %u/%u random %u/%u sequential",
+ dmz_nr_unmap_rnd_zones(dmz->metadata, i),
+ dmz_nr_rnd_zones(dmz->metadata, i),
+ dmz_nr_unmap_seq_zones(dmz->metadata, i),
+ dmz_nr_seq_zones(dmz->metadata, i));
+ }
+ break;
+ case STATUSTYPE_TABLE:
+ dev = &dmz->dev[0];
+ format_dev_t(buf, dev->bdev->bd_dev);
+ DMEMIT("%s", buf);
+ for (i = 1; i < dmz->nr_ddevs; i++) {
+ dev = &dmz->dev[i];
+ format_dev_t(buf, dev->bdev->bd_dev);
+ DMEMIT(" %s", buf);
+ }
+ break;
+ }
+ return;
+}
+
+static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
+ char *result, unsigned int maxlen)
+{
+ struct dmz_target *dmz = ti->private;
+ int r = -EINVAL;
+
+ if (!strcasecmp(argv[0], "reclaim")) {
+ int i;
+
+ for (i = 0; i < dmz->nr_ddevs; i++)
+ dmz_schedule_reclaim(dmz->dev[i].reclaim);
+ r = 0;
+ } else
+ DMERR("unrecognized message %s", argv[0]);
+ return r;
}
static struct target_type dmz_type = {
.name = "zoned",
- .version = {1, 0, 0},
- .features = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
+ .version = {2, 0, 0},
+ .features = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL,
.module = THIS_MODULE,
.ctr = dmz_ctr,
.dtr = dmz_dtr,
@@ -978,6 +1153,8 @@ static struct target_type dmz_type = {
.postsuspend = dmz_suspend,
.resume = dmz_resume,
.iterate_devices = dmz_iterate_devices,
+ .status = dmz_status,
+ .message = dmz_message,
};
static int __init dmz_init(void)
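
The new dmz_status() above emits one cache counter pair for the whole target followed by a random/sequential pair per device, skipping device 0 when it only holds cache zones. Here is a rough sketch of the resulting STATUSTYPE_INFO line, built with plain snprintf() instead of DMEMIT(); all counts are invented for illustration.

#include <stdio.h>

int main(void)
{
	char buf[128];
	int len;

	/* Hypothetical counts for a two-device (regular + zoned) setup. */
	unsigned int nr_zones = 512, unmap_cache = 10, nr_cache = 64;
	unsigned int unmap_rnd = 3, nr_rnd = 16, unmap_seq = 200, nr_seq = 432;

	len = snprintf(buf, sizeof(buf), "%u zones %u/%u cache",
		       nr_zones, unmap_cache, nr_cache);
	/*
	 * Device 0 only carries cache zones in a multi-device setup and is
	 * skipped; device 1 contributes its random/sequential counters.
	 */
	snprintf(buf + len, sizeof(buf) - len,
		 " %u/%u random %u/%u sequential",
		 unmap_rnd, nr_rnd, unmap_seq, nr_seq);
	printf("%s\n", buf);
	return 0;
}
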
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index 5b5e493..22f1144 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -45,34 +45,50 @@
#define dmz_bio_block(bio) dmz_sect2blk((bio)->bi_iter.bi_sector)
#define dmz_bio_blocks(bio) dmz_sect2blk(bio_sectors(bio))
+struct dmz_metadata;
+struct dmz_reclaim;
+
/*
* Zoned block device information.
*/
struct dmz_dev {
struct block_device *bdev;
+ struct dmz_metadata *metadata;
+ struct dmz_reclaim *reclaim;
char name[BDEVNAME_SIZE];
+ uuid_t uuid;
sector_t capacity;
+ unsigned int dev_idx;
+
unsigned int nr_zones;
+ unsigned int zone_offset;
unsigned int flags;
sector_t zone_nr_sectors;
- unsigned int zone_nr_sectors_shift;
- sector_t zone_nr_blocks;
- sector_t zone_nr_blocks_shift;
+ unsigned int nr_rnd;
+ atomic_t unmap_nr_rnd;
+ struct list_head unmap_rnd_list;
+ struct list_head map_rnd_list;
+
+ unsigned int nr_seq;
+ atomic_t unmap_nr_seq;
+ struct list_head unmap_seq_list;
+ struct list_head map_seq_list;
};
-#define dmz_bio_chunk(dev, bio) ((bio)->bi_iter.bi_sector >> \
- (dev)->zone_nr_sectors_shift)
-#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1))
+#define dmz_bio_chunk(zmd, bio) ((bio)->bi_iter.bi_sector >> \
+ dmz_zone_nr_sectors_shift(zmd))
+#define dmz_chunk_block(zmd, b) ((b) & (dmz_zone_nr_blocks(zmd) - 1))
/* Device flags. */
#define DMZ_BDEV_DYING (1 << 0)
#define DMZ_CHECK_BDEV (2 << 0)
+#define DMZ_BDEV_REGULAR (4 << 0)
/*
* Zone descriptor.
@@ -81,12 +97,18 @@ struct dm_zone {
/* For listing the zone depending on its state */
struct list_head link;
+ /* Device containing this zone */
+ struct dmz_dev *dev;
+
/* Zone type and state */
unsigned long flags;
/* Zone activation reference count */
atomic_t refcount;
+ /* Zone id */
+ unsigned int id;
+
/* Zone write pointer block (relative to the zone start block) */
unsigned int wp_block;
@@ -109,6 +131,7 @@ struct dm_zone {
*/
enum {
/* Zone write type */
+ DMZ_CACHE,
DMZ_RND,
DMZ_SEQ,
@@ -120,22 +143,28 @@ enum {
DMZ_META,
DMZ_DATA,
DMZ_BUF,
+ DMZ_RESERVED,
/* Zone internal state */
DMZ_RECLAIM,
DMZ_SEQ_WRITE_ERR,
+ DMZ_RECLAIM_TERMINATE,
};
/*
* Zone data accessors.
*/
+#define dmz_is_cache(z) test_bit(DMZ_CACHE, &(z)->flags)
#define dmz_is_rnd(z) test_bit(DMZ_RND, &(z)->flags)
#define dmz_is_seq(z) test_bit(DMZ_SEQ, &(z)->flags)
#define dmz_is_empty(z) ((z)->wp_block == 0)
#define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags)
#define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags)
#define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags)
+#define dmz_is_reserved(z) test_bit(DMZ_RESERVED, &(z)->flags)
#define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags)
+#define dmz_reclaim_should_terminate(z) \
+ test_bit(DMZ_RECLAIM_TERMINATE, &(z)->flags)
#define dmz_is_meta(z) test_bit(DMZ_META, &(z)->flags)
#define dmz_is_buf(z) test_bit(DMZ_BUF, &(z)->flags)
@@ -158,13 +187,11 @@ enum {
#define dmz_dev_debug(dev, format, args...) \
DMDEBUG("(%s): " format, (dev)->name, ## args)
-struct dmz_metadata;
-struct dmz_reclaim;
-
/*
* Functions defined in dm-zoned-metadata.c
*/
-int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **zmd);
+int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
+ struct dmz_metadata **zmd, const char *devname);
void dmz_dtr_metadata(struct dmz_metadata *zmd);
int dmz_resume_metadata(struct dmz_metadata *zmd);
@@ -175,23 +202,38 @@ void dmz_unlock_metadata(struct dmz_metadata *zmd);
void dmz_lock_flush(struct dmz_metadata *zmd);
void dmz_unlock_flush(struct dmz_metadata *zmd);
int dmz_flush_metadata(struct dmz_metadata *zmd);
+const char *dmz_metadata_label(struct dmz_metadata *zmd);
-unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone);
sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone);
sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone);
unsigned int dmz_nr_chunks(struct dmz_metadata *zmd);
-#define DMZ_ALLOC_RND 0x01
-#define DMZ_ALLOC_RECLAIM 0x02
+bool dmz_check_dev(struct dmz_metadata *zmd);
+bool dmz_dev_is_dying(struct dmz_metadata *zmd);
-struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags);
+#define DMZ_ALLOC_RND 0x01
+#define DMZ_ALLOC_CACHE 0x02
+#define DMZ_ALLOC_SEQ 0x04
+#define DMZ_ALLOC_RECLAIM 0x10
+
+struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd,
+ unsigned int dev_idx, unsigned long flags);
void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
unsigned int chunk);
void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
-unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
-unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd);
+unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx);
+unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx);
+unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx);
+unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx);
+unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd);
+unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd);
+unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd);
+unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd);
/*
* Activate a zone (increment its reference count).
@@ -201,26 +243,10 @@ static inline void dmz_activate_zone(struct dm_zone *zone)
atomic_inc(&zone->refcount);
}
-/*
- * Deactivate a zone. This decrement the zone reference counter
- * indicating that all BIOs to the zone have completed when the count is 0.
- */
-static inline void dmz_deactivate_zone(struct dm_zone *zone)
-{
- atomic_dec(&zone->refcount);
-}
-
-/*
- * Test if a zone is active, that is, has a refcount > 0.
- */
-static inline bool dmz_is_active(struct dm_zone *zone)
-{
- return atomic_read(&zone->refcount);
-}
-
int dmz_lock_zone_reclaim(struct dm_zone *zone);
void dmz_unlock_zone_reclaim(struct dm_zone *zone);
-struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd);
+struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
+ unsigned int dev_idx, bool idle);
struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd,
unsigned int chunk, int op);
@@ -244,8 +270,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
/*
* Functions defined in dm-zoned-reclaim.c
*/
-int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
- struct dmz_reclaim **zrc);
+int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc, int idx);
void dmz_dtr_reclaim(struct dmz_reclaim *zrc);
void dmz_suspend_reclaim(struct dmz_reclaim *zrc);
void dmz_resume_reclaim(struct dmz_reclaim *zrc);
@@ -258,4 +283,22 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
bool dmz_check_bdev(struct dmz_dev *dmz_dev);
+/*
+ * Deactivate a zone. This decrements the zone reference counter,
+ * indicating that all BIOs to the zone have completed when the count is 0.
+ */
+static inline void dmz_deactivate_zone(struct dm_zone *zone)
+{
+ dmz_reclaim_bio_acc(zone->dev->reclaim);
+ atomic_dec(&zone->refcount);
+}
+
+/*
+ * Test if a zone is active, that is, has a refcount > 0.
+ */
+static inline bool dmz_is_active(struct dm_zone *zone)
+{
+ return atomic_read(&zone->refcount);
+}
+
#endif /* DM_ZONED_H */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 530c0fe..6030cba 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -26,6 +26,8 @@
#include <linux/wait.h>
#include <linux/pr.h>
#include <linux/refcount.h>
+#include <linux/part_stat.h>
+#include <linux/blk-crypto.h>
#define DM_MSG_PREFIX "core"
@@ -430,21 +432,6 @@ static void do_deferred_remove(struct work_struct *w)
dm_deferred_remove();
}
-sector_t dm_get_size(struct mapped_device *md)
-{
- return get_capacity(md->disk);
-}
-
-struct request_queue *dm_get_md_queue(struct mapped_device *md)
-{
- return md->queue;
-}
-
-struct dm_stats *dm_get_stats(struct mapped_device *md)
-{
- return &md->stats;
-}
-
static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
struct mapped_device *md = bdev->bd_disk->private_data;
@@ -452,14 +439,48 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
-static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones)
-{
#ifdef CONFIG_BLK_DEV_ZONED
+int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
+{
+ struct dm_report_zones_args *args = data;
+ sector_t sector_diff = args->tgt->begin - args->start;
+
+ /*
+ * Ignore zones beyond the target range.
+ */
+ if (zone->start >= args->start + args->tgt->len)
+ return 0;
+
+ /*
+ * Remap the start sector and write pointer position of the zone
+ * to match its position in the target range.
+ */
+ zone->start += sector_diff;
+ if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
+ if (zone->cond == BLK_ZONE_COND_FULL)
+ zone->wp = zone->start + zone->len;
+ else if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->wp = zone->start;
+ else
+ zone->wp += sector_diff;
+ }
+
+ args->next_sector = zone->start + zone->len;
+ return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+}
+EXPORT_SYMBOL_GPL(dm_report_zones_cb);
+
+static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
struct mapped_device *md = disk->private_data;
- struct dm_target *tgt;
struct dm_table *map;
int srcu_idx, ret;
+ struct dm_report_zones_args args = {
+ .next_sector = sector,
+ .orig_data = data,
+ .orig_cb = cb,
+ };
if (dm_suspended_md(md))
return -EAGAIN;
@@ -470,38 +491,31 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
goto out;
}
- tgt = dm_table_find_target(map, sector);
- if (!tgt) {
- ret = -EIO;
- goto out;
- }
+ do {
+ struct dm_target *tgt;
- /*
- * If we are executing this, we already know that the block device
- * is a zoned device and so each target should have support for that
- * type of drive. A missing report_zones method means that the target
- * driver has a problem.
- */
- if (WARN_ON(!tgt->type->report_zones)) {
- ret = -EIO;
- goto out;
- }
+ tgt = dm_table_find_target(map, args.next_sector);
+ if (WARN_ON_ONCE(!tgt->type->report_zones)) {
+ ret = -EIO;
+ goto out;
+ }
- /*
- * blkdev_report_zones() will loop and call this again to cover all the
- * zones of the target, eventually moving on to the next target.
- * So there is no need to loop here trying to fill the entire array
- * of zones.
- */
- ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
+ args.tgt = tgt;
+ ret = tgt->type->report_zones(tgt, &args,
+ nr_zones - args.zone_idx);
+ if (ret < 0)
+ goto out;
+ } while (args.zone_idx < nr_zones &&
+ args.next_sector < get_capacity(disk));
+ ret = args.zone_idx;
out:
dm_put_live_table(md, srcu_idx);
return ret;
-#else
- return -ENOTSUPP;
-#endif
}
+#else
+#define dm_blk_report_zones NULL
+#endif /* CONFIG_BLK_DEV_ZONED */
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
struct block_device **bdev)
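The new dm_report_zones_cb() above shifts each zone reported by the backing device into the DM device's own sector space before invoking the original callback. A minimal worked example of that arithmetic, with every number assumed for illustration (a target beginning at DM sector 0 that maps the backing device from sector 8388608, zone size 524288 sectors):

    /* zone reported by the backing device: start 8912896, wp 8916992 */
    sector_t sector_diff = 0 - 8388608;        /* tgt->begin - args->start, wraps modulo 2^64 */
    sector_t start = 8912896 + sector_diff;    /* 524288: the second zone of the DM device */
    sector_t wp    = 8916992 + sector_diff;    /* 528384: 4096 sectors past the remapped start */
    sector_t next  = start + 524288;           /* 1048576: next_sector for the outer report loop */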
@@ -572,7 +586,44 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
return r;
}
-static void start_io_acct(struct dm_io *io);
+u64 dm_start_time_ns_from_clone(struct bio *bio)
+{
+ struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+ struct dm_io *io = tio->io;
+
+ return jiffies_to_nsecs(io->start_time);
+}
+EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
+
+static void start_io_acct(struct dm_io *io)
+{
+ struct mapped_device *md = io->md;
+ struct bio *bio = io->orig_bio;
+
+ io->start_time = bio_start_io_acct(bio);
+ if (unlikely(dm_stats_used(&md->stats)))
+ dm_stats_account_io(&md->stats, bio_data_dir(bio),
+ bio->bi_iter.bi_sector, bio_sectors(bio),
+ false, 0, &io->stats_aux);
+}
+
+static void end_io_acct(struct dm_io *io)
+{
+ struct mapped_device *md = io->md;
+ struct bio *bio = io->orig_bio;
+ unsigned long duration = jiffies - io->start_time;
+
+ bio_end_io_acct(bio, io->start_time);
+
+ if (unlikely(dm_stats_used(&md->stats)))
+ dm_stats_account_io(&md->stats, bio_data_dir(bio),
+ bio->bi_iter.bi_sector, bio_sectors(bio),
+ true, duration, &io->stats_aux);
+
+ /* nudge anyone waiting on suspend queue */
+ if (unlikely(wq_has_sleeper(&md->wait)))
+ wake_up(&md->wait);
+}
static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
{
@@ -638,63 +689,6 @@ static void free_tio(struct dm_target_io *tio)
bio_put(&tio->clone);
}
-static bool md_in_flight_bios(struct mapped_device *md)
-{
- int cpu;
- struct hd_struct *part = &dm_disk(md)->part0;
- long sum = 0;
-
- for_each_possible_cpu(cpu) {
- sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
- sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
- }
-
- return sum != 0;
-}
-
-static bool md_in_flight(struct mapped_device *md)
-{
- if (queue_is_mq(md->queue))
- return blk_mq_queue_inflight(md->queue);
- else
- return md_in_flight_bios(md);
-}
-
-static void start_io_acct(struct dm_io *io)
-{
- struct mapped_device *md = io->md;
- struct bio *bio = io->orig_bio;
-
- io->start_time = jiffies;
-
- generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
- &dm_disk(md)->part0);
-
- if (unlikely(dm_stats_used(&md->stats)))
- dm_stats_account_io(&md->stats, bio_data_dir(bio),
- bio->bi_iter.bi_sector, bio_sectors(bio),
- false, 0, &io->stats_aux);
-}
-
-static void end_io_acct(struct dm_io *io)
-{
- struct mapped_device *md = io->md;
- struct bio *bio = io->orig_bio;
- unsigned long duration = jiffies - io->start_time;
-
- generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
- io->start_time);
-
- if (unlikely(dm_stats_used(&md->stats)))
- dm_stats_account_io(&md->stats, bio_data_dir(bio),
- bio->bi_iter.bi_sector, bio_sectors(bio),
- true, duration, &io->stats_aux);
-
- /* nudge anyone waiting on suspend queue */
- if (unlikely(wq_has_sleeper(&md->wait)))
- wake_up(&md->wait);
-}
-
/*
* Add the bio to the list of deferred io.
*/
@@ -994,8 +988,9 @@ static void clone_endio(struct bio *bio)
struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
+ struct bio *orig_bio = io->orig_bio;
- if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
+ if (unlikely(error == BLK_STS_TARGET)) {
if (bio_op(bio) == REQ_OP_DISCARD &&
!bio->bi_disk->queue->limits.max_discard_sectors)
disable_discard(md);
@@ -1007,12 +1002,24 @@ static void clone_endio(struct bio *bio)
disable_write_zeroes(md);
}
+ /*
+ * For zone-append bios get offset in zone of the written
+ * sector and add that to the original bio sector pos.
+ */
+ if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
+ sector_t written_sector = bio->bi_iter.bi_sector;
+ struct request_queue *q = orig_bio->bi_disk->queue;
+ u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
+
+ orig_bio->bi_iter.bi_sector += written_sector & mask;
+ }
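The zone-append fix-up above depends on zone sizes being a power of two, so the offset actually written inside a zone can be recovered with a mask. A worked example, assuming a 524288-sector (256 MiB) zone size:

    u64 mask = (u64)524288 - 1;
    sector_t written_sector = 1050624;               /* sector the drive reported as written */
    sector_t offset_in_zone = written_sector & mask; /* 2048 */
    /* orig_bio->bi_iter.bi_sector pointed at the start of the target zone;
     * adding 2048 reports the real append location back to the caller */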
+
if (endio) {
int r = endio(tio->ti, bio, &error);
switch (r) {
case DM_ENDIO_REQUEUE:
error = BLK_STS_DM_REQUEUE;
- /*FALLTHRU*/
+ fallthrough;
case DM_ENDIO_DONE:
break;
case DM_ENDIO_INCOMPLETE:
@@ -1037,29 +1044,28 @@ static void clone_endio(struct bio *bio)
* Return maximum size of I/O possible at the supplied sector up to the current
* target boundary.
*/
-static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
+static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
+ sector_t target_offset)
{
- sector_t target_offset = dm_target_offset(ti, sector);
-
return ti->len - target_offset;
}
-static sector_t max_io_len(sector_t sector, struct dm_target *ti)
+static sector_t max_io_len(struct dm_target *ti, sector_t sector)
{
- sector_t len = max_io_len_target_boundary(sector, ti);
- sector_t offset, max_len;
+ sector_t target_offset = dm_target_offset(ti, sector);
+ sector_t len = max_io_len_target_boundary(ti, target_offset);
+ sector_t max_len;
/*
- * Does the target need to split even further?
+ * Does the target need to split IO even further?
+ * - varied (per target) IO splitting is a tenet of DM; this
+ * explains why stacked chunk_sectors based splitting via
+ * blk_max_size_offset() isn't possible here. So pass in
+ * ti->max_io_len to override stacked chunk_sectors.
*/
if (ti->max_io_len) {
- offset = dm_target_offset(ti, sector);
- if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
- max_len = sector_div(offset, ti->max_io_len);
- else
- max_len = offset & (ti->max_io_len - 1);
- max_len = ti->max_io_len - max_len;
-
+ max_len = blk_max_size_offset(ti->table->md->queue,
+ target_offset, ti->max_io_len);
if (len > max_len)
len = max_len;
}
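After the rewrite above, splitting against ti->max_io_len follows the same chunk-boundary arithmetic the block layer applies to chunk_sectors. A worked example, assuming a power-of-two ti->max_io_len of 8 sectors (the case where blk_max_size_offset() reduces to a mask):

    sector_t target_offset = 21;                     /* bio offset within the target */
    sector_t to_boundary = 8 - (target_offset & 7);  /* 3 sectors left before the next boundary */
    /* a 16-sector bio at this offset is clipped to 3 sectors;
     * the remainder is resubmitted via submit_bio_noacct() */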
@@ -1115,7 +1121,7 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
goto out;
if (!ti->type->direct_access)
goto out;
- len = max_io_len(sector, ti) / PAGE_SECTORS;
+ len = max_io_len(ti, sector) / PAGE_SECTORS;
if (len < 1)
goto out;
nr_pages = min(len, nr_pages);
@@ -1195,9 +1201,37 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
return ret;
}
+static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+ size_t nr_pages)
+{
+ struct mapped_device *md = dax_get_private(dax_dev);
+ sector_t sector = pgoff * PAGE_SECTORS;
+ struct dm_target *ti;
+ int ret = -EIO;
+ int srcu_idx;
+
+ ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+ if (!ti)
+ goto out;
+ if (WARN_ON(!ti->type->dax_zero_page_range)) {
+ /*
+ * ->zero_page_range() is a mandatory dax operation. If we are
+ * here, something is wrong.
+ */
+ goto out;
+ }
+ ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
+ out:
+ dm_put_live_table(md, srcu_idx);
+
+ return ret;
+}
+
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
- * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
+ * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
+ * operations and REQ_OP_ZONE_APPEND (zone append writes).
*
* dm_accept_partial_bio informs the dm that the target only wants to process
* additional n_sectors sectors of the bio and the rest of the data should be
@@ -1227,62 +1261,18 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+
BUG_ON(bio->bi_opf & REQ_PREFLUSH);
+ BUG_ON(op_is_zone_mgmt(bio_op(bio)));
+ BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
BUG_ON(bi_size > *tio->len_ptr);
BUG_ON(n_sectors > bi_size);
+
*tio->len_ptr -= bi_size - n_sectors;
bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
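A minimal sketch of how a bio-based target's map routine might call dm_accept_partial_bio() within the restrictions documented above. Everything named my_* and the 8-sector limit are assumptions for illustration; only dm_accept_partial_bio(), dm_target_offset() and the DM_MAPIO_* return contract are real DM interfaces:

    static int my_target_map(struct dm_target *ti, struct bio *bio)
    {
            struct my_target_ctx *ctx = ti->private;      /* assumed per-target context */

            /* accept only the first 8 sectors of a large write here;
             * DM resubmits the remainder as a new bio */
            if (bio_op(bio) == REQ_OP_WRITE && bio_sectors(bio) > 8)
                    dm_accept_partial_bio(bio, 8);

            bio_set_dev(bio, ctx->dev->bdev);
            bio->bi_iter.bi_sector = ctx->start +
                    dm_target_offset(ti, bio->bi_iter.bi_sector);
            return DM_MAPIO_REMAPPED;
    }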
-/*
- * The zone descriptors obtained with a zone report indicate
- * zone positions within the underlying device of the target. The zone
- * descriptors must be remapped to match their position within the dm device.
- * The caller target should obtain the zones information using
- * blkdev_report_zones() to ensure that remapping for partition offset is
- * already handled.
- */
-void dm_remap_zone_report(struct dm_target *ti, sector_t start,
- struct blk_zone *zones, unsigned int *nr_zones)
-{
-#ifdef CONFIG_BLK_DEV_ZONED
- struct blk_zone *zone;
- unsigned int nrz = *nr_zones;
- int i;
-
- /*
- * Remap the start sector and write pointer position of the zones in
- * the array. Since we may have obtained from the target underlying
- * device more zones that the target size, also adjust the number
- * of zones.
- */
- for (i = 0; i < nrz; i++) {
- zone = zones + i;
- if (zone->start >= start + ti->len) {
- memset(zone, 0, sizeof(struct blk_zone) * (nrz - i));
- break;
- }
-
- zone->start = zone->start + ti->begin - start;
- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
- continue;
-
- if (zone->cond == BLK_ZONE_COND_FULL)
- zone->wp = zone->start + zone->len;
- else if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->wp = zone->start;
- else
- zone->wp = zone->wp + ti->begin - start;
- }
-
- *nr_zones = i;
-#else /* !CONFIG_BLK_DEV_ZONED */
- *nr_zones = 0;
-#endif
-}
-EXPORT_SYMBOL_GPL(dm_remap_zone_report);
-
static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
{
mutex_lock(&md->swap_bios_lock);
@@ -1305,7 +1295,6 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
sector_t sector;
struct bio *clone = &tio->clone;
struct dm_io *io = tio->io;
- struct mapped_device *md = io->md;
struct dm_target *ti = tio->ti;
blk_qc_t ret = BLK_QC_T_NONE;
@@ -1335,10 +1324,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
/* the bio has been remapped so dispatch it */
trace_block_bio_remap(clone->bi_disk->queue, clone,
bio_dev(io->orig_bio), sector);
- if (md->type == DM_TYPE_NVME_BIO_BASED)
- ret = direct_make_request(clone);
- else
- ret = generic_make_request(clone);
+ ret = submit_bio_noacct(clone);
break;
case DM_MAPIO_KILL:
if (unlikely(swap_bios_limit(ti, clone))) {
@@ -1377,12 +1363,15 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
sector_t sector, unsigned len)
{
struct bio *clone = &tio->clone;
+ int r;
__bio_clone_fast(clone, bio);
- if (bio_integrity(bio)) {
- int r;
+ r = bio_crypt_clone(clone, bio, GFP_NOIO);
+ if (r < 0)
+ return r;
+ if (bio_integrity(bio)) {
if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
!dm_target_passes_integrity(tio->ti->type))) {
DMWARN("%s: the target %s doesn't support integrity data.",
@@ -1478,6 +1467,17 @@ static int __send_empty_flush(struct clone_info *ci)
{
unsigned target_nr = 0;
struct dm_target *ti;
+ struct bio flush_bio;
+
+ /*
+ * Use an on-stack bio for this, it's safe since we don't
+ * need to reference it after submit. It's just used as
+ * the basis for the clone(s).
+ */
+ bio_init(&flush_bio, NULL, 0);
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ ci->bio = &flush_bio;
+ ci->sector_count = 0;
/*
* Empty flush uses a statically initialized bio, as the base for
@@ -1491,6 +1491,8 @@ static int __send_empty_flush(struct clone_info *ci)
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+
+ bio_uninit(ci->bio);
return 0;
}
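__send_empty_flush() now owns the on-stack template bio it clones for every target, keeping the bio_init()/bio_uninit() pair in one place. The underlying pattern in isolation, with nothing assumed beyond what the hunk above shows:

    struct bio flush_bio;

    bio_init(&flush_bio, NULL, 0);   /* no bvecs: a flush carries no data */
    flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
    /* ... clone once per target via __send_duplicate_bios() ... */
    bio_uninit(&flush_bio);          /* release anything attached to the template */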
@@ -1513,28 +1515,6 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
return 0;
}
-typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
-
-static unsigned get_num_discard_bios(struct dm_target *ti)
-{
- return ti->num_discard_bios;
-}
-
-static unsigned get_num_secure_erase_bios(struct dm_target *ti)
-{
- return ti->num_secure_erase_bios;
-}
-
-static unsigned get_num_write_same_bios(struct dm_target *ti)
-{
- return ti->num_write_same_bios;
-}
-
-static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
-{
- return ti->num_write_zeroes_bios;
-}
-
static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
unsigned num_bios)
{
@@ -1549,7 +1529,8 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
if (!num_bios)
return -EOPNOTSUPP;
- len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+ len = min_t(sector_t, ci->sector_count,
+ max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
__send_duplicate_bios(ci, ti, num_bios, &len);
@@ -1559,26 +1540,6 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
return 0;
}
-static int __send_discard(struct clone_info *ci, struct dm_target *ti)
-{
- return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti));
-}
-
-static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
-{
- return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti));
-}
-
-static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
-{
- return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti));
-}
-
-static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
-{
- return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti));
-}
-
static bool is_abnormal_io(struct bio *bio)
{
bool r = false;
@@ -1599,18 +1560,26 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
int *result)
{
struct bio *bio = ci->bio;
+ unsigned num_bios = 0;
- if (bio_op(bio) == REQ_OP_DISCARD)
- *result = __send_discard(ci, ti);
- else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
- *result = __send_secure_erase(ci, ti);
- else if (bio_op(bio) == REQ_OP_WRITE_SAME)
- *result = __send_write_same(ci, ti);
- else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
- *result = __send_write_zeroes(ci, ti);
- else
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ num_bios = ti->num_discard_bios;
+ break;
+ case REQ_OP_SECURE_ERASE:
+ num_bios = ti->num_secure_erase_bios;
+ break;
+ case REQ_OP_WRITE_SAME:
+ num_bios = ti->num_write_same_bios;
+ break;
+ case REQ_OP_WRITE_ZEROES:
+ num_bios = ti->num_write_zeroes_bios;
+ break;
+ default:
return false;
+ }
+ *result = __send_changing_extent_only(ci, ti, num_bios);
return true;
}
@@ -1630,7 +1599,7 @@ static int __split_and_process_non_flush(struct clone_info *ci)
if (__process_abnormal_io(ci, ti, &r))
return r;
- len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
+ len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
if (r < 0)
@@ -1666,21 +1635,9 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- struct bio flush_bio;
-
- /*
- * Use an on-stack bio for this, it's safe since we don't
- * need to reference it after submit. It's just used as
- * the basis for the clone(s).
- */
- bio_init(&flush_bio, NULL, 0);
- flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
- ci.bio = &flush_bio;
- ci.sector_count = 0;
error = __send_empty_flush(&ci);
- bio_uninit(ci.bio);
/* dec_pending submits any data associated with flush */
- } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
+ } else if (op_is_zone_mgmt(bio_op(bio))) {
ci.bio = bio;
ci.sector_count = 0;
error = __split_and_process_non_flush(&ci);
@@ -1691,7 +1648,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
error = __split_and_process_non_flush(&ci);
if (current->bio_list && ci.sector_count && !error) {
/*
- * Remainder must be passed to generic_make_request()
+ * Remainder must be passed to submit_bio_noacct()
* so that it gets handled *after* bios already submitted
* have been completely processed.
* We take a clone of the original to store in
@@ -1716,7 +1673,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
bio_chain(b, bio);
trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
- ret = generic_make_request(bio);
+ ret = submit_bio_noacct(bio);
break;
}
}
@@ -1727,137 +1684,45 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
return ret;
}
-/*
- * Optimized variant of __split_and_process_bio that leverages the
- * fact that targets that use it do _not_ have a need to split bios.
- */
-static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map,
- struct bio *bio, struct dm_target *ti)
+static blk_qc_t dm_submit_bio(struct bio *bio)
{
- struct clone_info ci;
- blk_qc_t ret = BLK_QC_T_NONE;
- int error = 0;
-
- init_clone_info(&ci, md, map, bio);
-
- if (bio->bi_opf & REQ_PREFLUSH) {
- struct bio flush_bio;
-
- /*
- * Use an on-stack bio for this, it's safe since we don't
- * need to reference it after submit. It's just used as
- * the basis for the clone(s).
- */
- bio_init(&flush_bio, NULL, 0);
- flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
- ci.bio = &flush_bio;
- ci.sector_count = 0;
- error = __send_empty_flush(&ci);
- bio_uninit(ci.bio);
- /* dec_pending submits any data associated with flush */
- } else {
- struct dm_target_io *tio;
-
- ci.bio = bio;
- ci.sector_count = bio_sectors(bio);
- if (__process_abnormal_io(&ci, ti, &error))
- goto out;
-
- tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
- ret = __clone_and_map_simple_bio(&ci, tio, NULL);
- }
-out:
- /* drop the extra reference count */
- dec_pending(ci.io, errno_to_blk_status(error));
- return ret;
-}
-
-static blk_qc_t dm_process_bio(struct mapped_device *md,
- struct dm_table *map, struct bio *bio)
-{
- blk_qc_t ret = BLK_QC_T_NONE;
- struct dm_target *ti = md->immutable_target;
-
- if (unlikely(!map)) {
- bio_io_error(bio);
- return ret;
- }
-
- if (!ti) {
- ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
- if (unlikely(!ti)) {
- bio_io_error(bio);
- return ret;
- }
- }
-
- /*
- * If in ->make_request_fn we need to use blk_queue_split(), otherwise
- * queue_limits for abnormal requests (e.g. discard, writesame, etc)
- * won't be imposed.
- */
- if (current->bio_list) {
- if (is_abnormal_io(bio))
- blk_queue_split(md->queue, &bio);
- /* regular IO is split by __split_and_process_bio */
- }
-
- if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
- return __process_bio(md, map, bio, ti);
- return __split_and_process_bio(md, map, bio);
-}
-
-static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
-{
- struct mapped_device *md = q->queuedata;
+ struct mapped_device *md = bio->bi_disk->private_data;
blk_qc_t ret = BLK_QC_T_NONE;
int srcu_idx;
struct dm_table *map;
map = dm_get_live_table(md, &srcu_idx);
-
- /* if we're suspended, we have to queue this io for later */
- if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
- dm_put_live_table(md, srcu_idx);
-
- if (!(bio->bi_opf & REQ_RAHEAD))
- queue_io(md, bio);
- else
- bio_io_error(bio);
- return ret;
+ if (unlikely(!map)) {
+ DMERR_LIMIT("%s: mapping table unavailable, erroring io",
+ dm_device_name(md));
+ bio_io_error(bio);
+ goto out;
}
- ret = dm_process_bio(md, map, bio);
+ /* If suspended, queue this IO for later */
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
+ if (bio->bi_opf & REQ_NOWAIT)
+ bio_wouldblock_error(bio);
+ else if (bio->bi_opf & REQ_RAHEAD)
+ bio_io_error(bio);
+ else
+ queue_io(md, bio);
+ goto out;
+ }
+ /*
+ * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
+ * otherwise associated queue_limits won't be imposed.
+ */
+ if (is_abnormal_io(bio))
+ blk_queue_split(&bio);
+
+ ret = __split_and_process_bio(md, map, bio);
+out:
dm_put_live_table(md, srcu_idx);
return ret;
}
-static int dm_any_congested(void *congested_data, int bdi_bits)
-{
- int r = bdi_bits;
- struct mapped_device *md = congested_data;
- struct dm_table *map;
-
- if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
- if (dm_request_based(md)) {
- /*
- * With request-based DM we only need to check the
- * top-level queue for congestion.
- */
- struct backing_dev_info *bdi = md->queue->backing_dev_info;
- r = bdi->wb.congested->state & bdi_bits;
- } else {
- map = dm_get_live_table_fast(md);
- if (map)
- r = dm_table_any_congested(map, bdi_bits);
- dm_put_live_table_fast(md);
- }
- }
-
- return r;
-}
-
/*-----------------------------------------------------------------
* An IDR is used to keep track of allocated minor numbers.
*---------------------------------------------------------------*/
@@ -1908,6 +1773,7 @@ static int next_free_minor(int *minor)
}
static const struct block_device_operations dm_blk_dops;
+static const struct block_device_operations dm_rq_blk_dops;
static const struct dax_operations dm_dax_ops;
static void dm_wq_work(struct work_struct *work);
@@ -1996,16 +1862,14 @@ static struct mapped_device *alloc_dev(int minor)
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
- md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
+ /*
+ * default to bio-based until DM table is loaded and md->type
+ * established. If request-based table is loaded: blk-mq will
+ * override accordingly.
+ */
+ md->queue = blk_alloc_queue(numa_node_id);
if (!md->queue)
goto bad;
- md->queue->queuedata = md;
- /*
- * default to bio-based required ->make_request_fn until DM
- * table is loaded and md->type established. If request-based
- * table is loaded: blk-mq will override accordingly.
- */
- blk_queue_make_request(md->queue, dm_make_request);
md->disk = alloc_disk_node(1, md->numa_node_id);
if (!md->disk)
@@ -2030,8 +1894,10 @@ static struct mapped_device *alloc_dev(int minor)
if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
md->dax_dev = alloc_dax(md, md->disk->disk_name,
&dm_dax_ops, 0);
- if (!md->dax_dev)
+ if (IS_ERR(md->dax_dev)) {
+ md->dax_dev = NULL;
goto bad;
+ }
}
add_disk_no_queue_reg(md->disk);
@@ -2148,18 +2014,6 @@ static void event_callback(void *context)
}
/*
- * Protected by md->suspend_lock obtained by dm_swap_table().
- */
-static void __set_size(struct mapped_device *md, sector_t size)
-{
- lockdep_assert_held(&md->suspend_lock);
-
- set_capacity(md->disk, size);
-
- i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-}
-
-/*
* Returns old map, which caller must destroy.
*/
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -2181,7 +2035,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
if (size != dm_get_size(md))
memset(&md->geometry, 0, sizeof(md->geometry));
- __set_size(md, size);
+ set_capacity(md->disk, size);
+ bd_set_nr_sectors(md->bdev, size);
dm_table_event_callback(t, event_callback, md);
@@ -2195,12 +2050,10 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
if (request_based)
dm_stop_queue(q);
- if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
+ if (request_based) {
/*
- * Leverage the fact that request-based DM targets and
- * NVMe bio based targets are immutable singletons
- * - used to optimize both dm_request_fn and dm_mq_queue_rq;
- * and __process_bio.
+ * Leverage the fact that request-based DM targets are
+ * immutable singletons - used to optimize dm_mq_queue_rq.
*/
md->immutable_target = dm_table_get_immutable_target(t);
}
@@ -2303,12 +2156,6 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);
-static void dm_init_congested_fn(struct mapped_device *md)
-{
- md->queue->backing_dev_info->congested_data = md;
- md->queue->backing_dev_info->congested_fn = dm_any_congested;
-}
-
/*
* Setup the DM device's queue based on md's type
*/
@@ -2320,17 +2167,15 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
switch (type) {
case DM_TYPE_REQUEST_BASED:
+ md->disk->fops = &dm_rq_blk_dops;
r = dm_mq_init_request_queue(md, t);
if (r) {
- DMERR("Cannot initialize queue for request-based dm-mq mapped device");
+ DMERR("Cannot initialize queue for request-based dm mapped device");
return r;
}
- dm_init_congested_fn(md);
break;
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
- case DM_TYPE_NVME_BIO_BASED:
- dm_init_congested_fn(md);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
@@ -2471,15 +2316,29 @@ void dm_put(struct mapped_device *md)
}
EXPORT_SYMBOL_GPL(dm_put);
-static int dm_wait_for_completion(struct mapped_device *md, long task_state)
+static bool md_in_flight_bios(struct mapped_device *md)
+{
+ int cpu;
+ struct hd_struct *part = &dm_disk(md)->part0;
+ long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+ sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+ }
+
+ return sum != 0;
+}
+
+static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
{
int r = 0;
DEFINE_WAIT(wait);
- while (1) {
+ while (true) {
prepare_to_wait(&md->wait, &wait, task_state);
- if (!md_in_flight(md))
+ if (!md_in_flight_bios(md))
break;
if (signal_pending_state(task_state, current)) {
@@ -2494,34 +2353,46 @@ static int dm_wait_for_completion(struct mapped_device *md, long task_state)
return r;
}
+static int dm_wait_for_completion(struct mapped_device *md, long task_state)
+{
+ int r = 0;
+
+ if (!queue_is_mq(md->queue))
+ return dm_wait_for_bios_completion(md, task_state);
+
+ while (true) {
+ if (!blk_mq_queue_inflight(md->queue))
+ break;
+
+ if (signal_pending_state(task_state, current)) {
+ r = -EINTR;
+ break;
+ }
+
+ msleep(5);
+ }
+
+ return r;
+}
+
/*
* Process the deferred bios
*/
static void dm_wq_work(struct work_struct *work)
{
- struct mapped_device *md = container_of(work, struct mapped_device,
- work);
- struct bio *c;
- int srcu_idx;
- struct dm_table *map;
-
- map = dm_get_live_table(md, &srcu_idx);
+ struct mapped_device *md = container_of(work, struct mapped_device, work);
+ struct bio *bio;
while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
spin_lock_irq(&md->deferred_lock);
- c = bio_list_pop(&md->deferred);
+ bio = bio_list_pop(&md->deferred);
spin_unlock_irq(&md->deferred_lock);
- if (!c)
+ if (!bio)
break;
- if (dm_request_based(md))
- (void) generic_make_request(c);
- else
- (void) dm_process_bio(md, map, c);
+ submit_bio_noacct(bio);
}
-
- dm_put_live_table(md, srcu_idx);
}
static void dm_queue_flush(struct mapped_device *md)
@@ -2633,7 +2504,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
if (noflush)
set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
else
- pr_debug("%s: suspending with flush\n", dm_device_name(md));
+ DMDEBUG("%s: suspending with flush", dm_device_name(md));
/*
* This gets reverted if there's an error later and the targets
@@ -2658,13 +2529,12 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
/*
* Here we must make sure that no processes are submitting requests
* to target drivers i.e. no one may be executing
- * __split_and_process_bio. This is called from dm_request and
- * dm_wq_work.
+ * __split_and_process_bio from dm_submit_bio.
*
- * To get all processes out of __split_and_process_bio in dm_request,
+ * To get all processes out of __split_and_process_bio in dm_submit_bio,
* we take the write lock. To prevent any process from reentering
- * __split_and_process_bio from dm_request and quiesce the thread
- * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
+ * __split_and_process_bio from dm_submit_bio and quiesce the thread
+ * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
* flush_workqueue(md->wq).
*/
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
@@ -3032,19 +2902,19 @@ int dm_test_deferred_remove_flag(struct mapped_device *md)
int dm_suspended(struct dm_target *ti)
{
- return dm_suspended_md(dm_table_get_md(ti->table));
+ return dm_suspended_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_suspended);
int dm_post_suspending(struct dm_target *ti)
{
- return dm_post_suspending_md(dm_table_get_md(ti->table));
+ return dm_post_suspending_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_post_suspending);
int dm_noflush_suspending(struct dm_target *ti)
{
- return __noflush_suspending(dm_table_get_md(ti->table));
+ return __noflush_suspending(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
@@ -3063,7 +2933,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
switch (type) {
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
- case DM_TYPE_NVME_BIO_BASED:
pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
@@ -3271,6 +3140,7 @@ static const struct pr_ops dm_pr_ops = {
};
static const struct block_device_operations dm_blk_dops = {
+ .submit_bio = dm_submit_bio,
.open = dm_blk_open,
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
@@ -3280,11 +3150,21 @@ static const struct block_device_operations dm_blk_dops = {
.owner = THIS_MODULE
};
+static const struct block_device_operations dm_rq_blk_dops = {
+ .open = dm_blk_open,
+ .release = dm_blk_close,
+ .ioctl = dm_blk_ioctl,
+ .getgeo = dm_blk_getgeo,
+ .pr_ops = &dm_pr_ops,
+ .owner = THIS_MODULE
+};
+
static const struct dax_operations dm_dax_ops = {
.direct_access = dm_dax_direct_access,
.dax_supported = dm_dax_supported,
.copy_from_iter = dm_dax_copy_from_iter,
.copy_to_iter = dm_dax_copy_to_iter,
+ .zero_page_range = dm_dax_zero_page_range,
};
/*
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 9fbf87e..b441ad7 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -63,7 +63,6 @@ void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_presuspend_undo_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
int dm_table_resume_targets(struct dm_table *t);
-int dm_table_any_congested(struct dm_table *t, int bdi_bits);
enum dm_queue_mode dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
@@ -180,12 +179,9 @@ int dm_open_count(struct mapped_device *md);
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
int dm_cancel_deferred_remove(struct mapped_device *md);
int dm_request_based(struct mapped_device *md);
-sector_t dm_get_size(struct mapped_device *md);
-struct request_queue *dm_get_md_queue(struct mapped_device *md);
int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
struct dm_dev **result);
void dm_put_table_device(struct mapped_device *md, struct dm_dev *d);
-struct dm_stats *dm_get_stats(struct mapped_device *md);
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
unsigned cookie);
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
new file mode 100644
index 0000000..2cf9737
--- /dev/null
+++ b/drivers/md/md-autodetect.c
@@ -0,0 +1,291 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/major.h>
+#include <linux/delay.h>
+#include <linux/init_syscalls.h>
+#include <linux/raid/detect.h>
+#include <linux/raid/md_u.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+
+/*
+ * When md (and any required personalities) are compiled into the kernel
+ * (not a module), arrays can be assembled at boot time with AUTODETECT,
+ * where specially marked partitions are registered with md_autodetect_dev(),
+ * and with MD_BOOT where devices to be collected are given on the boot line
+ * with md=.....
+ * The code for that is here.
+ */
+
+#ifdef CONFIG_MD_AUTODETECT
+static int __initdata raid_noautodetect;
+#else
+static int __initdata raid_noautodetect=1;
+#endif
+static int __initdata raid_autopart;
+
+static struct md_setup_args {
+ int minor;
+ int partitioned;
+ int level;
+ int chunk;
+ char *device_names;
+} md_setup_args[256] __initdata;
+
+static int md_setup_ents __initdata;
+
+/*
+ * Parse the command-line parameters given our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ * assigns the task of parsing integer arguments to the
+ * invoked program now). Added ability to initialise all
+ * the MD devices (by specifying multiple "md=" lines)
+ * instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ * md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
+ * md=n,device-list reads a RAID superblock from the devices
+ * elements in device-list are read by name_to_kdev_t so can be
+ * a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ * Shifted name_to_kdev_t() and related operations to md_set_drive()
+ * for later execution. Rewrote section to make devfs compatible.
+ */
+static int __init md_setup(char *str)
+{
+ int minor, level, factor, fault, partitioned = 0;
+ char *pername = "";
+ char *str1;
+ int ent;
+
+ if (*str == 'd') {
+ partitioned = 1;
+ str++;
+ }
+ if (get_option(&str, &minor) != 2) { /* MD Number */
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ str1 = str;
+ for (ent=0 ; ent< md_setup_ents ; ent++)
+ if (md_setup_args[ent].minor == minor &&
+ md_setup_args[ent].partitioned == partitioned) {
+ printk(KERN_WARNING "md: md=%s%d, Specified more than once. "
+ "Replacing previous definition.\n", partitioned?"d":"", minor);
+ break;
+ }
+ if (ent >= ARRAY_SIZE(md_setup_args)) {
+ printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor);
+ return 0;
+ }
+ if (ent >= md_setup_ents)
+ md_setup_ents++;
+ switch (get_option(&str, &level)) { /* RAID level */
+ case 2: /* could be 0 or -1.. */
+ if (level == 0 || level == LEVEL_LINEAR) {
+ if (get_option(&str, &factor) != 2 || /* Chunk Size */
+ get_option(&str, &fault) != 2) {
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ md_setup_args[ent].level = level;
+ md_setup_args[ent].chunk = 1 << (factor+12);
+ if (level == LEVEL_LINEAR)
+ pername = "linear";
+ else
+ pername = "raid0";
+ break;
+ }
+ fallthrough;
+ case 1: /* the first device is numeric */
+ str = str1;
+ fallthrough;
+ case 0:
+ md_setup_args[ent].level = LEVEL_NONE;
+ pername="super-block";
+ }
+
+ printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
+ minor, pername, str);
+ md_setup_args[ent].device_names = str;
+ md_setup_args[ent].partitioned = partitioned;
+ md_setup_args[ent].minor = minor;
+
+ return 1;
+}
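Putting the md= formats documented in the comment above together, a few illustrative kernel command-line fragments; the device names are assumptions, and the chunk size follows from chunk = 1 << (factor + 12):

    md=0,0,2,0,/dev/sda1,/dev/sdb1     /dev/md0 as raid0, chunk = 1 << (2 + 12) = 16 KiB
    md=1,-1,0,0,/dev/sdc1,/dev/sdd1    /dev/md1 in linear mode
    md=d2,/dev/sde1,/dev/sdf1          partitioned /dev/md_d2, read from on-disk superblocks
    raid=noautodetect                  skip autodetection of marked partitions at boot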
+
+static void __init md_setup_drive(struct md_setup_args *args)
+{
+ char *devname = args->device_names;
+ dev_t devices[MD_SB_DISKS + 1], mdev;
+ struct mdu_array_info_s ainfo = { };
+ struct block_device *bdev;
+ struct mddev *mddev;
+ int err = 0, i;
+ char name[16];
+
+ if (args->partitioned) {
+ mdev = MKDEV(mdp_major, args->minor << MdpMinorShift);
+ sprintf(name, "md_d%d", args->minor);
+ } else {
+ mdev = MKDEV(MD_MAJOR, args->minor);
+ sprintf(name, "md%d", args->minor);
+ }
+
+ for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) {
+ struct kstat stat;
+ char *p;
+ char comp_name[64];
+ dev_t dev;
+
+ p = strchr(devname, ',');
+ if (p)
+ *p++ = 0;
+
+ dev = name_to_dev_t(devname);
+ if (strncmp(devname, "/dev/", 5) == 0)
+ devname += 5;
+ snprintf(comp_name, 63, "/dev/%s", devname);
+ if (init_stat(comp_name, &stat, 0) == 0 && S_ISBLK(stat.mode))
+ dev = new_decode_dev(stat.rdev);
+ if (!dev) {
+ pr_warn("md: Unknown device name: %s\n", devname);
+ break;
+ }
+
+ devices[i] = dev;
+ devname = p;
+ }
+ devices[i] = 0;
+
+ if (!i)
+ return;
+
+ pr_info("md: Loading %s: %s\n", name, args->device_names);
+
+ bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL);
+ if (IS_ERR(bdev)) {
+ pr_err("md: open failed - cannot start array %s\n", name);
+ return;
+ }
+
+ err = -EIO;
+ if (WARN(bdev->bd_disk->fops != &md_fops,
+ "Opening block device %x resulted in non-md device\n",
+ mdev))
+ goto out_blkdev_put;
+
+ mddev = bdev->bd_disk->private_data;
+
+ err = mddev_lock(mddev);
+ if (err) {
+ pr_err("md: failed to lock array %s\n", name);
+ goto out_blkdev_put;
+ }
+
+ if (!list_empty(&mddev->disks) || mddev->raid_disks) {
+ pr_warn("md: Ignoring %s, already autodetected. (Use raid=noautodetect)\n",
+ name);
+ goto out_unlock;
+ }
+
+ if (args->level != LEVEL_NONE) {
+ /* non-persistent */
+ ainfo.level = args->level;
+ ainfo.md_minor = args->minor;
+ ainfo.not_persistent = 1;
+ ainfo.state = (1 << MD_SB_CLEAN);
+ ainfo.chunk_size = args->chunk;
+ while (devices[ainfo.raid_disks])
+ ainfo.raid_disks++;
+ }
+
+ err = md_set_array_info(mddev, &ainfo);
+
+ for (i = 0; i <= MD_SB_DISKS && devices[i]; i++) {
+ struct mdu_disk_info_s dinfo = {
+ .major = MAJOR(devices[i]),
+ .minor = MINOR(devices[i]),
+ };
+
+ if (args->level != LEVEL_NONE) {
+ dinfo.number = i;
+ dinfo.raid_disk = i;
+ dinfo.state =
+ (1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC);
+ }
+
+ md_add_new_disk(mddev, &dinfo);
+ }
+
+ if (!err)
+ err = do_md_run(mddev);
+ if (err)
+ pr_warn("md: starting %s failed\n", name);
+out_unlock:
+ mddev_unlock(mddev);
+out_blkdev_put:
+ blkdev_put(bdev, FMODE_READ);
+}
+
+static int __init raid_setup(char *str)
+{
+ int len, pos;
+
+ len = strlen(str) + 1;
+ pos = 0;
+
+ while (pos < len) {
+ char *comma = strchr(str+pos, ',');
+ int wlen;
+ if (comma)
+ wlen = (comma-str)-pos;
+ else wlen = (len-1)-pos;
+
+ if (!strncmp(str, "noautodetect", wlen))
+ raid_noautodetect = 1;
+ if (!strncmp(str, "autodetect", wlen))
+ raid_noautodetect = 0;
+ if (strncmp(str, "partitionable", wlen)==0)
+ raid_autopart = 1;
+ if (strncmp(str, "part", wlen)==0)
+ raid_autopart = 1;
+ pos += wlen+1;
+ }
+ return 1;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+static void __init autodetect_raid(void)
+{
+ /*
+ * Since we don't want to detect and use half a raid array, we need to
+ * wait for the known devices to complete their probing
+ */
+ printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n");
+ printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n");
+
+ wait_for_device_probe();
+ md_autostart_arrays(raid_autopart);
+}
+
+void __init md_run_setup(void)
+{
+ int ent;
+
+ if (raid_noautodetect)
+ printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n");
+ else
+ autodetect_raid();
+
+ for (ent = 0; ent < md_setup_ents; ent++)
+ md_setup_drive(&md_setup_args[ent]);
+}
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index d7eef52..ea3130e 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -324,14 +324,6 @@ static void end_bitmap_write(struct buffer_head *bh, int uptodate)
wake_up(&bitmap->write_wait);
}
-/* copied from buffer.c */
-static void
-__clear_page_buffers(struct page *page)
-{
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
-}
static void free_buffers(struct page *page)
{
struct buffer_head *bh;
@@ -345,7 +337,7 @@ static void free_buffers(struct page *page)
free_buffer_head(bh);
bh = next;
}
- __clear_page_buffers(page);
+ detach_page_private(page);
put_page(page);
}
@@ -364,33 +356,38 @@ static int read_page(struct file *file, unsigned long index,
int ret = 0;
struct inode *inode = file_inode(file);
struct buffer_head *bh;
- sector_t block;
+ sector_t block, blk_cur;
+ unsigned long blocksize = i_blocksize(inode);
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT);
- bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false);
+ bh = alloc_page_buffers(page, blocksize, false);
if (!bh) {
ret = -ENOMEM;
goto out;
}
- attach_page_buffers(page, bh);
- block = index << (PAGE_SHIFT - inode->i_blkbits);
+ attach_page_private(page, bh);
+ blk_cur = index << (PAGE_SHIFT - inode->i_blkbits);
while (bh) {
+ block = blk_cur;
+
if (count == 0)
bh->b_blocknr = 0;
else {
- bh->b_blocknr = bmap(inode, block);
- if (bh->b_blocknr == 0) {
- /* Cannot use this file! */
+ ret = bmap(inode, &block);
+ if (ret || !block) {
ret = -EINVAL;
+ bh->b_blocknr = 0;
goto out;
}
+
+ bh->b_blocknr = block;
bh->b_bdev = inode->i_sb->s_bdev;
- if (count < (1<<inode->i_blkbits))
+ if (count < blocksize)
count = 0;
else
- count -= (1<<inode->i_blkbits);
+ count -= blocksize;
bh->b_end_io = end_bitmap_write;
bh->b_private = bitmap;
@@ -399,7 +396,7 @@ static int read_page(struct file *file, unsigned long index,
set_buffer_mapped(bh);
submit_bh(REQ_OP_READ, 0, bh);
}
- block++;
+ blk_cur++;
bh = bh->b_this_page;
}
page->index = index;
@@ -609,8 +606,8 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
if (bitmap->cluster_slot >= 0) {
sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
- sector_div(bm_blocks,
- bitmap->mddev->bitmap_info.chunksize >> 9);
+ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks,
+ (bitmap->mddev->bitmap_info.chunksize >> 9));
/* bits to bytes */
bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
/* to 4k blocks */
@@ -1019,8 +1016,6 @@ void md_bitmap_unplug(struct bitmap *bitmap)
/* look at each page to see if there are any set bits that need to be
* flushed out to disk */
for (i = 0; i < bitmap->storage.file_pages; i++) {
- if (!bitmap->storage.filemap)
- return;
dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
need_write = test_and_clear_page_attr(bitmap, i,
BITMAP_PAGE_NEEDWRITE);
@@ -1338,7 +1333,8 @@ void md_bitmap_daemon_work(struct mddev *mddev)
BITMAP_PAGE_DIRTY))
/* bitmap_unplug will handle the rest */
break;
- if (test_and_clear_page_attr(bitmap, j,
+ if (bitmap->storage.filemap &&
+ test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE)) {
write_page(bitmap, bitmap->storage.filemap[j], 0);
}
@@ -1438,7 +1434,7 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s
case 0:
md_bitmap_file_set_bit(bitmap, offset);
md_bitmap_count_page(&bitmap->counts, offset, 1);
- /* fall through */
+ fallthrough;
case 1:
*bmc = 2;
}
@@ -1636,7 +1632,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
s += blocks;
}
bitmap->last_end_sync = jiffies;
- sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
}
EXPORT_SYMBOL(md_bitmap_cond_end_sync);
@@ -1792,8 +1788,8 @@ void md_bitmap_destroy(struct mddev *mddev)
return;
md_bitmap_wait_behind_writes(mddev);
- mempool_destroy(mddev->wb_info_pool);
- mddev->wb_info_pool = NULL;
+ if (!mddev->serialize_policy)
+ mddev_destroy_serial_pool(mddev, NULL, true);
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1910,7 +1906,7 @@ int md_bitmap_load(struct mddev *mddev)
goto out;
rdev_for_each(rdev, mddev)
- mddev_create_wb_pool(mddev, rdev, true);
+ mddev_create_serial_pool(mddev, rdev, true);
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2479,16 +2475,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (backlog > COUNTER_MAX)
return -EINVAL;
mddev->bitmap_info.max_write_behind = backlog;
- if (!backlog && mddev->wb_info_pool) {
- /* wb_info_pool is not needed if backlog is zero */
- mempool_destroy(mddev->wb_info_pool);
- mddev->wb_info_pool = NULL;
- } else if (backlog && !mddev->wb_info_pool) {
- /* wb_info_pool is needed since backlog is not zero */
+ if (!backlog && mddev->serial_info_pool) {
+ /* serial_info_pool is not needed if backlog is zero */
+ if (!mddev->serialize_policy)
+ mddev_destroy_serial_pool(mddev, NULL, false);
+ } else if (backlog && !mddev->serial_info_pool) {
+ /* serial_info_pool is needed since backlog is not zero */
struct md_rdev *rdev;
rdev_for_each(rdev, mddev)
- mddev_create_wb_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev, false);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 794e1d5..f0e64e7 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -582,7 +582,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
break;
case CHANGE_CAPACITY:
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
break;
case RESYNCING:
set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
@@ -1305,12 +1305,12 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
__func__, __LINE__);
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
} else {
/* revert to previous sectors */
ret = mddev->pers->resize(mddev, old_dev_sectors);
if (!ret)
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
ret = __sendmsg(cinfo, &cmsg);
if (ret)
pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index 50ad4ba..fda4cb3 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -169,7 +169,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
if (bio_data_dir(bio) == WRITE) {
/* write request */
if (atomic_read(&conf->counters[WriteAll])) {
- /* special case - don't decrement, don't generic_make_request,
+ /* special case - don't decrement, don't submit_bio_noacct,
* just fail immediately
*/
bio_io_error(bio);
@@ -214,7 +214,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
} else
bio_set_dev(bio, conf->rdev->bdev);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
return true;
}
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 26c75c0..5ab2206 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -46,29 +46,6 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
return conf->disks + lo;
}
-/*
- * In linear_congested() conf->raid_disks is used as a copy of
- * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks
- * and conf->disks[] are created in linear_conf(), they are always
- * consitent with each other, but mddev->raid_disks does not.
- */
-static int linear_congested(struct mddev *mddev, int bits)
-{
- struct linear_conf *conf;
- int i, ret = 0;
-
- rcu_read_lock();
- conf = rcu_dereference(mddev->private);
-
- for (i = 0; i < conf->raid_disks && !ret ; i++) {
- struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
- ret |= bdi_congested(q->backing_dev_info, bits);
- }
-
- rcu_read_unlock();
- return ret;
-}
-
static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
struct linear_conf *conf;
@@ -225,7 +202,7 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev_resume(mddev);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
kfree_rcu(oldconf, rcu);
return 0;
}
@@ -267,7 +244,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
struct bio *split = bio_split(bio, end_sector - bio_sector,
GFP_NOIO, &mddev->bio_set);
bio_chain(split, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = split;
}
@@ -286,7 +263,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio_sector);
mddev_check_writesame(mddev, bio);
mddev_check_write_zeroes(mddev, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
return true;
@@ -322,7 +299,6 @@ static struct md_personality linear_personality =
.hot_add_disk = linear_add,
.size = linear_size,
.quiesce = linear_quiesce,
- .congested = linear_congested,
};
static int __init linear_init (void)
diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h
index 8381d65..24e97db 100644
--- a/drivers/md/md-linear.h
+++ b/drivers/md/md-linear.h
@@ -12,6 +12,6 @@ struct linear_conf
struct rcu_head rcu;
sector_t array_sectors;
int raid_disks; /* a copy of mddev->raid_disks */
- struct dev_info disks[0];
+ struct dev_info disks[];
};
#endif
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 152f9e6..776bbe5 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -131,7 +131,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
mp_bh->bio.bi_private = mp_bh;
mddev_check_writesame(mddev, &mp_bh->bio);
mddev_check_write_zeroes(mddev, &mp_bh->bio);
- generic_make_request(&mp_bh->bio);
+ submit_bio_noacct(&mp_bh->bio);
return true;
}
@@ -151,28 +151,6 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev)
seq_putc(seq, ']');
}
-static int multipath_congested(struct mddev *mddev, int bits)
-{
- struct mpconf *conf = mddev->private;
- int i, ret = 0;
-
- rcu_read_lock();
- for (i = 0; i < mddev->raid_disks ; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q = bdev_get_queue(rdev->bdev);
-
- ret |= bdi_congested(q->backing_dev_info, bits);
- /* Just like multipath_map, we just check the
- * first available device
- */
- break;
- }
- }
- rcu_read_unlock();
- return ret;
-}
-
/*
* Careful, this can execute in IRQ contexts as well!
*/
@@ -348,7 +326,7 @@ static void multipathd(struct md_thread *thread)
bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
bio->bi_end_io = multipath_end_request;
bio->bi_private = mp_bh;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -478,7 +456,6 @@ static struct md_personality multipath_personality =
.hot_add_disk = multipath_add_disk,
.hot_remove_disk= multipath_remove_disk,
.size = multipath_size,
- .congested = multipath_congested,
};
static int __init multipath_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 761d438..cc38765 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -58,18 +58,16 @@
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
+#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
+#include <linux/part_stat.h>
#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"
-#ifndef MODULE
-static void autostart_arrays(int part);
-#endif
-
/* pers_list is a list of registered personalities protected
* by pers_lock.
* pers_lock does extra service to protect accesses to
@@ -87,6 +85,7 @@ static struct module *md_cluster_mod;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
+static struct workqueue_struct *md_rdev_misc_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
@@ -98,6 +97,8 @@ static void mddev_detach(struct mddev *mddev);
* count by 2 for every hour elapsed between read errors.
*/
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
+/* Default safemode delay: 200 msec */
+#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
* is 1000 KB/sec, so the extra system load does not show up that much.
@@ -125,76 +126,167 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}
-static int rdev_init_wb(struct md_rdev *rdev)
+static void rdev_uninit_serial(struct md_rdev *rdev)
{
- if (rdev->bdev->bd_queue->nr_hw_queues == 1)
+ if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
+ return;
+
+ kvfree(rdev->serial);
+ rdev->serial = NULL;
+}
+
+static void rdevs_uninit_serial(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ rdev_uninit_serial(rdev);
+}
+
+static int rdev_init_serial(struct md_rdev *rdev)
+{
+ /* serial_nums equals BARRIER_BUCKETS_NR */
+ int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
+ struct serial_in_rdev *serial = NULL;
+
+ if (test_bit(CollisionCheck, &rdev->flags))
return 0;
- spin_lock_init(&rdev->wb_list_lock);
- INIT_LIST_HEAD(&rdev->wb_list);
- init_waitqueue_head(&rdev->wb_io_wait);
- set_bit(WBCollisionCheck, &rdev->flags);
+ serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
+ GFP_KERNEL);
+ if (!serial)
+ return -ENOMEM;
- return 1;
+ for (i = 0; i < serial_nums; i++) {
+ struct serial_in_rdev *serial_tmp = &serial[i];
+
+ spin_lock_init(&serial_tmp->serial_lock);
+ serial_tmp->serial_rb = RB_ROOT_CACHED;
+ init_waitqueue_head(&serial_tmp->serial_io_wait);
+ }
+
+ rdev->serial = serial;
+ set_bit(CollisionCheck, &rdev->flags);
+
+ return 0;
+}
+
+static int rdevs_init_serial(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ int ret = 0;
+
+ rdev_for_each(rdev, mddev) {
+ ret = rdev_init_serial(rdev);
+ if (ret)
+ break;
+ }
+
+ /* Free all resources if the pool does not exist */
+ if (ret && !mddev->serial_info_pool)
+ rdevs_uninit_serial(mddev);
+
+ return ret;
}
/*
- * Create wb_info_pool if rdev is the first multi-queue device flaged
- * with writemostly, also write-behind mode is enabled.
+ * rdev needs serialization enabled if it meets both conditions:
+ * 1. it is a multi-queue device flagged with writemostly.
+ * 2. the write-behind mode is enabled.
*/
-void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+static int rdev_need_serial(struct md_rdev *rdev)
{
- if (mddev->bitmap_info.max_write_behind == 0)
+ return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
+ rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
+ test_bit(WriteMostly, &rdev->flags));
+}
+
+/*
+ * Init resource for rdev(s), then create serial_info_pool if:
+ * 1. rdev is the first device which returns true from rdev_need_serial.
+ * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
+ */
+void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend)
+{
+ int ret = 0;
+
+ if (rdev && !rdev_need_serial(rdev) &&
+ !test_bit(CollisionCheck, &rdev->flags))
return;
- if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
+ if (!is_suspend)
+ mddev_suspend(mddev);
+
+ if (!rdev)
+ ret = rdevs_init_serial(mddev);
+ else
+ ret = rdev_init_serial(rdev);
+ if (ret)
+ goto abort;
+
+ if (mddev->serial_info_pool == NULL) {
+ /*
+ * already in memalloc noio context by
+ * mddev_suspend()
+ */
+ mddev->serial_info_pool =
+ mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
+ sizeof(struct serial_info));
+ if (!mddev->serial_info_pool) {
+ rdevs_uninit_serial(mddev);
+ pr_err("can't alloc memory pool for serialization\n");
+ }
+ }
+
+abort:
+ if (!is_suspend)
+ mddev_resume(mddev);
+}
+
+/*
+ * Free resources from rdev(s), and destroy serial_info_pool under these conditions:
+ * 1. rdev is the last device flagged with CollisionCheck.
+ * 2. the bitmap is destroyed while the policy is not enabled.
+ * 3. when disabling the policy, the pool is destroyed only when no rdev needs it.
+ */
+void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend)
+{
+ if (rdev && !test_bit(CollisionCheck, &rdev->flags))
return;
- if (mddev->wb_info_pool == NULL) {
- unsigned int noio_flag;
+ if (mddev->serial_info_pool) {
+ struct md_rdev *temp;
+ int num = 0; /* used to track if other rdevs need the pool */
if (!is_suspend)
mddev_suspend(mddev);
- noio_flag = memalloc_noio_save();
- mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
- sizeof(struct wb_info));
- memalloc_noio_restore(noio_flag);
- if (!mddev->wb_info_pool)
- pr_err("can't alloc memory pool for writemostly\n");
+ rdev_for_each(temp, mddev) {
+ if (!rdev) {
+ if (!mddev->serialize_policy ||
+ !rdev_need_serial(temp))
+ rdev_uninit_serial(temp);
+ else
+ num++;
+ } else if (temp != rdev &&
+ test_bit(CollisionCheck, &temp->flags))
+ num++;
+ }
+
+ if (rdev)
+ rdev_uninit_serial(rdev);
+
+ if (num)
+ pr_info("The mempool could be used by other devices\n");
+ else {
+ mempool_destroy(mddev->serial_info_pool);
+ mddev->serial_info_pool = NULL;
+ }
if (!is_suspend)
mddev_resume(mddev);
}
}
-EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
-
-/*
- * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck.
- */
-static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
-{
- if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
- return;
-
- if (mddev->wb_info_pool) {
- struct md_rdev *temp;
- int num = 0;
-
- /*
- * Check if other rdevs need wb_info_pool.
- */
- rdev_for_each(temp, mddev)
- if (temp != rdev &&
- test_bit(WBCollisionCheck, &temp->flags))
- num++;
- if (!num) {
- mddev_suspend(rdev->mddev);
- mempool_destroy(mddev->wb_info_pool);
- mddev->wb_info_pool = NULL;
- mddev_resume(rdev->mddev);
- }
- }
-}
static struct ctl_table_header *raid_table_header;
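
The per-rdev bucket count above, 1 << (PAGE_SHIFT - ilog2(sizeof(atomic_t))), is sized to match raid1's BARRIER_BUCKETS_NR. A hedged userspace sketch of that sizing, assuming 4 KiB pages and a 4-byte atomic_t (typical values, not taken from this patch):

```c
#include <stdio.h>

/* Assumed values for illustration: 4 KiB pages, 4-byte atomic_t. */
#define PAGE_SHIFT	12
#define ATOMIC_T_SIZE	4U

static int ilog2_uint(unsigned int v)
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	int serial_nums = 1 << (PAGE_SHIFT - ilog2_uint(ATOMIC_T_SIZE));

	/* 1 << (12 - 2) == 1024 buckets per rdev, one interval tree each. */
	printf("serial_nums = %d\n", serial_nums);
	return 0;
}
```
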
@@ -236,8 +328,6 @@ static struct ctl_table raid_root_table[] = {
{ }
};
-static const struct block_device_operations md_fops;
-
static int start_readonly;
/*
@@ -369,11 +459,11 @@ void md_handle_request(struct mddev *mddev, struct bio *bio)
}
EXPORT_SYMBOL(md_handle_request);
-static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t md_submit_bio(struct bio *bio)
{
const int rw = bio_data_dir(bio);
const int sgrp = op_stat_group(bio_op(bio));
- struct mddev *mddev = q->queuedata;
+ struct mddev *mddev = bio->bi_disk->private_data;
unsigned int sectors;
if (mddev == NULL || mddev->pers == NULL) {
@@ -386,7 +476,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
return BLK_QC_T_NONE;
}
- blk_queue_split(q, &bio);
+ blk_queue_split(&bio);
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
@@ -435,11 +525,15 @@ void mddev_suspend(struct mddev *mddev)
wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
del_timer_sync(&mddev->safemode_timer);
+ /* restrict memory reclaim I/O while the raid array is suspended */
+ mddev->noio_flag = memalloc_noio_save();
}
EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev)
{
+ /* entered the memalloc scope from mddev_suspend() */
+ memalloc_noio_restore(mddev->noio_flag);
lockdep_assert_held(&mddev->reconfig_mutex);
if (--mddev->suspended)
return;
@@ -452,26 +546,6 @@ void mddev_resume(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_resume);
-int mddev_congested(struct mddev *mddev, int bits)
-{
- struct md_personality *pers = mddev->pers;
- int ret = 0;
-
- rcu_read_lock();
- if (mddev->suspended)
- ret = 1;
- else if (pers && pers->congested)
- ret = pers->congested(mddev, bits);
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(mddev_congested);
-static int md_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
- return mddev_congested(mddev, bits);
-}
-
/*
* Generic flush handling for md
*/
@@ -773,7 +847,13 @@ void mddev_unlock(struct mddev *mddev)
sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
if (mddev->sysfs_action)
sysfs_put(mddev->sysfs_action);
+ if (mddev->sysfs_completed)
+ sysfs_put(mddev->sysfs_completed);
+ if (mddev->sysfs_degraded)
+ sysfs_put(mddev->sysfs_degraded);
mddev->sysfs_action = NULL;
+ mddev->sysfs_completed = NULL;
+ mddev->sysfs_degraded = NULL;
}
}
mddev->sysfs_active = 0;
@@ -875,7 +955,8 @@ static void super_written(struct bio *bio)
struct mddev *mddev = rdev->mddev;
if (bio->bi_status) {
- pr_err("md: super_written gets error=%d\n", bio->bi_status);
+ pr_err("md: %s gets error=%d\n", __func__,
+ blk_status_to_errno(bio->bi_status));
md_error(mddev, rdev);
if (!test_bit(Faulty, &rdev->flags)
&& (bio->bi_opf & MD_FAILFAST)) {
@@ -2090,6 +2171,24 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->sb_csum = calc_sb_1_csum(sb);
}
+static sector_t super_1_choose_bm_space(sector_t dev_size)
+{
+ sector_t bm_space;
+
+ /* if the device is bigger than 8GiB, save 64KiB for bitmap
+ * usage; if bigger than 200GiB, save 128KiB
+ */
+ if (dev_size < 64*2)
+ bm_space = 0;
+ else if (dev_size - 64*2 >= 200*1024*1024*2)
+ bm_space = 128*2;
+ else if (dev_size - 4*2 > 8*1024*1024*2)
+ bm_space = 64*2;
+ else
+ bm_space = 4*2;
+ return bm_space;
+}
+
static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
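
super_1_choose_bm_space() works in 512-byte sectors, so 64*2 sectors is 64 KiB and 200*1024*1024*2 sectors is 200 GiB. A small userspace sketch of the same decision ladder, using illustrative device sizes and only the thresholds shown above:

```c
#include <stdio.h>

typedef unsigned long long sector_t;	/* 512-byte sectors, as in md */

/* Same decision ladder as super_1_choose_bm_space(), for illustration. */
static sector_t choose_bm_space(sector_t dev_size)
{
	if (dev_size < 64 * 2)
		return 0;
	if (dev_size - 64 * 2 >= 200ULL * 1024 * 1024 * 2)
		return 128 * 2;		/* 128 KiB for ~200 GiB and larger */
	if (dev_size - 4 * 2 > 8ULL * 1024 * 1024 * 2)
		return 64 * 2;		/* 64 KiB above ~8 GiB */
	return 4 * 2;			/* 4 KiB otherwise */
}

int main(void)
{
	/* Illustrative device sizes: 32 KiB, 1 GiB, 100 GiB, 1 TiB. */
	const sector_t sizes[] = { 64, 2ULL << 20, 200ULL << 20, 2ULL << 30 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%llu sectors -> reserve %llu sectors for bitmap\n",
		       sizes[i], choose_bm_space(sizes[i]));
	return 0;
}
```
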
@@ -2110,10 +2209,20 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
return 0;
} else {
/* minor version 0; superblock after data */
- sector_t sb_start;
- sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
+ sector_t sb_start, bm_space;
+ sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9;
+
+ /* 8K is for superblock */
+ sb_start = dev_size - 8*2;
sb_start &= ~(sector_t)(4*2 - 1);
- max_sectors = rdev->sectors + sb_start - rdev->sb_start;
+
+ bm_space = super_1_choose_bm_space(dev_size);
+
+ /* Space that can be used to store data must exclude the
+ * superblock, bitmap space and bad block space (4K)
+ */
+ max_sectors = sb_start - bm_space - 4*2;
+
if (!num_sectors || num_sectors > max_sectors)
num_sectors = max_sectors;
rdev->sb_start = sb_start;
@@ -2211,8 +2320,7 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
test_bit(Journal, &rdev2->flags) ||
rdev2->raid_disk == -1)
continue;
- if (rdev->bdev->bd_contains ==
- rdev2->bdev->bd_contains) {
+ if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
rcu_read_unlock();
return 1;
}
@@ -2362,15 +2470,19 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
pr_debug("md: bind<%s>\n", b);
if (mddev->raid_disks)
- mddev_create_wb_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev, false);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
- if (sysfs_create_link(&rdev->kobj, ko, "block"))
- /* failure here is OK */;
+ /* failure here is OK */
+ err = sysfs_create_link(&rdev->kobj, ko, "block");
rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
+ rdev->sysfs_unack_badblocks =
+ sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
+ rdev->sysfs_badblocks =
+ sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
list_add_rcu(&rdev->same_set, &mddev->disks);
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
@@ -2386,7 +2498,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return err;
}
-static void md_delayed_delete(struct work_struct *ws)
+static void rdev_delayed_delete(struct work_struct *ws)
{
struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
kobject_del(&rdev->kobj);
@@ -2400,20 +2512,24 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
- mddev_destroy_wb_pool(rdev->mddev, rdev);
+ mddev_destroy_serial_pool(rdev->mddev, rdev, false);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
+ sysfs_put(rdev->sysfs_unack_badblocks);
+ sysfs_put(rdev->sysfs_badblocks);
rdev->sysfs_state = NULL;
+ rdev->sysfs_unack_badblocks = NULL;
+ rdev->sysfs_badblocks = NULL;
rdev->badblocks.count = 0;
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state". We also need
* to delay it due to rcu usage.
*/
synchronize_rcu();
- INIT_WORK(&rdev->del_work, md_delayed_delete);
+ INIT_WORK(&rdev->del_work, rdev_delayed_delete);
kobject_get(&rdev->kobj);
- queue_work(md_misc_wq, &rdev->del_work);
+ queue_work(md_rdev_misc_wq, &rdev->del_work);
}
/*
@@ -2425,12 +2541,12 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
{
int err = 0;
struct block_device *bdev;
- char b[BDEVNAME_SIZE];
bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
shared ? (struct md_rdev *)lock_rdev : rdev);
if (IS_ERR(bdev)) {
- pr_warn("md: could not open %s.\n", __bdevname(dev, b));
+ pr_warn("md: could not open device unknown-block(%u,%u).\n",
+ MAJOR(dev), MINOR(dev));
return PTR_ERR(bdev);
}
rdev->bdev = bdev;
@@ -2749,7 +2865,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
goto repeat;
wake_up(&mddev->sb_wait);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
rdev_for_each(rdev, mddev) {
if (test_and_clear_bit(FaultRecorded, &rdev->flags))
@@ -2882,7 +2998,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* -write_error - clears WriteErrorSeen
* {,-}failfast - set/clear FailFast
*/
+
+ struct mddev *mddev = rdev->mddev;
int err = -EINVAL;
+ bool need_update_sb = false;
+
if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
md_error(rdev->mddev, rdev);
if (test_bit(Faulty, &rdev->flags))
@@ -2897,7 +3017,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->raid_disk >= 0)
err = -EBUSY;
else {
- struct mddev *mddev = rdev->mddev;
err = 0;
if (mddev_is_clustered(mddev))
err = md_cluster_ops->remove_disk(mddev, rdev);
@@ -2913,11 +3032,13 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
- mddev_create_wb_pool(rdev->mddev, rdev, false);
+ mddev_create_serial_pool(rdev->mddev, rdev, false);
+ need_update_sb = true;
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
- mddev_destroy_wb_pool(rdev->mddev, rdev);
+ mddev_destroy_serial_pool(rdev->mddev, rdev, false);
clear_bit(WriteMostly, &rdev->flags);
+ need_update_sb = true;
err = 0;
} else if (cmd_match(buf, "blocked")) {
set_bit(Blocked, &rdev->flags);
@@ -2943,9 +3064,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = 0;
} else if (cmd_match(buf, "failfast")) {
set_bit(FailFast, &rdev->flags);
+ need_update_sb = true;
err = 0;
} else if (cmd_match(buf, "-failfast")) {
clear_bit(FailFast, &rdev->flags);
+ need_update_sb = true;
err = 0;
} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags)) {
@@ -3024,6 +3147,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
clear_bit(ExternalBbl, &rdev->flags);
err = 0;
}
+ if (need_update_sb)
+ md_update_sb(mddev, 1);
if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state);
return err ? err : len;
@@ -3123,15 +3248,14 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
- err = rdev->mddev->pers->
- hot_add_disk(rdev->mddev, rdev);
+ err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
if (err) {
rdev->raid_disk = -1;
return err;
} else
sysfs_notify_dirent_safe(rdev->sysfs_state);
- if (sysfs_link_rdev(rdev->mddev, rdev))
- /* failure here is OK */;
+ /* failure here is OK */;
+ sysfs_link_rdev(rdev->mddev, rdev);
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks &&
@@ -3956,6 +4080,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
pr_warn("md: cannot register extra attributes for %s\n",
mdname(mddev));
mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+ mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
+ mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
}
if (oldpers->sync_request != NULL &&
pers->sync_request == NULL) {
@@ -4003,7 +4129,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev_resume(mddev);
if (!mddev->thread)
md_update_sb(mddev, 1);
- sysfs_notify(&mddev->kobj, NULL, "level");
+ sysfs_notify_dirent_safe(mddev->sysfs_level);
md_new_event(mddev);
rv = len;
out_unlock:
@@ -4116,6 +4242,14 @@ static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
static ssize_t
+uuid_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%pU\n", mddev->uuid);
+}
+static struct md_sysfs_entry md_uuid =
+__ATTR(uuid, S_IRUGO, uuid_show, NULL);
+
+static ssize_t
chunk_size_show(struct mddev *mddev, char *page)
{
if (mddev->reshape_position != MaxSector &&
@@ -4300,7 +4434,6 @@ array_state_show(struct mddev *mddev, char *page)
static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
-static int do_md_run(struct mddev *mddev);
static int restart_array(struct mddev *mddev);
static ssize_t
@@ -4446,6 +4579,20 @@ null_show(struct mddev *mddev, char *page)
return -EINVAL;
}
+/* need to ensure rdev_delayed_delete() has completed */
+static void flush_rdev_wq(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (work_pending(&rdev->del_work)) {
+ flush_workqueue(md_rdev_misc_wq);
+ break;
+ }
+ rcu_read_unlock();
+}
+
static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
@@ -4473,8 +4620,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
- flush_workqueue(md_misc_wq);
-
+ flush_rdev_wq(mddev);
err = mddev_lock(mddev);
if (err)
return err;
@@ -4712,7 +4858,8 @@ action_store(struct mddev *mddev, const char *page, size_t len)
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
mddev_lock(mddev) == 0) {
- flush_workqueue(md_misc_wq);
+ if (work_pending(&mddev->del_work))
+ flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
@@ -4742,7 +4889,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
}
if (err)
return err;
- sysfs_notify(&mddev->kobj, NULL, "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
} else {
if (cmd_match(page, "check"))
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -5217,7 +5364,7 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
mddev->array_sectors = sectors;
if (mddev->pers) {
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
}
}
mddev_unlock(mddev);
@@ -5302,10 +5449,62 @@ static struct md_sysfs_entry md_fail_last_dev =
__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
fail_last_dev_store);
+static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
+{
+ if (mddev->pers == NULL || (mddev->pers->level != 1))
+ return sprintf(page, "n/a\n");
+ else
+ return sprintf(page, "%d\n", mddev->serialize_policy);
+}
+
+/*
+ * Setting serialize_policy to true enforces that write IO is not
+ * reordered for raid1.
+ */
+static ssize_t
+serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int err;
+ bool value;
+
+ err = kstrtobool(buf, &value);
+ if (err)
+ return err;
+
+ if (value == mddev->serialize_policy)
+ return len;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ if (mddev->pers == NULL || (mddev->pers->level != 1)) {
+ pr_err("md: serialize_policy is only effective for raid1\n");
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ mddev_suspend(mddev);
+ if (value)
+ mddev_create_serial_pool(mddev, NULL, true);
+ else
+ mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev->serialize_policy = value;
+ mddev_resume(mddev);
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_serialize_policy =
+__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
+ serialize_policy_store);
+
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
&md_raid_disks.attr,
+ &md_uuid.attr,
&md_chunk_size.attr,
&md_size.attr,
&md_resync_start.attr,
@@ -5319,6 +5518,7 @@ static struct attribute *md_default_attrs[] = {
&max_corr_read_errors.attr,
&md_consistency_policy.attr,
&md_fail_last_dev.attr,
+ &md_serialize_policy.attr,
NULL,
};
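
The new serialize_policy attribute is parsed with kstrtobool() and only takes effect for raid1 arrays. A hedged usage sketch from user space; the md0 device name is hypothetical, assuming only that the attribute sits in the array's md/ sysfs directory next to fail_last_dev:

```c
#include <stdio.h>

int main(void)
{
	/* Hypothetical array name; attribute path assumed from the md/
	 * sysfs directory layout used by the entries above. */
	const char *path = "/sys/block/md0/md/serialize_policy";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);	/* "0"/"1" or "n"/"y" are accepted by kstrtobool() */
	fclose(f);
	return 0;
}
```
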
@@ -5396,6 +5596,8 @@ static void md_free(struct kobject *ko)
if (mddev->sysfs_state)
sysfs_put(mddev->sysfs_state);
+ if (mddev->sysfs_level)
+ sysfs_put(mddev->sysfs_level);
if (mddev->gendisk)
del_gendisk(mddev->gendisk);
@@ -5503,12 +5705,10 @@ static int md_alloc(dev_t dev, char *name)
mddev->hold_active = UNTIL_STOP;
error = -ENOMEM;
- mddev->queue = blk_alloc_queue(GFP_KERNEL);
+ mddev->queue = blk_alloc_queue(NUMA_NO_NODE);
if (!mddev->queue)
goto abort;
- mddev->queue->queuedata = mddev;
- blk_queue_make_request(mddev->queue, md_make_request);
blk_set_stacking_limits(&mddev->queue->limits);
disk = alloc_disk(1 << shift);
@@ -5534,11 +5734,8 @@ static int md_alloc(dev_t dev, char *name)
* remove it now.
*/
disk->flags |= GENHD_FL_EXT_DEVT;
+ disk->events |= DISK_EVENT_MEDIA_CHANGE;
mddev->gendisk = disk;
- /* As soon as we call add_disk(), another thread could get
- * through to md_open, so make sure it doesn't get too far
- */
- mutex_lock(&mddev->open_mutex);
add_disk(disk);
error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
@@ -5553,12 +5750,12 @@ static int md_alloc(dev_t dev, char *name)
if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &md_bitmap_group))
pr_debug("pointless warning\n");
- mutex_unlock(&mddev->open_mutex);
abort:
mutex_unlock(&disks_mutex);
if (!error && mddev->kobj.sd) {
kobject_uevent(&mddev->kobj, KOBJ_ADD);
mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
+ mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
}
mddev_put(mddev);
return error;
@@ -5742,8 +5939,8 @@ int md_run(struct mddev *mddev)
rdev_for_each(rdev, mddev)
rdev_for_each(rdev2, mddev) {
if (rdev < rdev2 &&
- rdev->bdev->bd_contains ==
- rdev2->bdev->bd_contains) {
+ rdev->bdev->bd_disk ==
+ rdev2->bdev->bd_disk) {
pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
mdname(mddev),
bdevname(rdev->bdev,b),
@@ -5794,18 +5991,18 @@ int md_run(struct mddev *mddev)
goto bitmap_abort;
if (mddev->bitmap_info.max_write_behind > 0) {
- bool creat_pool = false;
+ bool create_pool = false;
rdev_for_each(rdev, mddev) {
if (test_bit(WriteMostly, &rdev->flags) &&
- rdev_init_wb(rdev))
- creat_pool = true;
+ rdev_init_serial(rdev))
+ create_pool = true;
}
- if (creat_pool && mddev->wb_info_pool == NULL) {
- mddev->wb_info_pool =
- mempool_create_kmalloc_pool(NR_WB_INFOS,
- sizeof(struct wb_info));
- if (!mddev->wb_info_pool) {
+ if (create_pool && mddev->serial_info_pool == NULL) {
+ mddev->serial_info_pool =
+ mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
+ sizeof(struct serial_info));
+ if (!mddev->serial_info_pool) {
err = -ENOMEM;
goto bitmap_abort;
}
@@ -5828,8 +6025,6 @@ int md_run(struct mddev *mddev)
blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
else
blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
- mddev->queue->backing_dev_info->congested_data = mddev;
- mddev->queue->backing_dev_info->congested_fn = md_congested;
}
if (pers->sync_request) {
if (mddev->kobj.sd &&
@@ -5837,6 +6032,8 @@ int md_run(struct mddev *mddev)
pr_warn("md: cannot register extra attributes for %s\n",
mdname(mddev));
mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
+ mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
+ mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
} else if (mddev->ro == 2) /* auto-readonly not meaningful */
mddev->ro = 0;
@@ -5846,7 +6043,7 @@ int md_run(struct mddev *mddev)
if (mddev_is_clustered(mddev))
mddev->safemode_delay = 0;
else
- mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
+ mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
mddev->in_sync = 1;
smp_wmb();
spin_lock(&mddev->lock);
@@ -5883,7 +6080,7 @@ int md_run(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_run);
-static int do_md_run(struct mddev *mddev)
+int do_md_run(struct mddev *mddev)
{
int err;
@@ -5907,13 +6104,13 @@ static int do_md_run(struct mddev *mddev)
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
sysfs_notify_dirent_safe(mddev->sysfs_state);
sysfs_notify_dirent_safe(mddev->sysfs_action);
- sysfs_notify(&mddev->kobj, NULL, "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
out:
clear_bit(MD_NOT_READY, &mddev->flags);
return err;
@@ -6028,7 +6225,8 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- flush_workqueue(md_misc_wq);
+ if (work_pending(&mddev->del_work))
+ flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
@@ -6050,8 +6248,9 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
- mempool_destroy(mddev->wb_info_pool);
- mddev->wb_info_pool = NULL;
+ /* disable the policy so that rdevs free their serialization resources */
+ mddev->serialize_policy = 0;
+ mddev_destroy_serial_pool(mddev, NULL, true);
}
void md_stop_writes(struct mddev *mddev)
@@ -6080,7 +6279,8 @@ static void __md_stop(struct mddev *mddev)
md_bitmap_destroy(mddev);
mddev_detach(mddev);
/* Ensure ->event_work is done */
- flush_workqueue(md_misc_wq);
+ if (mddev->event_work.func)
+ flush_workqueue(md_misc_wq);
spin_lock(&mddev->lock);
mddev->pers = NULL;
spin_unlock(&mddev->lock);
@@ -6211,7 +6411,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
__md_stop_writes(mddev);
__md_stop(mddev);
- mddev->queue->backing_dev_info->congested_fn = NULL;
/* tell userspace to handle 'inactive' */
sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -6223,7 +6422,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
set_capacity(disk, 0);
mutex_unlock(&mddev->open_mutex);
mddev->changed = 1;
- revalidate_disk(disk);
+ revalidate_disk_size(disk, true);
if (mddev->ro)
mddev->ro = 0;
@@ -6514,7 +6713,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
return 0;
}
-static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
+int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
{
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
struct md_rdev *rdev;
@@ -6560,7 +6759,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
}
/*
- * add_new_disk can be used once the array is assembled
+ * md_add_new_disk can be used once the array is assembled
* to add "hot spares". They must already have a superblock
* written
*/
@@ -6673,7 +6872,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
return err;
}
- /* otherwise, add_new_disk is only allowed
+ /* otherwise, md_add_new_disk is only allowed
* for major_version==0 superblocks
*/
if (mddev->major_version != 0) {
@@ -6920,7 +7119,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
}
/*
- * set_array_info is used two different ways
+ * md_set_array_info is used two different ways
* The original usage is when creating a new array.
* In this usage, raid_disks is > 0 and it together with
* level, size, not_persistent,layout,chunksize determine the
@@ -6932,9 +7131,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
* The minor and patch _version numbers are also kept incase the
* super_block handler wishes to interpret them.
*/
-static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
+int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
{
-
if (info->raid_disks == 0) {
/* just setting version number for superblock loading */
if (info->major_version < 0 ||
@@ -7056,7 +7254,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
md_cluster_ops->update_size(mddev, old_dev_sectors);
else if (mddev->queue) {
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
}
}
return rv;
@@ -7223,6 +7421,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.nodes = 0;
md_cluster_ops->leave(mddev);
+ module_put(md_cluster_mod);
+ mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
mddev_suspend(mddev);
md_bitmap_destroy(mddev);
@@ -7283,7 +7483,6 @@ static inline bool md_ioctl_valid(unsigned int cmd)
case GET_DISK_INFO:
case HOT_ADD_DISK:
case HOT_REMOVE_DISK:
- case RAID_AUTORUN:
case RAID_VERSION:
case RESTART_ARRAY_RW:
case RUN_ARRAY:
@@ -7329,13 +7528,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
case RAID_VERSION:
err = get_version(argp);
goto out;
-
-#ifndef MODULE
- case RAID_AUTORUN:
- err = 0;
- autostart_arrays(arg);
- goto out;
-#endif
default:;
}
@@ -7376,9 +7568,8 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
}
- if (cmd == ADD_NEW_DISK)
- /* need to ensure md_delayed_delete() has completed */
- flush_workqueue(md_misc_wq);
+ if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
+ flush_rdev_wq(mddev);
if (cmd == HOT_REMOVE_DISK)
/* need to ensure recovery thread has run */
@@ -7438,7 +7629,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
err = -EBUSY;
goto unlock;
}
- err = set_array_info(mddev, &info);
+ err = md_set_array_info(mddev, &info);
if (err) {
pr_warn("md: couldn't set array info. %d\n", err);
goto unlock;
@@ -7492,7 +7683,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
/* Need to clear read-only for this */
break;
else
- err = add_new_disk(mddev, &info);
+ err = md_add_new_disk(mddev, &info);
goto unlock;
}
break;
@@ -7560,7 +7751,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
if (copy_from_user(&info, argp, sizeof(info)))
err = -EFAULT;
else
- err = add_new_disk(mddev, &info);
+ err = md_add_new_disk(mddev, &info);
goto unlock;
}
@@ -7655,7 +7846,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
atomic_inc(&mddev->openers);
mutex_unlock(&mddev->open_mutex);
- check_disk_change(bdev);
+ bdev_check_media_change(bdev);
out:
if (err)
mddev_put(mddev);
@@ -7671,23 +7862,21 @@ static void md_release(struct gendisk *disk, fmode_t mode)
mddev_put(mddev);
}
-static int md_media_changed(struct gendisk *disk)
+static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
{
struct mddev *mddev = disk->private_data;
+ unsigned int ret = 0;
- return mddev->changed;
-}
-
-static int md_revalidate(struct gendisk *disk)
-{
- struct mddev *mddev = disk->private_data;
-
+ if (mddev->changed)
+ ret = DISK_EVENT_MEDIA_CHANGE;
mddev->changed = 0;
- return 0;
+ return ret;
}
-static const struct block_device_operations md_fops =
+
+const struct block_device_operations md_fops =
{
.owner = THIS_MODULE,
+ .submit_bio = md_submit_bio,
.open = md_open,
.release = md_release,
.ioctl = md_ioctl,
@@ -7695,8 +7884,7 @@ static const struct block_device_operations md_fops =
.compat_ioctl = md_compat_ioctl,
#endif
.getgeo = md_getgeo,
- .media_changed = md_media_changed,
- .revalidate_disk= md_revalidate,
+ .check_events = md_check_events,
};
static int md_thread(void *arg)
@@ -8168,13 +8356,12 @@ static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
return mask;
}
-static const struct file_operations md_seq_fops = {
- .owner = THIS_MODULE,
- .open = md_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
- .poll = mdstat_poll,
+static const struct proc_ops mdstat_proc_ops = {
+ .proc_open = md_seq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+ .proc_poll = mdstat_poll,
};
int register_md_personality(struct md_personality *p)
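
mdstat_proc_ops follows the generic file_operations-to-proc_ops conversion for /proc entries. A self-contained sketch of that pattern for an unrelated, hypothetical entry (all "demo" names are illustrative and not part of this patch):

```c
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* Only the proc_ops registration pattern mirrors the mdstat conversion. */
static int demo_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "hello from proc_ops\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct proc_ops demo_proc_ops = {
	.proc_open	= demo_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};

static int __init demo_init(void)
{
	return proc_create("demo_proc_ops", 0444, NULL, &demo_proc_ops) ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("demo_proc_ops", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
```
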
@@ -8225,6 +8412,7 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
int md_setup_cluster(struct mddev *mddev, int nodes)
{
+ int ret;
if (!md_cluster_ops)
request_module("md-cluster");
spin_lock(&pers_lock);
@@ -8236,7 +8424,10 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
}
spin_unlock(&pers_lock);
- return md_cluster_ops->join(mddev, nodes);
+ ret = md_cluster_ops->join(mddev, nodes);
+ if (!ret)
+ mddev->safemode_delay = 0;
+ return ret;
}
void md_cluster_stop(struct mddev *mddev)
@@ -8256,7 +8447,7 @@ static int is_mddev_idle(struct mddev *mddev, int init)
idle = 1;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
- struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+ struct gendisk *disk = rdev->bdev->bd_disk;
curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
atomic_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats
@@ -8637,7 +8828,7 @@ void md_do_sync(struct md_thread *thread)
} else
mddev->curr_resync = 3; /* no longer delayed */
mddev->curr_resync_completed = j;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
md_new_event(mddev);
update_time = jiffies;
@@ -8665,7 +8856,7 @@ void md_do_sync(struct md_thread *thread)
mddev->recovery_cp = j;
update_time = jiffies;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
while (j >= mddev->resync_max &&
@@ -8772,7 +8963,7 @@ void md_do_sync(struct md_thread *thread)
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->curr_resync > 3) {
mddev->curr_resync_completed = mddev->curr_resync;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
@@ -8829,7 +9020,7 @@ void md_do_sync(struct md_thread *thread)
mddev_unlock(mddev);
if (!mddev_is_clustered(mddev)) {
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
}
}
@@ -8902,7 +9093,7 @@ static int remove_and_add_spares(struct mddev *mddev,
}
if (removed && mddev->kobj.sd)
- sysfs_notify(&mddev->kobj, NULL, "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
if (this && removed)
goto no_add;
@@ -8929,10 +9120,9 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev->recovery_offset = 0;
}
- if (mddev->pers->
- hot_add_disk(mddev, rdev) == 0) {
- if (sysfs_link_rdev(mddev, rdev))
- /* failure here is OK */;
+ if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
+ /* failure here is OK */
+ sysfs_link_rdev(mddev, rdev);
if (!test_bit(Journal, &rdev->flags))
spares++;
md_new_event(mddev);
@@ -9186,8 +9376,7 @@ void md_reap_sync_thread(struct mddev *mddev)
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
- sysfs_notify(&mddev->kobj, NULL,
- "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
}
}
@@ -9277,8 +9466,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
if (rv == 0) {
/* Make sure they get written out promptly */
if (test_bit(ExternalBbl, &rdev->flags))
- sysfs_notify(&rdev->kobj, NULL,
- "unacknowledged_bad_blocks");
+ sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
@@ -9299,7 +9487,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->data_offset;
rv = badblocks_clear(&rdev->badblocks, s, sectors);
if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
- sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
+ sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
@@ -9343,7 +9531,7 @@ static void md_geninit(void)
{
pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
- proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
+ proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}
static int __init md_init(void)
@@ -9358,6 +9546,10 @@ static int __init md_init(void)
if (!md_misc_wq)
goto err_misc_wq;
+ md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
+ if (!md_rdev_misc_wq)
+ goto err_rdev_misc_wq;
+
if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
goto err_md;
@@ -9379,6 +9571,8 @@ static int __init md_init(void)
err_mdp:
unregister_blkdev(MD_MAJOR, "md");
err_md:
+ destroy_workqueue(md_rdev_misc_wq);
+err_rdev_misc_wq:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
@@ -9526,7 +9720,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->recovery_offset == MaxSector &&
!test_bit(In_sync, &rdev->flags) &&
mddev->pers->spare_active(mddev))
- sysfs_notify(&mddev->kobj, NULL, "degraded");
+ sysfs_notify_dirent_safe(mddev->sysfs_degraded);
put_page(swapout);
return 0;
@@ -9589,7 +9783,7 @@ void md_autodetect_dev(dev_t dev)
}
}
-static void autostart_arrays(int part)
+void md_autostart_arrays(int part)
{
struct md_rdev *rdev;
struct detected_devices_node *node_detected_dev;
@@ -9668,6 +9862,7 @@ static __exit void md_exit(void)
* destroy_workqueue() below will wait for that to complete.
*/
}
+ destroy_workqueue(md_rdev_misc_wq);
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
}
@@ -9677,7 +9872,7 @@ module_exit(md_exit)
static int get_ro(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%d", start_readonly);
+ return sprintf(buffer, "%d\n", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 5f86f8a..c94811c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -32,6 +32,16 @@
* be retried.
*/
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
+
+/*
+ * The struct embedded in rdev is used to serialize IO.
+ */
+struct serial_in_rdev {
+ struct rb_root_cached serial_rb;
+ spinlock_t serial_lock;
+ wait_queue_head_t serial_io_wait;
+};
+
/*
* MD's 'extended' device
*/
@@ -110,18 +120,16 @@ struct md_rdev {
* in superblock.
*/
- /*
- * The members for check collision of write behind IOs.
- */
- struct list_head wb_list;
- spinlock_t wb_list_lock;
- wait_queue_head_t wb_io_wait;
+ struct serial_in_rdev *serial; /* used for raid1 io serialization */
struct work_struct del_work; /* used for delayed sysfs removal */
struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */
-
+ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */
+ struct kernfs_node *sysfs_unack_badblocks;
+ /* handle for 'bad_blocks' sysfs dentry */
+ struct kernfs_node *sysfs_badblocks;
struct badblocks badblocks;
struct {
@@ -201,9 +209,9 @@ enum flag_bits {
* it didn't fail, so don't use FailFast
* any more for metadata
*/
- WBCollisionCheck, /*
- * multiqueue device should check if there
- * is collision between write behind bios.
+ CollisionCheck, /*
+ * check if there is collision between raid1
+ * serial bios.
*/
};
@@ -263,12 +271,13 @@ enum mddev_sb_flags {
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};
-#define NR_WB_INFOS 8
-/* record current range of write behind IOs */
-struct wb_info {
- sector_t lo;
- sector_t hi;
- struct list_head list;
+#define NR_SERIAL_INFOS 8
+/* record current range of serialized IOs */
+struct serial_info {
+ struct rb_node node;
+ sector_t start; /* start sector of rb node */
+ sector_t last; /* end sector of rb node */
+ sector_t _subtree_last; /* highest sector in subtree of rb node */
};
struct mddev {
@@ -388,7 +397,7 @@ struct mddev {
* These locks are separate due to conflicting interactions
* with bdev->bd_mutex.
* Lock ordering is:
- * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+ * reconfig_mutex -> bd_mutex
* bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
*/
struct mutex open_mutex;
@@ -414,6 +423,9 @@ struct mddev {
* file in sysfs.
*/
struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
+ struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */
+ struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
+ struct kernfs_node *sysfs_level; /*handle for 'level' */
struct work_struct del_work; /* used for delayed sysfs removal */
@@ -487,13 +499,15 @@ struct mddev {
*/
struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
- mempool_t *wb_info_pool;
+ mempool_t *serial_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */
+ unsigned int noio_flag; /* for memalloc scope API */
bool has_superblocks:1;
bool fail_last_dev:1;
+ bool serialize_policy:1;
};
enum recovery_flags {
@@ -536,7 +550,7 @@ extern void mddev_unlock(struct mddev *mddev);
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
- atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
+ atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
}
static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
@@ -589,9 +603,6 @@ struct md_personality
* array.
*/
void *(*takeover) (struct mddev *mddev);
- /* congested implements bdi.congested_fn().
- * Will not be called while array is 'suspended' */
- int (*congested)(struct mddev *mddev, int bits);
/* Changes the consistency policy of an active array. */
int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
};
@@ -702,7 +713,6 @@ extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
-extern int mddev_congested(struct mddev *mddev, int bits);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
sector_t sector, int size, struct page *page);
@@ -737,8 +747,10 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
-extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
+extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend);
+extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
@@ -790,4 +802,16 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
!bio->bi_disk->queue->limits.max_write_zeroes_sectors)
mddev->queue->limits.max_write_zeroes_sectors = 0;
}
+
+struct mdu_array_info_s;
+struct mdu_disk_info_s;
+
+extern int mdp_major;
+void md_autostart_arrays(int part);
+int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
+int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
+int do_md_run(struct mddev *mddev);
+
+extern const struct block_device_operations md_fops;
+
#endif /* _MD_MD_H */
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index baaec1a..f4f948b 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -4,7 +4,7 @@
depends on BLK_DEV_DM
select LIBCRC32C
select DM_BUFIO
- ---help---
+ help
Library providing immutable on-disk data structure support for
device-mapper targets such as the thin provisioning target.
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index 5673f8e..21d1a17 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -38,7 +38,7 @@ struct node_header {
struct btree_node {
struct node_header header;
- __le64 keys[0];
+ __le64 keys[];
} __attribute__((packed, aligned(8)));
@@ -68,7 +68,7 @@ struct ro_spine {
};
void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
-int exit_ro_spine(struct ro_spine *s);
+void exit_ro_spine(struct ro_spine *s);
int ro_step(struct ro_spine *s, dm_block_t new_child);
void ro_pop(struct ro_spine *s);
struct btree_node *ro_node(struct ro_spine *s);
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 9e4d121..63f2bae 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -423,9 +423,9 @@ static int rebalance_children(struct shadow_spine *s,
memcpy(n, dm_block_data(child),
dm_bm_block_size(dm_tm_get_bm(info->tm)));
- dm_tm_unlock(info->tm, child);
dm_tm_dec(info->tm, dm_block_location(child));
+ dm_tm_unlock(info->tm, child);
return 0;
}
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index b27b809..e03cb9e 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -132,15 +132,13 @@ void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info)
s->nodes[1] = NULL;
}
-int exit_ro_spine(struct ro_spine *s)
+void exit_ro_spine(struct ro_spine *s)
{
- int r = 0, i;
+ int i;
for (i = 0; i < s->count; i++) {
unlock_block(s->info, s->nodes[i]);
}
-
- return r;
}
int ro_step(struct ro_spine *s, dm_block_t new_child)
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 8aae062..ee3e63a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -83,14 +83,16 @@ void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
}
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
- uint64_t key, void *value)
- __dm_written_to_disk(value)
+ uint64_t key, void *value)
+ __dm_written_to_disk(value)
{
uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
+ uint32_t max_entries = le32_to_cpu(node->header.max_entries);
__le64 key_le = cpu_to_le64(key);
if (index > nr_entries ||
- index >= le32_to_cpu(node->header.max_entries)) {
+ index >= max_entries ||
+ nr_entries >= max_entries) {
DMERR("too many entries in btree node for insert");
__dm_unbless_for_disk(value);
return -ENOMEM;
@@ -366,7 +368,8 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
} while (!(flags & LEAF_NODE));
*result_key = le64_to_cpu(ro_node(s)->keys[i]);
- memcpy(v, value_ptr(ro_node(s), i), value_size);
+ if (v)
+ memcpy(v, value_ptr(ro_node(s), i), value_size);
return 0;
}
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index a213bf1..85853ab 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -281,6 +281,11 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
struct disk_index_entry ie_disk;
struct dm_block *blk;
+ if (b >= ll->nr_blocks) {
+ DMERR_LIMIT("metadata block out of bounds");
+ return -EINVAL;
+ }
+
b = do_div(index, ll->entries_per_block);
r = ll->load_ie(ll, index, &ie_disk);
if (r < 0)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 322386f..35843df 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -29,21 +29,6 @@ module_param(default_layout, int, 0644);
(1L << MD_HAS_PPL) | \
(1L << MD_HAS_MULTIPLE_PPLS))
-static int raid0_congested(struct mddev *mddev, int bits)
-{
- struct r0conf *conf = mddev->private;
- struct md_rdev **devlist = conf->devlist;
- int raid_disks = conf->strip_zone[0].nb_dev;
- int i, ret = 0;
-
- for (i = 0; i < raid_disks && !ret ; i++) {
- struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
-
- ret |= bdi_congested(q->backing_dev_info, bits);
- }
- return ret;
-}
-
/*
* inform the user of the raid configuration
*/
@@ -425,22 +410,6 @@ static int raid0_run(struct mddev *mddev)
mdname(mddev),
(unsigned long long)mddev->array_sectors);
- if (mddev->queue) {
- /* calculate the max read-ahead size.
- * For read-ahead of large files to be effective, we need to
- * readahead at least twice a whole stripe. i.e. number of devices
- * multiplied by chunk size times 2.
- * If an individual device has an ra_pages greater than the
- * chunk size, then we will not drive that device as hard as it
- * wants. We consider this a configuration error: a larger
- * chunksize should be used in that case.
- */
- int stripe = mddev->raid_disks *
- (mddev->chunk_sectors << 9) / PAGE_SIZE;
- if (mddev->queue->backing_dev_info->ra_pages < 2* stripe)
- mddev->queue->backing_dev_info->ra_pages = 2* stripe;
- }
-
dump_zones(mddev);
ret = md_integrity_register(mddev);
@@ -457,23 +426,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
kfree(conf);
}
-/*
- * Is io distribute over 1 or more chunks ?
-*/
-static inline int is_io_in_chunk_boundary(struct mddev *mddev,
- unsigned int chunk_sects, struct bio *bio)
-{
- if (likely(is_power_of_2(chunk_sects))) {
- return chunk_sects >=
- ((bio->bi_iter.bi_sector & (chunk_sects-1))
- + bio_sectors(bio));
- } else{
- sector_t sector = bio->bi_iter.bi_sector;
- return chunk_sects >= (sector_div(sector, chunk_sects)
- + bio_sectors(bio));
- }
-}
-
static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
{
struct r0conf *conf = mddev->private;
@@ -495,7 +447,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
&mddev->bio_set);
bio_chain(split, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = split;
end = zone->zone_end;
} else
@@ -559,7 +511,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
trace_block_bio_remap(bdev_get_queue(rdev->bdev),
discard_bio, disk_devt(mddev->gendisk),
bio->bi_iter.bi_sector);
- generic_make_request(discard_bio);
+ submit_bio_noacct(discard_bio);
}
bio_endio(bio);
}
@@ -600,7 +552,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
struct bio *split = bio_split(bio, sectors, GFP_NOIO,
&mddev->bio_set);
bio_chain(split, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = split;
}
@@ -633,7 +585,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
disk_devt(mddev->gendisk), bio_sector);
mddev_check_writesame(mddev, bio);
mddev_check_write_zeroes(mddev, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
return true;
}
@@ -818,7 +770,6 @@ static struct md_personality raid0_personality=
.size = raid0_size,
.takeover = raid0_takeover,
.quiesce = raid0_quiesce,
- .congested = raid0_congested,
};
static int __init raid0_init (void)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e871846..fb31e5d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
+#include <linux/interval_tree_generic.h>
#include <trace/events/block.h>
@@ -50,55 +51,71 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#include "raid1-10.c"
-static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
+ START, LAST, static inline, raid1_rb);
+
+static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
+ struct serial_info *si, int idx)
{
- struct wb_info *wi, *temp_wi;
unsigned long flags;
int ret = 0;
- struct mddev *mddev = rdev->mddev;
+ sector_t lo = r1_bio->sector;
+ sector_t hi = lo + r1_bio->sectors;
+ struct serial_in_rdev *serial = &rdev->serial[idx];
- wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);
-
- spin_lock_irqsave(&rdev->wb_list_lock, flags);
- list_for_each_entry(temp_wi, &rdev->wb_list, list) {
- /* collision happened */
- if (hi > temp_wi->lo && lo < temp_wi->hi) {
- ret = -EBUSY;
- break;
- }
+ spin_lock_irqsave(&serial->serial_lock, flags);
+ /* collision happened */
+ if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
+ ret = -EBUSY;
+ else {
+ si->start = lo;
+ si->last = hi;
+ raid1_rb_insert(si, &serial->serial_rb);
}
-
- if (!ret) {
- wi->lo = lo;
- wi->hi = hi;
- list_add(&wi->list, &rdev->wb_list);
- } else
- mempool_free(wi, mddev->wb_info_pool);
- spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
+ spin_unlock_irqrestore(&serial->serial_lock, flags);
return ret;
}
-static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
{
- struct wb_info *wi;
+ struct mddev *mddev = rdev->mddev;
+ struct serial_info *si;
+ int idx = sector_to_idx(r1_bio->sector);
+ struct serial_in_rdev *serial = &rdev->serial[idx];
+
+ if (WARN_ON(!mddev->serial_info_pool))
+ return;
+ si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
+ wait_event(serial->serial_io_wait,
+ check_and_add_serial(rdev, r1_bio, si, idx) == 0);
+}
+
+static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+ struct serial_info *si;
unsigned long flags;
int found = 0;
struct mddev *mddev = rdev->mddev;
+ int idx = sector_to_idx(lo);
+ struct serial_in_rdev *serial = &rdev->serial[idx];
- spin_lock_irqsave(&rdev->wb_list_lock, flags);
- list_for_each_entry(wi, &rdev->wb_list, list)
- if (hi == wi->hi && lo == wi->lo) {
- list_del(&wi->list);
- mempool_free(wi, mddev->wb_info_pool);
+ spin_lock_irqsave(&serial->serial_lock, flags);
+ for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
+ si; si = raid1_rb_iter_next(si, lo, hi)) {
+ if (si->start == lo && si->last == hi) {
+ raid1_rb_remove(si, &serial->serial_rb);
+ mempool_free(si, mddev->serial_info_pool);
found = 1;
break;
}
-
+ }
if (!found)
- WARN(1, "The write behind IO is not recorded\n");
- spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
- wake_up(&rdev->wb_io_wait);
+ WARN(1, "The write IO is not recorded for serialization\n");
+ spin_unlock_irqrestore(&serial->serial_lock, flags);
+ wake_up(&serial->serial_io_wait);
}
/*
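
check_and_add_serial() records each in-flight range [lo, hi) in a per-bucket interval tree and returns -EBUSY on overlap, which wait_for_serialization() turns into a wait. A simplified userspace sketch of the overlap test the tree answers, using a flat array instead of the cached rbtree; the sample ranges are made up for illustration:

```c
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long sector_t;

/* Flat-array stand-in for the per-bucket interval tree. */
struct range {
	sector_t start;
	sector_t last;
};

static bool overlaps(const struct range *r, sector_t lo, sector_t hi)
{
	return r->start < hi && r->last > lo;
}

int main(void)
{
	const struct range in_flight[] = { { 0, 8 }, { 64, 128 } };
	sector_t lo = 120, hi = 136;	/* candidate write range [lo, hi) */

	for (unsigned int i = 0; i < sizeof(in_flight) / sizeof(in_flight[0]); i++) {
		if (overlaps(&in_flight[i], lo, hi)) {
			printf("collision with [%llu, %llu) -> -EBUSY, caller waits\n",
			       in_flight[i].start, in_flight[i].last);
			return 0;
		}
	}
	printf("no collision -> record [%llu, %llu) and submit\n", lo, hi);
	return 0;
}
```
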
@@ -279,22 +296,17 @@ static void reschedule_retry(struct r1bio *r1_bio)
static void call_bio_endio(struct r1bio *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
- struct r1conf *conf = r1_bio->mddev->private;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf, r1_bio->sector);
}
static void raid_end_bio_io(struct r1bio *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
+ struct r1conf *conf = r1_bio->mddev->private;
/* if nobody has done the final endio yet, do it now */
if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -305,6 +317,12 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
call_bio_endio(r1_bio);
}
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle. All I/Os, even write-behind writes, are done.
+ */
+ allow_barrier(conf, r1_bio->sector);
+
free_r1bio(r1_bio);
}
@@ -430,6 +448,8 @@ static void raid1_end_write_request(struct bio *bio)
int mirror = find_bio_disk(r1_bio, bio);
struct md_rdev *rdev = conf->mirrors[mirror].rdev;
bool discard_error;
+ sector_t lo = r1_bio->sector;
+ sector_t hi = r1_bio->sector + r1_bio->sectors;
discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
@@ -499,12 +519,8 @@ static void raid1_end_write_request(struct bio *bio)
}
if (behind) {
- if (test_bit(WBCollisionCheck, &rdev->flags)) {
- sector_t lo = r1_bio->sector;
- sector_t hi = r1_bio->sector + r1_bio->sectors;
-
- remove_wb(rdev, lo, hi);
- }
+ if (test_bit(CollisionCheck, &rdev->flags))
+ remove_serial(rdev, lo, hi);
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -527,7 +543,8 @@ static void raid1_end_write_request(struct bio *bio)
call_bio_endio(r1_bio);
}
}
- }
+ } else if (rdev->mddev->serialize_policy)
+ remove_serial(rdev, lo, hi);
if (r1_bio->bios[mirror] == NULL)
rdev_dec_pending(rdev, conf->mddev);
@@ -769,36 +786,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
-static int raid1_congested(struct mddev *mddev, int bits)
-{
- struct r1conf *conf = mddev->private;
- int i, ret = 0;
-
- if ((bits & (1 << WB_async_congested)) &&
- conf->pending_count >= max_queued_requests)
- return 1;
-
- rcu_read_lock();
- for (i = 0; i < conf->raid_disks * 2; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q = bdev_get_queue(rdev->bdev);
-
- BUG_ON(!q);
-
- /* Note the '|| 1' - when read_balance prefers
- * non-congested targets, it can be removed
- */
- if ((bits & (1 << WB_async_congested)) || 1)
- ret |= bdi_congested(q->backing_dev_info, bits);
- else
- ret &= bdi_congested(q->backing_dev_info, bits);
- }
- }
- rcu_read_unlock();
- return ret;
-}
-
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
@@ -817,8 +804,9 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio)
/* Just ignore it */
bio_endio(bio);
else
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = next;
+ cond_resched();
}
}
@@ -1294,7 +1282,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split);
bio_chain(split, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = split;
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
@@ -1320,7 +1308,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
trace_block_bio_remap(read_bio->bi_disk->queue, read_bio,
disk_devt(mddev->gendisk), r1_bio->sector);
- generic_make_request(read_bio);
+ submit_bio_noacct(read_bio);
}
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
@@ -1465,7 +1453,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
struct bio *split = bio_split(bio, max_sectors,
GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = split;
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
@@ -1478,6 +1466,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
for (i = 0; i < disks; i++) {
struct bio *mbio = NULL;
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
if (!r1_bio->bios[i])
continue;
@@ -1505,18 +1494,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) {
- struct md_rdev *rdev = conf->mirrors[i].rdev;
-
- if (test_bit(WBCollisionCheck, &rdev->flags)) {
- sector_t lo = r1_bio->sector;
- sector_t hi = r1_bio->sector + r1_bio->sectors;
-
- wait_event(rdev->wb_io_wait,
- check_and_add_wb(rdev, lo, hi) == 0);
- }
+ if (test_bit(CollisionCheck, &rdev->flags))
+ wait_for_serialization(rdev, r1_bio);
if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
- }
+ } else if (mddev->serialize_policy)
+ wait_for_serialization(rdev, r1_bio);
r1_bio->bios[i] = mbio;
@@ -2227,7 +2210,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
atomic_inc(&r1_bio->remaining);
md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
- generic_make_request(wbio);
+ submit_bio_noacct(wbio);
}
put_sync_write_buf(r1_bio, 1);
@@ -2913,7 +2896,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
} else {
@@ -2922,7 +2905,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
return nr_sectors;
}
@@ -3383,7 +3366,6 @@ static struct md_personality raid1_personality =
.check_reshape = raid1_reshape,
.quiesce = raid1_quiesce,
.takeover = raid1_takeover,
- .congested = raid1_congested,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e7ccad8..b7eb09e 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -180,7 +180,7 @@ struct r1bio {
* if the IO is in WRITE direction, then multiple bios are used.
* We choose the number when they are allocated.
*/
- struct bio *bios[0];
+ struct bio *bios[];
/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
};
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index deddabf..70dccc3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -848,31 +848,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
return rdev;
}
-static int raid10_congested(struct mddev *mddev, int bits)
-{
- struct r10conf *conf = mddev->private;
- int i, ret = 0;
-
- if ((bits & (1 << WB_async_congested)) &&
- conf->pending_count >= max_queued_requests)
- return 1;
-
- rcu_read_lock();
- for (i = 0;
- (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
- && ret == 0;
- i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q = bdev_get_queue(rdev->bdev);
-
- ret |= bdi_congested(q->backing_dev_info, bits);
- }
- }
- rcu_read_unlock();
- return ret;
-}
-
static void flush_pending_writes(struct r10conf *conf)
{
/* Any writes that have been queued but are awaiting
@@ -917,7 +892,7 @@ static void flush_pending_writes(struct r10conf *conf)
/* Just ignore it */
bio_endio(bio);
else
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = next;
}
blk_finish_plug(&plug);
@@ -980,6 +955,7 @@ static void wait_barrier(struct r10conf *conf)
{
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
+ struct bio_list *bio_list = current->bio_list;
conf->nr_waiting++;
/* Wait for the barrier to drop.
* However if there are already pending
@@ -994,9 +970,16 @@ static void wait_barrier(struct r10conf *conf)
wait_event_lock_irq(conf->wait_barrier,
!conf->barrier ||
(atomic_read(&conf->nr_pending) &&
- current->bio_list &&
- (!bio_list_empty(&current->bio_list[0]) ||
- !bio_list_empty(&current->bio_list[1]))),
+ bio_list &&
+ (!bio_list_empty(&bio_list[0]) ||
+ !bio_list_empty(&bio_list[1]))) ||
+ /* move on if recovery thread is
+ * blocked by us
+ */
+ (conf->mddev->thread->tsk == current &&
+ test_bit(MD_RECOVERY_RUNNING,
+ &conf->mddev->recovery) &&
+ conf->nr_queued > 0),
conf->resync_lock);
conf->nr_waiting--;
if (!conf->nr_waiting)
@@ -1102,7 +1085,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* Just ignore it */
bio_endio(bio);
else
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = next;
}
kfree(plug);
@@ -1194,7 +1177,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
gfp, &conf->bio_split);
bio_chain(split, bio);
allow_barrier(conf);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
wait_barrier(conf);
bio = split;
r10_bio->master_bio = bio;
@@ -1221,7 +1204,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
trace_block_bio_remap(read_bio->bi_disk->queue,
read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
- generic_make_request(read_bio);
+ submit_bio_noacct(read_bio);
return;
}
@@ -1479,7 +1462,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
allow_barrier(conf);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
wait_barrier(conf);
bio = split;
r10_bio->master_bio = bio;
@@ -2100,7 +2083,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
tbio->bi_opf |= MD_FAILFAST;
tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
- generic_make_request(tbio);
+ submit_bio_noacct(tbio);
}
/* Now write out to any replacement devices
@@ -2119,7 +2102,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
atomic_inc(&r10_bio->remaining);
md_sync_acct(conf->mirrors[d].replacement->bdev,
bio_sectors(tbio));
- generic_make_request(tbio);
+ submit_bio_noacct(tbio);
}
done:
@@ -2242,7 +2225,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
wbio = r10_bio->devs[1].bio;
wbio2 = r10_bio->devs[1].repl_bio;
/* Need to test wbio2->bi_end_io before we call
- * generic_make_request as if the former is NULL,
+ * submit_bio_noacct as if the former is NULL,
* the latter is free to free wbio2.
*/
if (wbio2 && !wbio2->bi_end_io)
@@ -2250,13 +2233,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
- generic_make_request(wbio);
+ submit_bio_noacct(wbio);
}
if (wbio2) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
md_sync_acct(conf->mirrors[d].replacement->bdev,
bio_sectors(wbio2));
- generic_make_request(wbio2);
+ submit_bio_noacct(wbio2);
}
}
@@ -2890,7 +2873,7 @@ static void raid10_set_cluster_sync_high(struct r10conf *conf)
* a number of r10_bio structures, one for each out-of-sync device.
* As we setup these structures, we collect all bio's together into a list
* which we then process collectively to add pages, and then process again
- * to pass to generic_make_request.
+ * to pass to submit_bio_noacct.
*
* The r10_bio structures are linked using a borrowed master_bio pointer.
* This link is counted in ->remaining. When the r10_bio that points to NULL
@@ -3497,7 +3480,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (bio->bi_end_io == end_sync_read) {
md_sync_acct_bio(bio, nr_sectors);
bio->bi_status = 0;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
@@ -3721,10 +3704,20 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
+static void raid10_set_io_opt(struct r10conf *conf)
+{
+ int raid_disks = conf->geo.raid_disks;
+
+ if (!(conf->geo.raid_disks % conf->geo.near_copies))
+ raid_disks /= conf->geo.near_copies;
+ blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
+ raid_disks);
+}
+
static int raid10_run(struct mddev *mddev)
{
struct r10conf *conf;
- int i, disk_idx, chunk_size;
+ int i, disk_idx;
struct raid10_info *disk;
struct md_rdev *rdev;
sector_t size;
@@ -3760,18 +3753,13 @@ static int raid10_run(struct mddev *mddev)
mddev->thread = conf->thread;
conf->thread = NULL;
- chunk_size = mddev->chunk_sectors << 9;
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
- blk_queue_io_min(mddev->queue, chunk_size);
- if (conf->geo.raid_disks % conf->geo.near_copies)
- blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
- else
- blk_queue_io_opt(mddev->queue, chunk_size *
- (conf->geo.raid_disks / conf->geo.near_copies));
+ blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+ raid10_set_io_opt(conf);
}
rdev_for_each(rdev, mddev) {
@@ -3886,19 +3874,6 @@ static int raid10_run(struct mddev *mddev)
mddev->resync_max_sectors = size;
set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
- if (mddev->queue) {
- int stripe = conf->geo.raid_disks *
- ((mddev->chunk_sectors << 9) / PAGE_SIZE);
-
- /* Calculate max read-ahead size.
- * We need to readahead at least twice a whole stripe....
- * maybe...
- */
- stripe /= conf->geo.near_copies;
- if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
- mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
- }
-
if (md_integrity_register(mddev))
goto out_free_conf;
@@ -4308,8 +4283,8 @@ static int raid10_start_reshape(struct mddev *mddev)
else
rdev->recovery_offset = 0;
- if (sysfs_link_rdev(mddev, rdev))
- /* Failure here is OK */;
+ /* Failure here is OK */
+ sysfs_link_rdev(mddev, rdev);
}
} else if (rdev->raid_disk >= conf->prev.raid_disks
&& !test_bit(Faulty, &rdev->flags)) {
@@ -4455,7 +4430,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
sector_nr = conf->reshape_progress;
if (sector_nr) {
mddev->curr_resync_completed = sector_nr;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
*skipped = 1;
return sector_nr;
}
@@ -4484,8 +4459,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
last = conf->reshape_progress - 1;
sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
& conf->prev.chunk_mask);
- if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
- sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
+ if (sector_nr + RESYNC_SECTORS < last)
+ sector_nr = last + 1 - RESYNC_SECTORS;
} else {
/* 'next' is after the last device address that we
* might write to for this chunk in the new layout
@@ -4507,8 +4482,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
last = sector_nr | (conf->geo.chunk_mask
& conf->prev.chunk_mask);
- if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
- last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
+ if (sector_nr + RESYNC_SECTORS <= last)
+ last = sector_nr + RESYNC_SECTORS - 1;
}
if (need_flush ||
@@ -4655,7 +4630,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
md_sync_acct_bio(read_bio, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
- generic_make_request(read_bio);
+ submit_bio_noacct(read_bio);
sectors_done += nr_sectors;
if (sector_nr <= last)
goto read_more;
@@ -4718,7 +4693,7 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
md_sync_acct_bio(b, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
b->bi_next = NULL;
- generic_make_request(b);
+ submit_bio_noacct(b);
}
end_reshape_request(r10_bio);
}
@@ -4736,16 +4711,8 @@ static void end_reshape(struct r10conf *conf)
conf->reshape_safe = MaxSector;
spin_unlock_irq(&conf->device_lock);
- /* read-ahead size must cover two whole stripes, which is
- * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
- */
- if (conf->mddev->queue) {
- int stripe = conf->geo.raid_disks *
- ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
- stripe /= conf->geo.near_copies;
- if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
- conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
- }
+ if (conf->mddev->queue)
+ raid10_set_io_opt(conf);
conf->fullsync = 0;
}
@@ -4930,7 +4897,6 @@ static struct md_personality raid10_personality =
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
.update_reshape_pos = raid10_update_reshape_pos,
- .congested = raid10_congested,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index d3eaaf3..79cd2b7 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -153,7 +153,7 @@ struct r10bio {
};
sector_t addr;
int devnum;
- } devs[0];
+ } devs[];
};
/* bits for r10bio.state */
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 9b6da75..4337ae0 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -195,9 +195,7 @@ struct r5l_log {
static inline sector_t r5c_tree_index(struct r5conf *conf,
sector_t sect)
{
- sector_t offset;
-
- offset = sector_div(sect, conf->chunk_sectors);
+ sector_div(sect, conf->chunk_sectors);
return sect;
}
@@ -298,8 +296,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
- wbi2 = r5_next_bio(wbi, dev->sector);
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
+ wbi2 = r5_next_bio(conf, wbi, dev->sector);
md_write_end(conf->mddev);
bio_endio(wbi);
wbi = wbi2;
@@ -316,7 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
}
@@ -364,7 +362,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
*/
if (atomic_read(&conf->r5c_cached_full_stripes) >=
min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
- conf->chunk_sectors >> STRIPE_SHIFT))
+ conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
r5l_wake_reclaim(conf->log, 0);
}
@@ -2430,10 +2428,15 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
struct stripe_head *sh, *next;
+ bool cleared_pending = false;
if (ctx->data_only_stripes == 0)
return;
+ if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
+ cleared_pending = true;
+ clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
+ }
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
@@ -2448,6 +2451,8 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
atomic_read(&conf->active_stripes) == 0);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+ if (cleared_pending)
+ set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
}
static int r5l_recovery_log(struct r5l_log *log)
@@ -2532,13 +2537,10 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
struct r5conf *conf;
int ret;
- ret = mddev_lock(mddev);
- if (ret)
- return ret;
-
+ spin_lock(&mddev->lock);
conf = mddev->private;
if (!conf || !conf->log) {
- mddev_unlock(mddev);
+ spin_unlock(&mddev->lock);
return 0;
}
@@ -2558,7 +2560,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
default:
ret = 0;
}
- mddev_unlock(mddev);
+ spin_unlock(&mddev->lock);
return ret;
}
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 18a4064..d0f5402 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -324,7 +324,7 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
* be just after the last logged stripe and write to the same
* disks. Use bit shift and logarithm to avoid 64-bit division.
*/
- if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
+ if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) &&
(data_sector >> ilog2(conf->chunk_sectors) ==
data_sector_last >> ilog2(conf->chunk_sectors)) &&
((data_sector - data_sector_last) * data_disks ==
@@ -844,9 +844,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
/* if start and end is 4k aligned, use a 4k block */
if (block_size == 512 &&
- (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
- (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
- block_size = STRIPE_SIZE;
+ (r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 &&
+ (r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0)
+ block_size = RAID5_STRIPE_SIZE(conf);
/* iterate through blocks in strip */
for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
@@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
}
/* flush the disk cache after recovery if necessary */
- ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
+ ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL);
out:
__free_page(page);
return ret;
@@ -1274,7 +1274,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
if (ppl_data_sectors > 0)
- ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
+ ppl_data_sectors = rounddown(ppl_data_sectors,
+ RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private));
if (ppl_data_sectors <= 0) {
pr_warn("md/raid:%s: PPL space too small on %s\n",
@@ -1360,7 +1361,7 @@ int ppl_init_log(struct r5conf *conf)
return -EINVAL;
}
- max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) *
+ max_disks = sizeof_field(struct ppl_log, disk_flush_bitmap) *
BITS_PER_BYTE;
if (conf->raid_disks > max_disks) {
pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n",
@@ -1404,7 +1405,7 @@ int ppl_init_log(struct r5conf *conf)
atomic64_set(&ppl_conf->seq, 0);
INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
spin_lock_init(&ppl_conf->no_mem_stripes_lock);
- ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET;
+ ppl_conf->write_hint = RWH_WRITE_LIFE_NOT_SET;
if (!mddev->external) {
ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 08a7f97..c82953a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -69,13 +69,13 @@ static struct workqueue_struct *raid5_wq;
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
- int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
+ int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
return &conf->stripe_hashtbl[hash];
}
-static inline int stripe_hash_locks_hash(sector_t sect)
+static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
{
- return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+ return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
}
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
@@ -448,13 +448,74 @@ static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
return sh;
}
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+static void free_stripe_pages(struct stripe_head *sh)
+{
+ int i;
+ struct page *p;
+
+ /* Have not allocate page pool */
+ if (!sh->pages)
+ return;
+
+ for (i = 0; i < sh->nr_pages; i++) {
+ p = sh->pages[i];
+ if (p)
+ put_page(p);
+ sh->pages[i] = NULL;
+ }
+}
+
+static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
+{
+ int i;
+ struct page *p;
+
+ for (i = 0; i < sh->nr_pages; i++) {
+ /* The page have allocated. */
+ if (sh->pages[i])
+ continue;
+
+ p = alloc_page(gfp);
+ if (!p) {
+ free_stripe_pages(sh);
+ return -ENOMEM;
+ }
+ sh->pages[i] = p;
+ }
+ return 0;
+}
+
+static int
+init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
+{
+ int nr_pages, cnt;
+
+ if (sh->pages)
+ return 0;
+
+ /* Each of the sh->dev[i] need one conf->stripe_size */
+ cnt = PAGE_SIZE / conf->stripe_size;
+ nr_pages = (disks + cnt - 1) / cnt;
+
+ sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!sh->pages)
+ return -ENOMEM;
+ sh->nr_pages = nr_pages;
+ sh->stripes_per_page = cnt;
+ return 0;
+}
+#endif
+
static void shrink_buffers(struct stripe_head *sh)
{
- struct page *p;
int i;
int num = sh->raid_conf->pool_size;
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
for (i = 0; i < num ; i++) {
+ struct page *p;
+
WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
p = sh->dev[i].page;
if (!p)
@@ -462,6 +523,11 @@ static void shrink_buffers(struct stripe_head *sh)
sh->dev[i].page = NULL;
put_page(p);
}
+#else
+ for (i = 0; i < num; i++)
+ sh->dev[i].page = NULL;
+ free_stripe_pages(sh); /* Free pages */
+#endif
}
static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -469,6 +535,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
int i;
int num = sh->raid_conf->pool_size;
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
for (i = 0; i < num; i++) {
struct page *page;
@@ -477,8 +544,18 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
}
sh->dev[i].page = page;
sh->dev[i].orig_page = page;
+ sh->dev[i].offset = 0;
}
+#else
+ if (alloc_stripe_pages(sh, gfp))
+ return -ENOMEM;
+ for (i = 0; i < num; i++) {
+ sh->dev[i].page = raid5_get_dev_page(sh, i);
+ sh->dev[i].orig_page = sh->dev[i].page;
+ sh->dev[i].offset = raid5_get_page_offset(sh, i);
+ }
+#endif
return 0;
}
@@ -627,7 +704,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
- int hash = stripe_hash_locks_hash(sector);
+ int hash = stripe_hash_locks_hash(conf, sector);
int inc_empty_inactive_list_flag;
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
@@ -748,9 +825,9 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
tmp_sec = sh->sector;
if (!sector_div(tmp_sec, conf->chunk_sectors))
return;
- head_sector = sh->sector - STRIPE_SECTORS;
+ head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
- hash = stripe_hash_locks_hash(head_sector);
+ hash = stripe_hash_locks_hash(conf, head_sector);
spin_lock_irq(conf->hash_locks + hash);
head = __find_stripe(conf, head_sector, conf->generation);
if (head && !atomic_inc_not_zero(&head->count)) {
@@ -873,10 +950,11 @@ static void dispatch_bio_list(struct bio_list *tmp)
struct bio *bio;
while ((bio = bio_list_pop(tmp)))
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
-static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+static int cmp_stripe(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
const struct r5pending_data *da = list_entry(a,
struct r5pending_data, sibling);
@@ -1057,7 +1135,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
int bad_sectors;
- int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (!bad)
break;
@@ -1089,7 +1167,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
- md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+ md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -1129,12 +1207,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
else
sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
- bi->bi_io_vec[0].bv_offset = 0;
- bi->bi_iter.bi_size = STRIPE_SIZE;
+ bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
+ bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
+ bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
@@ -1151,12 +1229,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, bi);
else
- generic_make_request(bi);
+ submit_bio_noacct(bi);
}
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
- md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
+ md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -1183,11 +1261,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
sh->dev[i].rvec.bv_page = sh->dev[i].page;
rbi->bi_vcnt = 1;
- rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
- rbi->bi_io_vec[0].bv_offset = 0;
- rbi->bi_iter.bi_size = STRIPE_SIZE;
+ rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
+ rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
+ rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
rbi->bi_write_hint = sh->dev[i].write_hint;
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
@@ -1201,7 +1279,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, rbi);
else
- generic_make_request(rbi);
+ submit_bio_noacct(rbi);
}
if (!rdev && !rrdev) {
if (op_is_write(op))
@@ -1226,7 +1304,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page **page,
- sector_t sector, struct dma_async_tx_descriptor *tx,
+ unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
struct stripe_head *sh, int no_skipcopy)
{
struct bio_vec bvl;
@@ -1235,6 +1313,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
int page_offset;
struct async_submit_ctl submit;
enum async_tx_flags flags = 0;
+ struct r5conf *conf = sh->raid_conf;
if (bio->bi_iter.bi_sector >= sector)
page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
@@ -1256,8 +1335,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
len -= b_offset;
}
- if (len > 0 && page_offset + len > STRIPE_SIZE)
- clen = STRIPE_SIZE - page_offset;
+ if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
+ clen = RAID5_STRIPE_SIZE(conf) - page_offset;
else
clen = len;
@@ -1265,17 +1344,17 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
b_offset += bvl.bv_offset;
bio_page = bvl.bv_page;
if (frombio) {
- if (sh->raid_conf->skip_copy &&
+ if (conf->skip_copy &&
b_offset == 0 && page_offset == 0 &&
- clen == STRIPE_SIZE &&
+ clen == RAID5_STRIPE_SIZE(conf) &&
!no_skipcopy)
*page = bio_page;
else
- tx = async_memcpy(*page, bio_page, page_offset,
+ tx = async_memcpy(*page, bio_page, page_offset + poff,
b_offset, clen, &submit);
} else
tx = async_memcpy(bio_page, *page, b_offset,
- page_offset, clen, &submit);
+ page_offset + poff, clen, &submit);
}
/* chain the operations */
submit.depend_tx = tx;
@@ -1292,6 +1371,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
int i;
+ struct r5conf *conf = sh->raid_conf;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@@ -1312,8 +1392,8 @@ static void ops_complete_biofill(void *stripe_head_ref)
rbi = dev->read;
dev->read = NULL;
while (rbi && rbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
- rbi2 = r5_next_bio(rbi, dev->sector);
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
+ rbi2 = r5_next_bio(conf, rbi, dev->sector);
bio_endio(rbi);
rbi = rbi2;
}
@@ -1330,6 +1410,7 @@ static void ops_run_biofill(struct stripe_head *sh)
struct dma_async_tx_descriptor *tx = NULL;
struct async_submit_ctl submit;
int i;
+ struct r5conf *conf = sh->raid_conf;
BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu\n", __func__,
@@ -1344,10 +1425,11 @@ static void ops_run_biofill(struct stripe_head *sh)
dev->toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
while (rbi && rbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
tx = async_copy_data(0, rbi, &dev->page,
+ dev->offset,
dev->sector, tx, sh, 0);
- rbi = r5_next_bio(rbi, dev->sector);
+ rbi = r5_next_bio(conf, rbi, dev->sector);
}
}
}
@@ -1401,14 +1483,25 @@ static addr_conv_t *to_addr_conv(struct stripe_head *sh,
return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
}
+/*
+ * Return a pointer to record offset address.
+ */
+static unsigned int *
+to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+ return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
+}
+
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
int disks = sh->disks;
struct page **xor_srcs = to_addr_page(percpu, 0);
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
int target = sh->ops.target;
struct r5dev *tgt = &sh->dev[target];
struct page *xor_dest = tgt->page;
+ unsigned int off_dest = tgt->offset;
int count = 0;
struct dma_async_tx_descriptor *tx;
struct async_submit_ctl submit;
@@ -1420,24 +1513,30 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
__func__, (unsigned long long)sh->sector, target);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
- for (i = disks; i--; )
- if (i != target)
+ for (i = disks; i--; ) {
+ if (i != target) {
+ off_srcs[count] = sh->dev[i].offset;
xor_srcs[count++] = sh->dev[i].page;
+ }
+ }
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
if (unlikely(count == 1))
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
else
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
/* set_syndrome_sources - populate source buffers for gen_syndrome
* @srcs - (struct page *) array of size sh->disks
+ * @offs - (unsigned int) array of offset for each page
* @sh - stripe_head to parse
*
* Populates srcs in proper layout order for the stripe and returns the
@@ -1446,6 +1545,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
* is recorded in srcs[count+1]].
*/
static int set_syndrome_sources(struct page **srcs,
+ unsigned int *offs,
struct stripe_head *sh,
int srctype)
{
@@ -1476,6 +1576,12 @@ static int set_syndrome_sources(struct page **srcs,
srcs[slot] = sh->dev[i].orig_page;
else
srcs[slot] = sh->dev[i].page;
+ /*
+ * For R5_InJournal, PAGE_SIZE must be 4KB and will
+ * not shared page. In that case, dev[i].offset
+ * is 0.
+ */
+ offs[slot] = sh->dev[i].offset;
}
i = raid6_next_disk(i, disks);
} while (i != d0_idx);
@@ -1488,12 +1594,14 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
int disks = sh->disks;
struct page **blocks = to_addr_page(percpu, 0);
+ unsigned int *offs = to_addr_offs(sh, percpu);
int target;
int qd_idx = sh->qd_idx;
struct dma_async_tx_descriptor *tx;
struct async_submit_ctl submit;
struct r5dev *tgt;
struct page *dest;
+ unsigned int dest_off;
int i;
int count;
@@ -1512,30 +1620,34 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
tgt = &sh->dev[target];
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
dest = tgt->page;
+ dest_off = tgt->offset;
atomic_inc(&sh->count);
if (target == qd_idx) {
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
blocks[count] = NULL; /* regenerating p is not necessary */
BUG_ON(blocks[count+1] != dest); /* q should already be set */
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, offs, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
} else {
/* Compute any data- or p-drive using XOR */
count = 0;
for (i = disks; i-- ; ) {
if (i == target || i == qd_idx)
continue;
+ offs[count] = sh->dev[i].offset;
blocks[count++] = sh->dev[i].page;
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
}
return tx;
@@ -1554,6 +1666,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
struct r5dev *tgt2 = &sh->dev[target2];
struct dma_async_tx_descriptor *tx;
struct page **blocks = to_addr_page(percpu, 0);
+ unsigned int *offs = to_addr_offs(sh, percpu);
struct async_submit_ctl submit;
BUG_ON(sh->batch_head);
@@ -1566,13 +1679,16 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
/* we need to open-code set_syndrome_sources to handle the
* slot number conversion for 'faila' and 'failb'
*/
- for (i = 0; i < disks ; i++)
+ for (i = 0; i < disks ; i++) {
+ offs[i] = 0;
blocks[i] = NULL;
+ }
count = 0;
i = d0_idx;
do {
int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+ offs[slot] = sh->dev[i].offset;
blocks[slot] = sh->dev[i].page;
if (i == target)
@@ -1597,10 +1713,12 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
- return async_gen_syndrome(blocks, 0, syndrome_disks+2,
- STRIPE_SIZE, &submit);
+ return async_gen_syndrome(blocks, offs, syndrome_disks+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ &submit);
} else {
struct page *dest;
+ unsigned int dest_off;
int data_target;
int qd_idx = sh->qd_idx;
@@ -1614,22 +1732,26 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
for (i = disks; i-- ; ) {
if (i == data_target || i == qd_idx)
continue;
+ offs[count] = sh->dev[i].offset;
blocks[count++] = sh->dev[i].page;
}
dest = sh->dev[data_target].page;
+ dest_off = sh->dev[data_target].offset;
init_async_submit(&submit,
ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, NULL, NULL,
to_addr_conv(sh, percpu, 0));
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
&submit);
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
init_async_submit(&submit, ASYNC_TX_FENCE, tx,
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
- return async_gen_syndrome(blocks, 0, count+2,
- STRIPE_SIZE, &submit);
+ return async_gen_syndrome(blocks, offs, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ &submit);
}
} else {
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1638,13 +1760,15 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
if (failb == syndrome_disks) {
/* We're missing D+P. */
return async_raid6_datap_recov(syndrome_disks+2,
- STRIPE_SIZE, faila,
- blocks, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ faila,
+ blocks, offs, &submit);
} else {
/* We're missing D+D. */
return async_raid6_2data_recov(syndrome_disks+2,
- STRIPE_SIZE, faila, failb,
- blocks, &submit);
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ faila, failb,
+ blocks, offs, &submit);
}
}
}
@@ -1670,10 +1794,12 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
{
int disks = sh->disks;
struct page **xor_srcs = to_addr_page(percpu, 0);
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
/* existing parity data subtracted */
+ unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
BUG_ON(sh->batch_head);
@@ -1683,15 +1809,23 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
/* Only process blocks that are known to be uptodate */
- if (test_bit(R5_InJournal, &dev->flags))
+ if (test_bit(R5_InJournal, &dev->flags)) {
+ /*
+ * For this case, PAGE_SIZE must be equal to 4KB and
+ * page offset is zero.
+ */
+ off_srcs[count] = dev->offset;
xor_srcs[count++] = dev->orig_page;
- else if (test_bit(R5_Wantdrain, &dev->flags))
+ } else if (test_bit(R5_Wantdrain, &dev->flags)) {
+ off_srcs[count] = dev->offset;
xor_srcs[count++] = dev->page;
+ }
}
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
@@ -1701,17 +1835,19 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
struct dma_async_tx_descriptor *tx)
{
struct page **blocks = to_addr_page(percpu, 0);
+ unsigned int *offs = to_addr_offs(sh, percpu);
int count;
struct async_submit_ctl submit;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, offs, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
@@ -1752,7 +1888,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
WARN_ON(dev->page != dev->orig_page);
while (wbi && wbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
if (wbi->bi_opf & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_opf & REQ_SYNC)
@@ -1761,6 +1897,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
set_bit(R5_Discard, &dev->flags);
else {
tx = async_copy_data(1, wbi, &dev->page,
+ dev->offset,
dev->sector, tx, sh,
r5c_is_writeback(conf->log));
if (dev->page != dev->orig_page &&
@@ -1770,7 +1907,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
clear_bit(R5_OVERWRITE, &dev->flags);
}
}
- wbi = r5_next_bio(wbi, dev->sector);
+ wbi = r5_next_bio(conf, wbi, dev->sector);
}
if (head_sh->batch_head) {
@@ -1840,9 +1977,11 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
{
int disks = sh->disks;
struct page **xor_srcs;
+ unsigned int *off_srcs;
struct async_submit_ctl submit;
int count, pd_idx = sh->pd_idx, i;
struct page *xor_dest;
+ unsigned int off_dest;
int prexor = 0;
unsigned long flags;
int j = 0;
@@ -1867,24 +2006,31 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
again:
count = 0;
xor_srcs = to_addr_page(percpu, j);
+ off_srcs = to_addr_offs(sh, percpu);
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
prexor = 1;
+ off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (head_sh->dev[i].written ||
- test_bit(R5_InJournal, &head_sh->dev[i].flags))
+ test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
+ off_srcs[count] = dev->offset;
xor_srcs[count++] = dev->page;
+ }
}
} else {
xor_dest = sh->dev[pd_idx].page;
+ off_dest = sh->dev[pd_idx].offset;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (i != pd_idx)
+ if (i != pd_idx) {
+ off_srcs[count] = dev->offset;
xor_srcs[count++] = dev->page;
+ }
}
}
@@ -1910,9 +2056,11 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
}
if (unlikely(count == 1))
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
else
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
if (!last_stripe) {
j++;
sh = list_first_entry(&sh->batch_list, struct stripe_head,
@@ -1927,6 +2075,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
{
struct async_submit_ctl submit;
struct page **blocks;
+ unsigned int *offs;
int count, i, j = 0;
struct stripe_head *head_sh = sh;
int last_stripe;
@@ -1951,6 +2100,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
again:
blocks = to_addr_page(percpu, j);
+ offs = to_addr_offs(sh, percpu);
if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
synflags = SYNDROME_SRC_WRITTEN;
@@ -1960,7 +2110,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
txflags = ASYNC_TX_ACK;
}
- count = set_syndrome_sources(blocks, sh, synflags);
+ count = set_syndrome_sources(blocks, offs, sh, synflags);
last_stripe = !head_sh->batch_head ||
list_first_entry(&sh->batch_list,
struct stripe_head, batch_list) == head_sh;
@@ -1972,7 +2122,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
} else
init_async_submit(&submit, 0, tx, NULL, NULL,
to_addr_conv(sh, percpu, j));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ tx = async_gen_syndrome(blocks, offs, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
if (!last_stripe) {
j++;
sh = list_first_entry(&sh->batch_list, struct stripe_head,
@@ -1999,7 +2150,9 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
struct page *xor_dest;
+ unsigned int off_dest;
struct page **xor_srcs = to_addr_page(percpu, 0);
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
struct dma_async_tx_descriptor *tx;
struct async_submit_ctl submit;
int count;
@@ -2011,16 +2164,20 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
BUG_ON(sh->batch_head);
count = 0;
xor_dest = sh->dev[pd_idx].page;
+ off_dest = sh->dev[pd_idx].offset;
+ off_srcs[count] = off_dest;
xor_srcs[count++] = xor_dest;
for (i = disks; i--; ) {
if (i == pd_idx || i == qd_idx)
continue;
+ off_srcs[count] = sh->dev[i].offset;
xor_srcs[count++] = sh->dev[i].page;
}
init_async_submit(&submit, 0, NULL, NULL, NULL,
to_addr_conv(sh, percpu, 0));
- tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+ tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
&sh->ops.zero_sum_result, &submit);
atomic_inc(&sh->count);
@@ -2031,6 +2188,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
struct page **srcs = to_addr_page(percpu, 0);
+ unsigned int *offs = to_addr_offs(sh, percpu);
struct async_submit_ctl submit;
int count;
@@ -2038,15 +2196,16 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
(unsigned long long)sh->sector, checkp);
BUG_ON(sh->batch_head);
- count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
+ count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
if (!checkp)
srcs[count] = NULL;
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
sh, to_addr_conv(sh, percpu, 0));
- async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
- &sh->ops.zero_sum_result, percpu->spare_page, &submit);
+ async_syndrome_val(srcs, offs, count+2,
+ RAID5_STRIPE_SIZE(sh->raid_conf),
+ &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
}
static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
@@ -2123,6 +2282,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
{
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ kfree(sh->pages);
+#endif
if (sh->ppl_page)
__free_page(sh->ppl_page);
kmem_cache_free(sc, sh);
@@ -2156,9 +2318,15 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
sh->ppl_page = alloc_page(gfp);
if (!sh->ppl_page) {
free_stripe(sc, sh);
- sh = NULL;
+ return NULL;
}
}
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ if (init_stripe_shared_pages(sh, conf, disks)) {
+ free_stripe(sc, sh);
+ return NULL;
+ }
+#endif
}
return sh;
}
@@ -2215,10 +2383,13 @@ static int grow_stripes(struct r5conf *conf, int num)
}
/**
- * scribble_len - return the required size of the scribble region
- * @num - total number of disks in the array
+ * scribble_alloc - allocate percpu scribble buffer for required size
+ * of the scribble region
+ * @percpu: from for_each_present_cpu() of the caller
+ * @num: total number of disks in the array
+ * @cnt: scribble objs count for required size of the scribble region
*
- * The size must be enough to contain:
+ * The scribble buffer size must be enough to contain:
* 1/ a struct page pointer for each device in the array +2
* 2/ room to convert each entry in (1) to its corresponding dma
* (dma_map_page()) or page (page_address()) address.
@@ -2231,8 +2402,9 @@ static int scribble_alloc(struct raid5_percpu *percpu,
int num, int cnt)
{
size_t obj_size =
- sizeof(struct page *) * (num+2) +
- sizeof(addr_conv_t) * (num+2);
+ sizeof(struct page *) * (num + 2) +
+ sizeof(addr_conv_t) * (num + 2) +
+ sizeof(unsigned int) * (num + 2);
void *scribble;
/*
@@ -2272,7 +2444,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
percpu = per_cpu_ptr(conf->percpu, cpu);
err = scribble_alloc(percpu, new_disks,
- new_sectors / STRIPE_SECTORS);
+ new_sectors / RAID5_STRIPE_SECTORS(conf));
if (err)
break;
}
@@ -2364,9 +2536,16 @@ static int resize_stripes(struct r5conf *conf, int newsize)
osh = get_free_stripe(conf, hash);
unlock_device_hash_lock(conf, hash);
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ for (i = 0; i < osh->nr_pages; i++) {
+ nsh->pages[i] = osh->pages[i];
+ osh->pages[i] = NULL;
+ }
+#endif
for(i=0; i<conf->pool_size; i++) {
nsh->dev[i].page = osh->dev[i].page;
nsh->dev[i].orig_page = osh->dev[i].page;
+ nsh->dev[i].offset = osh->dev[i].offset;
}
nsh->hash_lock_index = hash;
free_stripe(conf->slab_cache, osh);
@@ -2415,14 +2594,33 @@ static int resize_stripes(struct r5conf *conf, int newsize)
nsh = list_entry(newstripes.next, struct stripe_head, lru);
list_del_init(&nsh->lru);
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ for (i = 0; i < nsh->nr_pages; i++) {
+ if (nsh->pages[i])
+ continue;
+ nsh->pages[i] = alloc_page(GFP_NOIO);
+ if (!nsh->pages[i])
+ err = -ENOMEM;
+ }
+
+ for (i = conf->raid_disks; i < newsize; i++) {
+ if (nsh->dev[i].page)
+ continue;
+ nsh->dev[i].page = raid5_get_dev_page(nsh, i);
+ nsh->dev[i].orig_page = nsh->dev[i].page;
+ nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
+ }
+#else
for (i=conf->raid_disks; i < newsize; i++)
if (nsh->dev[i].page == NULL) {
struct page *p = alloc_page(GFP_NOIO);
nsh->dev[i].page = p;
nsh->dev[i].orig_page = p;
+ nsh->dev[i].offset = 0;
if (!p)
err = -ENOMEM;
}
+#endif
raid5_release_stripe(nsh);
}
/* critical section pass, GFP_NOIO no longer needed */
@@ -2506,10 +2704,10 @@ static void raid5_end_read_request(struct bio * bi)
*/
pr_info_ratelimited(
"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
- mdname(conf->mddev), STRIPE_SECTORS,
+ mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
(unsigned long long)s,
bdevname(rdev->bdev, b));
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+ atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
@@ -2582,7 +2780,7 @@ static void raid5_end_read_request(struct bio * bi)
if (!(set_bad
&& test_bit(In_sync, &rdev->flags)
&& rdev_set_badblocks(
- rdev, sh->sector, STRIPE_SECTORS, 0)))
+ rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
md_error(conf->mddev, rdev);
}
}
@@ -2598,7 +2796,7 @@ static void raid5_end_write_request(struct bio *bi)
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
- struct md_rdev *uninitialized_var(rdev);
+ struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int replacement = 0;
@@ -2634,7 +2832,7 @@ static void raid5_end_write_request(struct bio *bi)
if (bi->bi_status)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
@@ -2646,7 +2844,7 @@ static void raid5_end_write_request(struct bio *bi)
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors)) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags))
@@ -3280,13 +3478,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
/* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
for (bi=sh->dev[dd_idx].towrite;
- sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
+ sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
bi && bi->bi_iter.bi_sector <= sector;
- bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
+ bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
if (bio_end_sector(bi) >= sector)
sector = bio_end_sector(bi);
}
- if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
+ if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
sh->overwrite_disks++;
}
@@ -3311,7 +3509,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0);
+ RAID5_STRIPE_SECTORS(conf), 0);
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
@@ -3373,7 +3571,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (!rdev_set_badblocks(
rdev,
sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
@@ -3393,8 +3591,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
wake_up(&conf->wait_for_overlap);
while (bi && bi->bi_iter.bi_sector <
- sh->dev[i].sector + STRIPE_SECTORS) {
- struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
+ struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
md_write_end(conf->mddev);
bio_io_error(bi);
@@ -3402,7 +3600,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0, 0);
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
@@ -3414,8 +3612,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
- sh->dev[i].sector + STRIPE_SECTORS) {
- struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
+ struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
md_write_end(conf->mddev);
bio_io_error(bi);
@@ -3438,9 +3636,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
s->to_read--;
while (bi && bi->bi_iter.bi_sector <
- sh->dev[i].sector + STRIPE_SECTORS) {
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *nextbi =
- r5_next_bio(bi, sh->dev[i].sector);
+ r5_next_bio(conf, bi, sh->dev[i].sector);
bio_io_error(bi);
bi = nextbi;
@@ -3448,7 +3646,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS, 0, 0);
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -3493,14 +3691,14 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
}
rcu_read_unlock();
@@ -3508,7 +3706,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
conf->recovery_disabled =
conf->mddev->recovery_disabled;
}
- md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
}
static int want_replace(struct stripe_head *sh, int disk_idx)
@@ -3535,6 +3733,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
&sh->dev[s->failed_num[1]] };
int i;
+ bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
if (test_bit(R5_LOCKED, &dev->flags) ||
@@ -3593,18 +3792,27 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
* devices must be read.
*/
return 1;
+
+ if (s->failed >= 2 &&
+ (fdev[i]->towrite ||
+ s->failed_num[i] == sh->pd_idx ||
+ s->failed_num[i] == sh->qd_idx) &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags))
+ /* In max degraded raid6, If the failed disk is P, Q,
+ * or we want to read the failed disk, we need to do
+ * reconstruct-write.
+ */
+ force_rcw = true;
}
- /* If we are forced to do a reconstruct-write, either because
- * the current RAID6 implementation only supports that, or
- * because parity cannot be trusted and we are currently
- * recovering it, there is extra need to be careful.
+ /* If we are forced to do a reconstruct-write, because parity
+ * cannot be trusted and we are currently recovering it, there
+ * is extra need to be careful.
* If one of the devices that we would need to read, because
* it is not being overwritten (and maybe not written at all)
* is missing/faulty, then we need to read everything we can.
*/
- if (sh->raid_conf->level != 6 &&
- sh->raid_conf->rmw_level != PARITY_DISABLE_RMW &&
+ if (!force_rcw &&
sh->sector < sh->raid_conf->mddev->recovery_cp)
/* reconstruct-write isn't being forced */
return 0;
@@ -3708,7 +3916,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
return 0;
}
-/**
+/*
* handle_stripe_fill - read or compute data to satisfy pending requests.
*/
static void handle_stripe_fill(struct stripe_head *sh,
@@ -3783,14 +3991,14 @@ static void handle_stripe_clean_event(struct r5conf *conf,
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
- dev->sector + STRIPE_SECTORS) {
- wbi2 = r5_next_bio(wbi, dev->sector);
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
+ wbi2 = r5_next_bio(conf, wbi, dev->sector);
md_write_end(conf->mddev);
bio_endio(wbi);
wbi = wbi2;
}
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
if (head_sh->batch_head) {
@@ -3974,10 +4182,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
- } else {
+ } else
set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
}
}
}
@@ -4002,10 +4208,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(R5_Wantread, &dev->flags);
s->locked++;
qread++;
- } else {
+ } else
set_bit(STRIPE_DELAYED, &sh->state);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
}
}
if (rcw && conf->mddev->queue)
@@ -4055,7 +4259,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
break;
}
dev = &sh->dev[s->failed_num[0]];
- /* fall through */
+ fallthrough;
case check_state_compute_result:
sh->check_state = check_state_idle;
if (!dev)
@@ -4097,7 +4301,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@@ -4105,7 +4309,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
"%llu-%llu\n", mdname(conf->mddev),
(unsigned long long) sh->sector,
(unsigned long long) sh->sector +
- STRIPE_SECTORS);
+ RAID5_STRIPE_SECTORS(conf));
} else {
sh->check_state = check_state_compute_run;
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
@@ -4186,7 +4390,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
/* we have 2-disk failure */
BUG_ON(s->failed != 2);
- /* fall through */
+ fallthrough;
case check_state_compute_result:
sh->check_state = check_state_idle;
@@ -4262,7 +4466,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
*/
}
} else {
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@@ -4270,7 +4474,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
"%llu-%llu\n", mdname(conf->mddev),
(unsigned long long) sh->sector,
(unsigned long long) sh->sector +
- STRIPE_SECTORS);
+ RAID5_STRIPE_SECTORS(conf));
} else {
int *target = &sh->ops.target;
@@ -4341,7 +4545,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
/* place all the copies on one channel */
init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
tx = async_memcpy(sh2->dev[dd_idx].page,
- sh->dev[i].page, 0, 0, STRIPE_SIZE,
+ sh->dev[i].page, sh2->dev[dd_idx].offset,
+ sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
&submit);
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
@@ -4440,8 +4645,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
*/
rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev && !test_bit(Faulty, &rdev->flags) &&
- rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
- !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
+ !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
set_bit(R5_ReadRepl, &dev->flags);
else {
@@ -4455,7 +4660,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev) {
- is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags)
@@ -4482,7 +4687,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
}
} else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
- else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
/* in sync if before recovery_offset */
set_bit(R5_Insync, &dev->flags);
else if (test_bit(R5_UPTODATE, &dev->flags) &&
@@ -4571,12 +4776,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
rcu_read_unlock();
}
+/*
+ * Return '1' if this is a member of a batch, or '0' if it is a lone stripe or
+ * a head which can now be handled.
+ */
static int clear_batch_ready(struct stripe_head *sh)
{
- /* Return '1' if this is a member of batch, or
- * '0' if it is a lone stripe or a head which can now be
- * handled.
- */
struct stripe_head *tmp;
if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
return (sh->batch_head && sh->batch_head != sh);
@@ -4680,6 +4885,16 @@ static void handle_stripe(struct stripe_head *sh)
struct r5dev *pdev, *qdev;
clear_bit(STRIPE_HANDLE, &sh->state);
+
+ /*
+ * handle_stripe should not continue handling a batched stripe; only
+ * the head of a batch list or a lone stripe can continue. Otherwise
+ * break_stripe_batch_list() could warn that STRIPE_ACTIVE is set for
+ * the batched stripe.
+ */
+ if (clear_batch_ready(sh))
+ return;
+
if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
/* already being handled, ensure it gets handled
* again when current action finishes */
@@ -4687,11 +4902,6 @@ static void handle_stripe(struct stripe_head *sh)
return;
}
- if (clear_batch_ready(sh) ) {
- clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
- return;
- }
-
if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
break_stripe_batch_list(sh, 0);
@@ -4925,7 +5135,7 @@ static void handle_stripe(struct stripe_head *sh)
if ((s.syncing || s.replacing) && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_INSYNC, &sh->state)) {
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
@@ -4944,14 +5154,11 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
set_bit(R5_ReWrite, &dev->flags);
- set_bit(R5_LOCKED, &dev->flags);
- s.locked++;
- } else {
+ } else
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
- set_bit(R5_LOCKED, &dev->flags);
- s.locked++;
- }
+ set_bit(R5_LOCKED, &dev->flags);
+ s.locked++;
}
}
@@ -4993,7 +5200,7 @@ static void handle_stripe(struct stripe_head *sh)
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
if (s.expanding && s.locked == 0 &&
@@ -5023,14 +5230,14 @@ static void handle_stripe(struct stripe_head *sh)
/* We own a safe reference to the rdev */
rdev = conf->disks[i].rdev;
if (!rdev_set_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0))
+ RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0);
+ RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
@@ -5039,7 +5246,7 @@ static void handle_stripe(struct stripe_head *sh)
/* rdev have been moved down */
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
- STRIPE_SECTORS, 0);
+ RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
}
@@ -5097,28 +5304,6 @@ static void activate_bit_delay(struct r5conf *conf,
}
}
-static int raid5_congested(struct mddev *mddev, int bits)
-{
- struct r5conf *conf = mddev->private;
-
- /* No difference between reads and writes. Just check
- * how busy the stripe_cache is
- */
-
- if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
- return 1;
-
- /* Also checks whether there is pressure on r5cache log space */
- if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
- return 1;
- if (conf->quiesce)
- return 1;
- if (atomic_read(&conf->empty_inactive_list_nr))
- return 1;
-
- return 0;
-}
-
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
struct r5conf *conf = mddev->private;
@@ -5287,7 +5472,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
trace_block_bio_remap(align_bi->bi_disk->queue,
align_bi, disk_devt(mddev->gendisk),
raid_bio->bi_iter.bi_sector);
- generic_make_request(align_bi);
+ submit_bio_noacct(align_bi);
return 1;
} else {
rcu_read_unlock();
@@ -5307,7 +5492,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
struct r5conf *conf = mddev->private;
split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
bio_chain(split, raid_bio);
- generic_make_request(raid_bio);
+ submit_bio_noacct(raid_bio);
raid_bio = split;
}
@@ -5503,7 +5688,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
/* Skip discard while reshape is happening */
return;
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
@@ -5518,7 +5703,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
last_sector *= conf->chunk_sectors;
for (; logical_sector < last_sector;
- logical_sector += STRIPE_SECTORS) {
+ logical_sector += RAID5_STRIPE_SECTORS(conf)) {
DEFINE_WAIT(w);
int d;
again:
@@ -5563,7 +5748,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
d++)
md_bitmap_startwrite(mddev->bitmap,
sh->sector,
- STRIPE_SECTORS,
+ RAID5_STRIPE_SECTORS(conf),
0);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
@@ -5628,12 +5813,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
return true;
}
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+ for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
int previous;
int seq;
@@ -5731,8 +5916,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = false;
}
- if (!sh->batch_head || sh == sh->batch_head)
- set_bit(STRIPE_HANDLE, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
@@ -5797,7 +5981,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sector_div(sector_nr, new_data_disks);
if (sector_nr) {
mddev->curr_resync_completed = sector_nr;
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
*skipped = 1;
retn = sector_nr;
goto finish;
@@ -5911,11 +6095,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
INIT_LIST_HEAD(&stripes);
- for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
+ for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
int j;
int skipped_disk = 0;
sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
@@ -5936,7 +6120,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
skipped_disk = 1;
continue;
}
- memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+ memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
set_bit(R5_Expanded, &sh->dev[j].flags);
set_bit(R5_UPTODATE, &sh->dev[j].flags);
}
@@ -5971,7 +6155,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
- first_sector += STRIPE_SECTORS;
+ first_sector += RAID5_STRIPE_SECTORS(conf);
}
/* Now that the sources are clearly marked, we can release
* the destination stripes
@@ -6018,7 +6202,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
return retn;
@@ -6077,11 +6261,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
- sync_blocks >= STRIPE_SECTORS) {
+ sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
- sync_blocks /= STRIPE_SECTORS;
+ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
*skipped = 1;
- return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+ /* keep things rounded to whole stripes */
+ return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
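
A short note on the do_div() change above: sync_blocks is a 64-bit sector count and the stripe size is now a runtime value, which is presumably why the plain division had to become do_div() (divide a u64 in place, returning the remainder) rather than `/=`. The arithmetic itself just rounds the skippable region down to whole stripes; a stand-alone sketch with illustrative numbers:

#include <stdio.h>

int main(void)
{
	/* Illustrative values only. */
	unsigned long long sync_blocks = 1003;	/* in-sync sectors reported by the bitmap */
	unsigned long stripe_sectors = 8;	/* RAID5_STRIPE_SECTORS for a 4K stripe   */

	/* Equivalent of do_div(sync_blocks, stripe_sectors): quotient kept in place. */
	sync_blocks /= stripe_sectors;				/* 125 whole stripes */

	/* Return a whole-stripe sector count, as raid5_sync_request() now does. */
	unsigned long long skip = sync_blocks * stripe_sectors;	/* 1000 sectors */

	printf("can skip %llu sectors (whole stripes only)\n", skip);
	return 0;
}
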
@@ -6114,7 +6299,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
raid5_release_stripe(sh);
- return STRIPE_SECTORS;
+ return RAID5_STRIPE_SECTORS(conf);
}
static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
@@ -6137,14 +6322,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
int handled = 0;
logical_sector = raid_bio->bi_iter.bi_sector &
- ~((sector_t)STRIPE_SECTORS-1);
+ ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
sector = raid5_compute_sector(conf, logical_sector,
0, &dd_idx, NULL);
last_sector = bio_end_sector(raid_bio);
for (; logical_sector < last_sector;
- logical_sector += STRIPE_SECTORS,
- sector += STRIPE_SECTORS,
+ logical_sector += RAID5_STRIPE_SECTORS(conf),
+ sector += RAID5_STRIPE_SECTORS(conf),
scnt++) {
if (scnt < offset)
@@ -6477,6 +6662,100 @@ raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
raid5_show_rmw_level,
raid5_store_rmw_level);
+static ssize_t
+raid5_show_stripe_size(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf;
+ int ret = 0;
+
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
+ if (conf)
+ ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+static ssize_t
+raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf;
+ unsigned long new;
+ int err;
+ int size;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+
+ /*
+ * The value must not be bigger than PAGE_SIZE. It has to be a
+ * multiple of DEFAULT_STRIPE_SIZE and a power of two.
+ */
+ if (new % DEFAULT_STRIPE_SIZE != 0 ||
+ new > PAGE_SIZE || new == 0 ||
+ new != roundup_pow_of_two(new))
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ conf = mddev->private;
+ if (!conf) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+
+ if (new == conf->stripe_size)
+ goto out_unlock;
+
+ pr_debug("md/raid: change stripe_size from %lu to %lu\n",
+ conf->stripe_size, new);
+
+ if (mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ mddev->reshape_position != MaxSector ||
+ mddev->sysfs_active) {
+ err = -EBUSY;
+ goto out_unlock;
+ }
+
+ mddev_suspend(mddev);
+ mutex_lock(&conf->cache_size_mutex);
+ size = conf->max_nr_stripes;
+
+ shrink_stripes(conf);
+
+ conf->stripe_size = new;
+ conf->stripe_shift = ilog2(new) - 9;
+ conf->stripe_sectors = new >> 9;
+ if (grow_stripes(conf, size)) {
+ pr_warn("md/raid:%s: couldn't allocate buffers\n",
+ mdname(mddev));
+ err = -ENOMEM;
+ }
+ mutex_unlock(&conf->cache_size_mutex);
+ mddev_resume(mddev);
+
+out_unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry
+raid5_stripe_size = __ATTR(stripe_size, 0644,
+ raid5_show_stripe_size,
+ raid5_store_stripe_size);
+#else
+static struct md_sysfs_entry
+raid5_stripe_size = __ATTR(stripe_size, 0444,
+ raid5_show_stripe_size,
+ NULL);
+#endif
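
A compact way to read the checks in raid5_store_stripe_size() above: the new value must be non-zero, a multiple of DEFAULT_STRIPE_SIZE (4096), no larger than PAGE_SIZE, and a power of two (roundup_pow_of_two(new) == new). A user-space sketch of the same predicate; PAGE_SIZE_ASSUMED stands in for a 64K-page configuration and is not a kernel symbol:

#include <stdbool.h>
#include <stdio.h>

#define DEFAULT_STRIPE_SIZE 4096UL
#define PAGE_SIZE_ASSUMED   65536UL	/* e.g. a 64K-page arm64/ppc64 config */

/* Non-zero, multiple of 4096, no larger than the page size, power of two. */
static bool stripe_size_valid(unsigned long new_size)
{
	return new_size != 0 &&
	       new_size % DEFAULT_STRIPE_SIZE == 0 &&
	       new_size <= PAGE_SIZE_ASSUMED &&
	       (new_size & (new_size - 1)) == 0;
}

int main(void)
{
	unsigned long candidates[] = { 0, 4096, 8192, 12288, 65536, 131072 };

	for (unsigned int i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++)
		printf("%6lu -> %s\n", candidates[i],
		       stripe_size_valid(candidates[i]) ? "ok" : "rejected");
	return 0;
}

On 4K-page systems the attribute is read-only (the #else branch), since 4096 is the only value that can pass all the checks; on larger-page systems it sits with the other raid5 entries under /sys/block/mdX/md/, and writing it tears down and regrows the stripe cache under mddev_suspend().
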
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
@@ -6556,14 +6835,14 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
if (!conf)
err = -ENODEV;
else if (new != conf->skip_copy) {
+ struct request_queue *q = mddev->queue;
+
mddev_suspend(mddev);
conf->skip_copy = new;
if (new)
- mddev->queue->backing_dev_info->capabilities |=
- BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
else
- mddev->queue->backing_dev_info->capabilities &=
- ~BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
mddev_resume(mddev);
}
mddev_unlock(mddev);
@@ -6603,7 +6882,6 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
static int alloc_thread_groups(struct r5conf *conf, int cnt,
int *group_cnt,
- int *worker_cnt_per_group,
struct r5worker_group **worker_groups);
static ssize_t
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
@@ -6612,7 +6890,7 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
unsigned int new;
int err;
struct r5worker_group *new_groups, *old_groups;
- int group_cnt, worker_cnt_per_group;
+ int group_cnt;
if (len >= PAGE_SIZE)
return -EINVAL;
@@ -6635,13 +6913,11 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (old_groups)
flush_workqueue(raid5_wq);
- err = alloc_thread_groups(conf, new,
- &group_cnt, &worker_cnt_per_group,
- &new_groups);
+ err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
if (!err) {
spin_lock_irq(&conf->device_lock);
conf->group_cnt = group_cnt;
- conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_cnt_per_group = new;
conf->worker_groups = new_groups;
spin_unlock_irq(&conf->device_lock);
@@ -6668,6 +6944,7 @@ static struct attribute *raid5_attrs[] = {
&raid5_group_thread_cnt.attr,
&raid5_skip_copy.attr,
&raid5_rmw_level.attr,
+ &raid5_stripe_size.attr,
&r5c_journal_mode.attr,
&ppl_write_hint.attr,
NULL,
@@ -6677,16 +6954,13 @@ static struct attribute_group raid5_attrs_group = {
.attrs = raid5_attrs,
};
-static int alloc_thread_groups(struct r5conf *conf, int cnt,
- int *group_cnt,
- int *worker_cnt_per_group,
+static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
struct r5worker_group **worker_groups)
{
int i, j, k;
ssize_t size;
struct r5worker *workers;
- *worker_cnt_per_group = cnt;
if (cnt == 0) {
*group_cnt = 0;
*worker_groups = NULL;
@@ -6770,7 +7044,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
conf->previous_raid_disks),
max(conf->chunk_sectors,
conf->prev_chunk_sectors)
- / STRIPE_SECTORS)) {
+ / RAID5_STRIPE_SECTORS(conf))) {
free_scratch_buffer(conf, percpu);
return -ENOMEM;
}
@@ -6886,7 +7160,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
struct disk_info *disk;
char pers_name[6];
int i;
- int group_cnt, worker_cnt_per_group;
+ int group_cnt;
struct r5worker_group *new_group;
int ret;
@@ -6922,6 +7196,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
+
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ conf->stripe_size = DEFAULT_STRIPE_SIZE;
+ conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
+ conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
+#endif
INIT_LIST_HEAD(&conf->free_list);
INIT_LIST_HEAD(&conf->pending_list);
conf->pending_data = kcalloc(PENDING_IO_MAX,
@@ -6932,15 +7212,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
for (i = 0; i < PENDING_IO_MAX; i++)
list_add(&conf->pending_data[i].sibling, &conf->free_list);
/* Don't enable multi-threading by default*/
- if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
- &new_group)) {
+ if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
conf->group_cnt = group_cnt;
- conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_cnt_per_group = 0;
conf->worker_groups = new_group;
} else
goto abort;
spin_lock_init(&conf->device_lock);
- seqcount_init(&conf->gen_lock);
+ seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
mutex_init(&conf->cache_size_mutex);
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
@@ -7074,8 +7353,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->min_nr_stripes = NR_STRIPES;
if (mddev->reshape_position != MaxSector) {
int stripes = max_t(int,
- ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
+ ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
conf->min_nr_stripes = max(NR_STRIPES, stripes);
if (conf->min_nr_stripes != NR_STRIPES)
pr_info("md/raid:%s: force stripe size %d for reshape\n",
@@ -7150,6 +7429,12 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
return 0;
}
+static void raid5_set_io_opt(struct r5conf *conf)
+{
+ blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
+ (conf->raid_disks - conf->max_degraded));
+}
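
raid5_set_io_opt() above folds the optimal-I/O hint into one place: chunk size in bytes times the number of data disks, i.e. one full stripe of data. A rough worked example with illustrative geometry values:

#include <stdio.h>

int main(void)
{
	/* Illustrative geometry: 512 KiB chunks, 4 member disks, RAID-5. */
	unsigned int chunk_sectors = 1024;	/* 512-byte sectors	*/
	int raid_disks = 4, max_degraded = 1;	/* one parity device	*/

	unsigned long io_opt = (unsigned long)(chunk_sectors << 9) *
			       (raid_disks - max_degraded);

	/* 524288 * 3 = 1572864 bytes: a full data stripe per optimal I/O. */
	printf("io_opt = %lu bytes (%lu KiB)\n", io_opt, io_opt / 1024);
	return 0;
}
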
+
static int raid5_run(struct mddev *mddev)
{
struct r5conf *conf;
@@ -7434,13 +7719,10 @@ static int raid5_run(struct mddev *mddev)
int data_disks = conf->previous_raid_disks - conf->max_degraded;
int stripe = data_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
- if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
- mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
- blk_queue_io_opt(mddev->queue, chunk_size *
- (conf->raid_disks - conf->max_degraded));
+ raid5_set_io_opt(conf);
mddev->queue->limits.raid_partial_stripes_expensive = 1;
/*
* We can only discard a whole stripe. It doesn't make sense to
@@ -7806,14 +8088,14 @@ static int check_stripe_cache(struct mddev *mddev)
* stripe_heads first.
*/
struct r5conf *conf = mddev->private;
- if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
+ if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
> conf->min_nr_stripes ||
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
> conf->min_nr_stripes) {
pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
mdname(mddev),
((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
- / STRIPE_SIZE)*4);
+ / RAID5_STRIPE_SIZE(conf))*4);
return 0;
}
return 1;
@@ -7949,8 +8231,8 @@ static int raid5_start_reshape(struct mddev *mddev)
else
rdev->recovery_offset = 0;
- if (sysfs_link_rdev(mddev, rdev))
- /* Failure here is OK */;
+ /* Failure here is OK */
+ sysfs_link_rdev(mddev, rdev);
}
} else if (rdev->raid_disk >= conf->previous_raid_disks
&& !test_bit(Faulty, &rdev->flags)) {
@@ -8024,16 +8306,8 @@ static void end_reshape(struct r5conf *conf)
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- /* read-ahead size must cover two whole stripes, which is
- * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
- */
- if (conf->mddev->queue) {
- int data_disks = conf->raid_disks - conf->max_degraded;
- int stripe = data_disks * ((conf->chunk_sectors << 9)
- / PAGE_SIZE);
- if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
- conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
- }
+ if (conf->mddev->queue)
+ raid5_set_io_opt(conf);
}
}
@@ -8145,7 +8419,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
while (chunksect && (mddev->array_sectors & (chunksect-1)))
chunksect >>= 1;
- if ((chunksect<<9) < STRIPE_SIZE)
+ if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
/* array size does not allow a suitable chunk size */
return ERR_PTR(-EINVAL);
@@ -8432,7 +8706,6 @@ static struct md_personality raid6_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
- .congested = raid5_congested,
.change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid5_personality =
@@ -8457,7 +8730,6 @@ static struct md_personality raid5_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
- .congested = raid5_congested,
.change_consistency_policy = raid5_change_consistency_policy,
};
@@ -8483,7 +8755,6 @@ static struct md_personality raid4_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
- .congested = raid5_congested,
.change_consistency_policy = raid5_change_consistency_policy,
};
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index f90e070..5c05acf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -195,6 +195,7 @@ enum reconstruct_states {
reconstruct_state_result,
};
+#define DEFAULT_STRIPE_SIZE 4096
struct stripe_head {
struct hlist_node hash;
struct list_head lru; /* inactive_list or handle_list */
@@ -246,6 +247,13 @@ struct stripe_head {
int target, target2;
enum sum_check_flags zero_sum_result;
} ops;
+
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ /* These pages will be used by bios in dev[i] */
+ struct page **pages;
+ int nr_pages; /* page array size */
+ int stripes_per_page;
+#endif
struct r5dev {
/* rreq and rvec are used for the replacement device when
* writing data to both devices.
@@ -253,6 +261,7 @@ struct stripe_head {
struct bio req, rreq;
struct bio_vec vec, rvec;
struct page *page, *orig_page;
+ unsigned int offset; /* offset within the page */
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
@@ -472,32 +481,19 @@ struct disk_info {
*/
#define NR_STRIPES 256
+
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
#define STRIPE_SIZE PAGE_SIZE
#define STRIPE_SHIFT (PAGE_SHIFT - 9)
#define STRIPE_SECTORS (STRIPE_SIZE>>9)
+#endif
+
#define IO_THRESHOLD 1
#define BYPASS_THRESHOLD 1
#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK (NR_HASH - 1)
#define MAX_STRIPE_BATCH 8
-/* bio's attached to a stripe+device for I/O are linked together in bi_sector
- * order without overlap. There may be several bio's per stripe+device, and
- * a bio could span several devices.
- * When walking this list for a particular stripe+device, we must never proceed
- * beyond a bio that extends past this device, as the next bio might no longer
- * be valid.
- * This function is used to determine the 'next' bio in the list, given the
- * sector of the current stripe+device
- */
-static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
-{
- if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
- return bio->bi_next;
- else
- return NULL;
-}
-
/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
* This is because we sometimes take all the spinlocks
* and creating that much locking depth can cause
@@ -574,6 +570,11 @@ struct r5conf {
int raid_disks;
int max_nr_stripes;
int min_nr_stripes;
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+ unsigned long stripe_size;
+ unsigned int stripe_shift;
+ unsigned long stripe_sectors;
+#endif
/* reshape_progress is the leading edge of a 'reshape'
* It has value MaxSector when no reshape is happening
@@ -589,7 +590,7 @@ struct r5conf {
int prev_chunk_sectors;
int prev_algo;
short generation; /* increments with every reshape */
- seqcount_t gen_lock; /* lock against generation changes */
+ seqcount_spinlock_t gen_lock; /* lock against generation changes */
unsigned long reshape_checkpoint; /* Time we last updated
* metadata */
long long min_offset_diff; /* minimum difference between
@@ -690,6 +691,32 @@ struct r5conf {
struct r5pending_data *next_pending_data;
};
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
+#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE
+#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT
+#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS
+#else
+#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size)
+#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift)
+#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors)
+#endif
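
On 4K-page kernels these macros collapse back to the old compile-time constants, while on larger-page kernels they read the per-array fields that setup_conf() and raid5_store_stripe_size() keep consistent (stripe_sectors = stripe_size >> 9, stripe_shift = ilog2(stripe_size) - 9). A small stand-alone sketch of those relationships, including the power-of-two masking the request paths rely on; ilog2_pow2() is a portable stand-in for the kernel's ilog2():

#include <assert.h>
#include <stdio.h>

/* Portable stand-in for ilog2() on a power-of-two value. */
static unsigned int ilog2_pow2(unsigned long v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned long stripe_size    = 4096;			/* bytes       */
	unsigned long stripe_sectors = stripe_size >> 9;	/* 8 sectors   */
	unsigned int  stripe_shift   = ilog2_pow2(stripe_size) - 9;	/* 3   */

	assert(stripe_sectors == (1UL << stripe_shift));

	/* Rounding a bio's sector down to a stripe boundary, as make_request does. */
	unsigned long long bi_sector = 1234567;
	unsigned long long logical = bi_sector & ~(unsigned long long)(stripe_sectors - 1);

	printf("sectors=%lu shift=%u rounded %llu -> %llu\n",
	       stripe_sectors, stripe_shift, bi_sector, logical);
	return 0;
}
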
+
+/* bio's attached to a stripe+device for I/O are linked together in bi_sector
+ * order without overlap. There may be several bio's per stripe+device, and
+ * a bio could span several devices.
+ * When walking this list for a particular stripe+device, we must never proceed
+ * beyond a bio that extends past this device, as the next bio might no longer
+ * be valid.
+ * This function is used to determine the 'next' bio in the list, given the
+ * sector of the current stripe+device
+ */
+static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector)
+{
+ if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf))
+ return bio->bi_next;
+ else
+ return NULL;
+}
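
r5_next_bio() above is the stop condition for the per-device bio walks in handle_failed_stripe() and handle_stripe_clean_event(). A minimal sketch of that walk pattern, with a simplified singly-linked bio type standing in for struct bio:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Simplified stand-in for struct bio: start sector, length, bi_next chain. */
struct mini_bio {
	sector_t	 bi_sector;
	unsigned int	 bi_sectors;
	struct mini_bio *bi_next;
};

static sector_t mini_bio_end_sector(const struct mini_bio *bio)
{
	return bio->bi_sector + bio->bi_sectors;
}

/* Same rule as r5_next_bio(): stop once a bio extends to or past this device. */
static struct mini_bio *mini_next_bio(struct mini_bio *bio, sector_t dev_sector,
				      sector_t stripe_sectors)
{
	if (mini_bio_end_sector(bio) < dev_sector + stripe_sectors)
		return bio->bi_next;
	return NULL;
}

int main(void)
{
	/* Three bios attached to one stripe+device, illustrative sectors only. */
	struct mini_bio c = { 16, 8, NULL };	/* starts in the next stripe unit */
	struct mini_bio b = {  8, 8, &c };
	struct mini_bio a = {  0, 8, &b };
	sector_t dev_sector = 0, stripe_sectors = 16;

	/* Walk as handle_stripe_clean_event() walks dev->written. */
	for (struct mini_bio *bio = &a;
	     bio && bio->bi_sector < dev_sector + stripe_sectors;
	     bio = mini_next_bio(bio, dev_sector, stripe_sectors))
		printf("complete bio at sector %llu\n", bio->bi_sector);
	return 0;
}
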
/*
* Our supported algorithms
@@ -752,6 +779,25 @@ static inline int algorithm_is_DDF(int layout)
return layout >= 8 && layout <= 10;
}
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+/*
+ * Return the offset within the corresponding page for r5dev.
+ */
+static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx)
+{
+ return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf);
+}
+
+/*
+ * Return corresponding page address for r5dev.
+ */
+static inline struct page *
+raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
+{
+ return sh->pages[disk_idx / sh->stripes_per_page];
+}
+#endif
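
These two helpers only exist when the stripe unit is smaller than the page size, in which case several r5dev slots share one backing page from sh->pages: disk_idx / stripes_per_page selects the page and disk_idx % stripes_per_page selects the slice within it. A small arithmetic sketch, independent of the kernel types and with illustrative geometry:

#include <stdio.h>

int main(void)
{
	/* Illustrative: 64 KiB pages, 4 KiB stripe units, 6 member disks. */
	unsigned long page_size = 65536, stripe_size = 4096;
	int stripes_per_page = page_size / stripe_size;		/* 16 */
	int disks = 6;

	for (int disk_idx = 0; disk_idx < disks; disk_idx++) {
		int page_nr = disk_idx / stripes_per_page;
		unsigned long offset = (disk_idx % stripes_per_page) * stripe_size;

		printf("disk %d -> pages[%d] + %lu\n", disk_idx, page_nr, offset);
	}
	return 0;
}
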
+
extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);