Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 6f77682..a1df0d9 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -377,7 +377,10 @@
if (!fifo_full(&ca->free_inc))
goto retry_invalidate;
- bch_prio_write(ca);
+ if (bch_prio_write(ca, false) < 0) {
+ ca->invalidate_needs_gc = 1;
+ wake_up_gc(ca->set);
+ }
}
}
out:
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 013e35a..36de6f7 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -264,7 +264,7 @@
#define BCACHE_DEV_UNLINK_DONE 2
#define BCACHE_DEV_WB_RUNNING 3
#define BCACHE_DEV_RATE_DW_RUNNING 4
- unsigned int nr_stripes;
+ int nr_stripes;
unsigned int stripe_size;
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
@@ -329,6 +329,9 @@
*/
atomic_t has_dirty;
+#define BCH_CACHE_READA_ALL 0
+#define BCH_CACHE_READA_META_ONLY 1
+ unsigned int cache_readahead_policy;
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
@@ -582,6 +585,7 @@
*/
wait_queue_head_t btree_cache_wait;
struct task_struct *btree_cache_alloc_lock;
+ spinlock_t btree_cannibalize_lock;
/*
* When we free a btree node, we increment the gen of the bucket the
@@ -977,11 +981,12 @@
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
-void bch_prio_write(struct cache *ca);
+int bch_prio_write(struct cache *ca, bool wait);
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
extern struct workqueue_struct *bcache_wq;
extern struct workqueue_struct *bch_journal_wq;
+extern struct workqueue_struct *bch_flush_wq;
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
@@ -1023,5 +1028,7 @@
void bch_debug_init(void);
void bch_request_exit(void);
int bch_request_init(void);
+void bch_btree_exit(void);
+int bch_btree_init(void);
#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 0876879..fda68c0 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -321,7 +321,7 @@
b->page_order = page_order;
- t->data = (void *) __get_free_pages(gfp, b->page_order);
+ t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order);
if (!t->data)
goto err;
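
These bset buffers are high-order allocations that bcache later walks page by page, and the same __GFP_COMP change appears below for the btree verify buffer, the journal buffers, and alloc_bucket_pages(). With __GFP_COMP the pages form one compound page, so the tail pages stay linked to a valid head page — likely what matters here when the buffer is iterated at page granularity, for example while being attached to a bio. A minimal sketch of the allocation pattern, outside this patch and with an arbitrary order:

    #include <linux/gfp.h>

    static void *alloc_meta_buf(void)
    {
        /* order-2: one compound page covering 4 contiguous pages */
        return (void *)__get_free_pages(GFP_KERNEL | __GFP_COMP, 2);
    }

    static void free_meta_buf(void *buf)
    {
        free_pages((unsigned long)buf, 2);   /* order must match */
    }
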
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index c71365e..a50dcfd 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -397,7 +397,8 @@
/* Bkey utility code */
-#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
+#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, \
+ (unsigned int)(i)->keys)
static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx)
{
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ba434d9..5a33910 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -99,6 +99,8 @@
#define PTR_HASH(c, k) \
(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
+static struct workqueue_struct *btree_io_wq;
+
#define insert_lock(s, b) ((b)->level <= (s)->lock)
/*
@@ -366,7 +368,7 @@
btree_complete_write(b, w);
if (btree_node_dirty(b))
- schedule_delayed_work(&b->work, 30 * HZ);
+ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
closure_return_with_destructor(cl, btree_node_write_unlock);
}
@@ -539,7 +541,7 @@
BUG_ON(!i->keys);
if (!btree_node_dirty(b))
- schedule_delayed_work(&b->work, 30 * HZ);
+ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
set_btree_node_dirty(b);
@@ -723,6 +725,8 @@
* IO can always make forward progress:
*/
nr /= c->btree_pages;
+ if (nr == 0)
+ nr = 1;
nr = min_t(unsigned long, nr, mca_can_free(c));
i = 0;
@@ -838,7 +842,7 @@
mutex_init(&c->verify_lock);
c->verify_ondisk = (void *)
- __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
+ __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(bucket_pages(c)));
c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
@@ -884,15 +888,17 @@
static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
- struct task_struct *old;
-
- old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
- if (old && old != current) {
+ spin_lock(&c->btree_cannibalize_lock);
+ if (likely(c->btree_cache_alloc_lock == NULL)) {
+ c->btree_cache_alloc_lock = current;
+ } else if (c->btree_cache_alloc_lock != current) {
if (op)
prepare_to_wait(&c->btree_cache_wait, &op->wait,
TASK_UNINTERRUPTIBLE);
+ spin_unlock(&c->btree_cannibalize_lock);
return -EINTR;
}
+ spin_unlock(&c->btree_cannibalize_lock);
return 0;
}
@@ -927,10 +933,12 @@
*/
static void bch_cannibalize_unlock(struct cache_set *c)
{
+ spin_lock(&c->btree_cannibalize_lock);
if (c->btree_cache_alloc_lock == current) {
c->btree_cache_alloc_lock = NULL;
wake_up(&c->btree_cache_wait);
}
+ spin_unlock(&c->btree_cannibalize_lock);
}
static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
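
The cmpxchg-based version serialized only the acquire side; bch_cannibalize_unlock()'s test-and-clear plus wake_up() ran unlocked, so a release racing with a waiter registering on btree_cache_wait could end in a lost wake-up. Taking btree_cannibalize_lock around both sides closes that window. The owner-slot pattern in isolation, with hypothetical names:

    #include <linux/spinlock.h>
    #include <linux/sched.h>

    struct owner_slot {                      /* hypothetical reduction */
        spinlock_t guard;
        struct task_struct *owner;
    };

    static int owner_trylock(struct owner_slot *s)
    {
        int ret = 0;

        spin_lock(&s->guard);
        if (!s->owner)
            s->owner = current;              /* we own the slot now */
        else if (s->owner != current)
            ret = -EINTR;                    /* caller queues itself, retries */
        spin_unlock(&s->guard);
        return ret;
    }

    static void owner_unlock(struct owner_slot *s)
    {
        spin_lock(&s->guard);                /* same lock as the acquire side */
        if (s->owner == current)
            s->owner = NULL;                 /* now safe to wake waiters */
        spin_unlock(&s->guard);
    }
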
@@ -1440,7 +1448,7 @@
if (__set_blocks(n1, n1->keys + n2->keys,
block_bytes(b->c)) >
btree_blocks(new_nodes[i]))
- goto out_nocoalesce;
+ goto out_unlock_nocoalesce;
keys = n2->keys;
/* Take the key of the node we're getting rid of */
@@ -1469,7 +1477,7 @@
if (__bch_keylist_realloc(&keylist,
bkey_u64s(&new_nodes[i]->key)))
- goto out_nocoalesce;
+ goto out_unlock_nocoalesce;
bch_btree_node_write(new_nodes[i], &cl);
bch_keylist_add(&keylist, &new_nodes[i]->key);
@@ -1515,6 +1523,10 @@
/* Invalidated our iterator */
return -EINTR;
+out_unlock_nocoalesce:
+ for (i = 0; i < nodes; i++)
+ mutex_unlock(&new_nodes[i]->write_lock);
+
out_nocoalesce:
closure_sync(&cl);
@@ -2649,3 +2661,18 @@
spin_lock_init(&buf->lock);
array_allocator_init(&buf->freelist);
}
+
+void bch_btree_exit(void)
+{
+ if (btree_io_wq)
+ destroy_workqueue(btree_io_wq);
+}
+
+int __init bch_btree_init(void)
+{
+ btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
+ if (!btree_io_wq)
+ return -ENOMEM;
+
+ return 0;
+}
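
Delayed btree writes move off system_wq onto a dedicated WQ_MEM_RECLAIM queue: a reclaim-tagged workqueue keeps a rescuer thread, so these writes can still make forward progress under memory pressure — presumably important because flushing dirty btree nodes is itself part of how bcache frees memory. The init/exit pairing follows the usual module pattern, sketched here for a hypothetical driver:

    #include <linux/workqueue.h>

    static struct workqueue_struct *my_io_wq;   /* hypothetical */

    static int __init my_driver_init(void)
    {
        /* WQ_MEM_RECLAIM guarantees a rescuer thread exists */
        my_io_wq = alloc_workqueue("my_io", WQ_MEM_RECLAIM, 0);
        if (!my_io_wq)
            return -ENOMEM;
        return 0;
    }

    static void __exit my_driver_exit(void)
    {
        if (my_io_wq)
            destroy_workqueue(my_io_wq);
    }
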
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index be2a2a2..b4fd923 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -417,10 +417,15 @@
/* Journalling */
+#define nr_to_fifo_front(p, front_p, mask) (((p) - (front_p)) & (mask))
+
static void btree_flush_write(struct cache_set *c)
{
struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
- unsigned int i, n;
+ unsigned int i, nr;
+ int ref_nr;
+ atomic_t *fifo_front_p, *now_fifo_front_p;
+ size_t mask;
if (c->journal.btree_flushing)
return;
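
The macro measures how far pointer p trails the FIFO's front pointer in a power-of-two ring: the subtraction can go negative after wraparound, and masking folds it back into [0, size). A worked example, assuming a ring of 8 journal pins (mask = 7):

    #include <linux/atomic.h>

    static size_t fifo_distance_example(void)
    {
        atomic_t ring[8];
        atomic_t *front_p = &ring[6];   /* oldest entry, after wrapping */
        atomic_t *p = &ring[1];         /* pin of some btree node       */

        /* (1 - 6) & 7 == 3: this node's pin sits three slots past the
         * front; btree_flush_write() below only selects nodes where
         * the distance is 0, i.e. exactly the oldest journal entry. */
        return (p - front_p) & 7;
    }
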
@@ -433,12 +438,50 @@
c->journal.btree_flushing = true;
spin_unlock(&c->journal.flush_write_lock);
+ /* get the oldest journal entry and check its refcount */
+ spin_lock(&c->journal.lock);
+ fifo_front_p = &fifo_front(&c->journal.pin);
+ ref_nr = atomic_read(fifo_front_p);
+ if (ref_nr <= 0) {
+ /*
+ * do nothing if no btree node references
+ * the oldest journal entry
+ */
+ spin_unlock(&c->journal.lock);
+ goto out;
+ }
+ spin_unlock(&c->journal.lock);
+
+ mask = c->journal.pin.mask;
+ nr = 0;
atomic_long_inc(&c->flush_write);
memset(btree_nodes, 0, sizeof(btree_nodes));
- n = 0;
mutex_lock(&c->bucket_lock);
list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+ /*
+ * It is safe to get now_fifo_front_p without holding
+ * c->journal.lock here, because we don't need the exact
+ * value, only to check whether the front pointer of
+ * c->journal.pin has changed.
+ */
+ now_fifo_front_p = &fifo_front(&c->journal.pin);
+ /*
+ * If the oldest journal entry is reclaimed and the front
+ * pointer of c->journal.pin changes, there is no need to
+ * scan c->btree_cache any further; just quit the loop and
+ * flush out what we have already.
+ */
+ if (now_fifo_front_p != fifo_front_p)
+ break;
+ /*
+ * quit this loop if all matching btree nodes are
+ * scanned and recorded in btree_nodes[] already.
+ */
+ ref_nr = atomic_read(fifo_front_p);
+ if (nr >= ref_nr)
+ break;
+
if (btree_node_journal_flush(b))
pr_err("BUG: flush_write bit should not be set here!");
@@ -454,17 +497,44 @@
continue;
}
+ /*
+ * Only select the btree node which exactly references
+ * the oldest journal entry.
+ *
+ * If the journal entry pointed to by fifo_front_p is
+ * reclaimed in parallel, don't worry:
+ * - the list_for_each_xxx loop will quit when it checks
+ * the next now_fifo_front_p.
+ * - If there are matched nodes recorded in btree_nodes[],
+ * they are clean now (this is why and how the oldest
+ * journal entry can be reclaimed). These selected nodes
+ * will be ignored and skipped in the following for-loop.
+ */
+ if (nr_to_fifo_front(btree_current_write(b)->journal,
+ fifo_front_p,
+ mask) != 0) {
+ mutex_unlock(&b->write_lock);
+ continue;
+ }
+
set_btree_node_journal_flush(b);
mutex_unlock(&b->write_lock);
- btree_nodes[n++] = b;
- if (n == BTREE_FLUSH_NR)
+ btree_nodes[nr++] = b;
+ /*
+ * To avoid holding c->bucket_lock for too long, scan at
+ * most BTREE_FLUSH_NR matching btree nodes. If more btree
+ * nodes reference the oldest journal entry, try to flush
+ * them next time when btree_flush_write() is called.
+ */
+ if (nr == BTREE_FLUSH_NR)
break;
}
mutex_unlock(&c->bucket_lock);
- for (i = 0; i < n; i++) {
+ for (i = 0; i < nr; i++) {
b = btree_nodes[i];
if (!b) {
pr_err("BUG: btree_nodes[%d] is NULL", i);
@@ -497,6 +567,7 @@
mutex_unlock(&b->write_lock);
}
+out:
spin_lock(&c->journal.flush_write_lock);
c->journal.btree_flushing = false;
spin_unlock(&c->journal.flush_write_lock);
@@ -887,8 +958,8 @@
journal_try_write(c);
} else if (!w->dirty) {
w->dirty = true;
- schedule_delayed_work(&c->journal.work,
- msecs_to_jiffies(c->journal_delay_ms));
+ queue_delayed_work(bch_flush_wq, &c->journal.work,
+ msecs_to_jiffies(c->journal_delay_ms));
spin_unlock(&c->journal.lock);
} else {
spin_unlock(&c->journal.lock);
@@ -931,8 +1002,8 @@
j->w[1].c = c;
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
- !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
+ !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) ||
+ !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)))
return -ENOMEM;
return 0;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 41adcd1..4045ae7 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -391,13 +391,20 @@
goto skip;
/*
- * Flag for bypass if the IO is for read-ahead or background,
- * unless the read-ahead request is for metadata
+ * If the bio is for read-ahead or background IO, whether to
+ * bypass it depends on the following rules:
+ * - If the IO is for metadata, always cache it, no bypass
+ * - If the IO is not metadata, check dc->cache_readahead_policy:
+ * BCH_CACHE_READA_ALL: cache it, no bypass
+ * BCH_CACHE_READA_META_ONLY: don't cache it, bypass
+ * That is, read-ahead requests for metadata always get cached
* (eg, for gfs2 or xfs).
*/
- if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
- !(bio->bi_opf & (REQ_META|REQ_PRIO)))
- goto skip;
+ if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) {
+ if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
+ (dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
+ goto skip;
+ }
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
bio_sectors(bio) & (c->sb.block_size - 1)) {
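
The rewritten branch is effectively a three-row decision table: metadata read-ahead is always cached, plain read-ahead is cached only under the "all" policy, and everything else falls through to the remaining bypass checks. The same logic as a standalone predicate, using the BCH_CACHE_READA_* constants added in bcache.h above; readahead_should_bypass() is a hypothetical name for illustration, not part of the patch:

    #include <linux/types.h>
    #include <linux/blk_types.h>

    static bool readahead_should_bypass(unsigned int opf,
                                        unsigned int policy)
    {
        if (!(opf & (REQ_RAHEAD | REQ_BACKGROUND)))
            return false;   /* not read-ahead: decided by later checks */
        if (opf & (REQ_META | REQ_PRIO))
            return false;   /* metadata read-ahead is always cached */
        /* plain data read-ahead: cached only under BCH_CACHE_READA_ALL */
        return policy != BCH_CACHE_READA_ALL;
    }
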
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index ba1c937..503aafe 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -109,9 +109,13 @@
void bch_cache_accounting_clear(struct cache_accounting *acc)
{
- memset(&acc->total.cache_hits,
- 0,
- sizeof(struct cache_stats));
+ acc->total.cache_hits = 0;
+ acc->total.cache_misses = 0;
+ acc->total.cache_bypass_hits = 0;
+ acc->total.cache_bypass_misses = 0;
+ acc->total.cache_readaheads = 0;
+ acc->total.cache_miss_collisions = 0;
+ acc->total.sectors_bypassed = 0;
}
void bch_cache_accounting_destroy(struct cache_accounting *acc)
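
The memset looked like it cleared acc->total, but it started at the cache_hits member while still writing sizeof(struct cache_stats) bytes, overrunning the struct by the offset of cache_hits (the leading kobject) and scribbling into whatever follows in struct cache_accounting. Assigning the seven counters individually clears exactly what was intended. The bug shape reduced to hypothetical types:

    #include <linux/kobject.h>
    #include <linux/string.h>

    struct stats { struct kobject kobj; unsigned long hits, misses; };
    struct acct  { struct stats total, five_minute; };

    static void clear_total(struct acct *a)
    {
        /* pre-patch shape: sizeof(struct stats) bytes written from
         * &a->total.hits overrun a->total by offsetof(struct stats,
         * hits) and corrupt a->five_minute.kobj:
         *
         *     memset(&a->total.hits, 0, sizeof(struct stats));
         *
         * post-patch shape: clear exactly the counters.
         */
        a->total.hits = 0;
        a->total.misses = 0;
    }
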
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 20ed838..efdf6ce 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -48,6 +48,7 @@
static DEFINE_IDA(bcache_device_idx);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
+struct workqueue_struct *bch_flush_wq;
struct workqueue_struct *bch_journal_wq;
@@ -529,12 +530,29 @@
closure_sync(cl);
}
-void bch_prio_write(struct cache *ca)
+int bch_prio_write(struct cache *ca, bool wait)
{
int i;
struct bucket *b;
struct closure cl;
+ pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
+ fifo_used(&ca->free[RESERVE_PRIO]),
+ fifo_used(&ca->free[RESERVE_NONE]),
+ fifo_used(&ca->free_inc));
+
+ /*
+ * Pre-check if there are enough free buckets. In the non-blocking
+ * scenario it's better to fail early than to start allocating
+ * buckets and clean up later in case of failure.
+ */
+ if (!wait) {
+ size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
+ fifo_used(&ca->free[RESERVE_NONE]);
+ if (prio_buckets(ca) > avail)
+ return -ENOMEM;
+ }
+
closure_init_stack(&cl);
lockdep_assert_held(&ca->set->bucket_lock);
@@ -544,9 +562,6 @@
atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
&ca->meta_sectors_written);
- //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
- // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
-
for (i = prio_buckets(ca) - 1; i >= 0; --i) {
long bucket;
struct prio_set *p = ca->disk_buckets;
@@ -564,7 +579,7 @@
p->magic = pset_magic(&ca->sb);
p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
- bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
+ bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
BUG_ON(bucket == -1);
mutex_unlock(&ca->set->bucket_lock);
@@ -593,6 +608,7 @@
ca->prio_last_buckets[i] = ca->prio_buckets[i];
}
+ return 0;
}
static void prio_read(struct cache *ca, uint64_t bucket)
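
With the wait flag, bch_prio_write() now serves two different callers: cache-set registration (further below) may sleep until buckets are available, while the allocator thread (alloc.c above) must not block on an allocation that only it can make progress on, so it fails fast and defers to GC. The two contracts side by side, as fragments taken from this patch's call sites:

    /* registration path: blocking is fine, failure is a bug */
    bch_prio_write(ca, true);

    /* allocator thread: fail fast, let GC reclaim buckets instead */
    if (bch_prio_write(ca, false) < 0) {
        ca->invalidate_needs_gc = 1;
        wake_up_gc(ca->set);
    }
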
@@ -761,20 +777,31 @@
static void bcache_device_free(struct bcache_device *d)
{
+ struct gendisk *disk = d->disk;
+
lockdep_assert_held(&bch_register_lock);
- pr_info("%s stopped", d->disk->disk_name);
+ if (disk)
+ pr_info("%s stopped", disk->disk_name);
+ else
+ pr_err("bcache device (NULL gendisk) stopped");
if (d->c)
bcache_device_detach(d);
- if (d->disk && d->disk->flags & GENHD_FL_UP)
- del_gendisk(d->disk);
- if (d->disk && d->disk->queue)
- blk_cleanup_queue(d->disk->queue);
- if (d->disk) {
+
+ if (disk) {
+ bool disk_added = (disk->flags & GENHD_FL_UP) != 0;
+
+ if (disk_added)
+ del_gendisk(disk);
+
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+
ida_simple_remove(&bcache_device_idx,
- first_minor_to_idx(d->disk->first_minor));
- put_disk(d->disk);
+ first_minor_to_idx(disk->first_minor));
+ if (disk_added)
+ put_disk(disk);
}
bioset_exit(&d->bio_split);
@@ -790,19 +817,19 @@
struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
- size_t n;
+ uint64_t n;
int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
- d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
-
- if (!d->nr_stripes || d->nr_stripes > max_stripes) {
- pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
- (unsigned int)d->nr_stripes);
+ n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+ if (!n || n > max_stripes) {
+ pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
+ n);
return -ENOMEM;
}
+ d->nr_stripes = n;
n = d->nr_stripes * sizeof(atomic_t);
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
@@ -812,20 +839,20 @@
n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
if (!d->full_dirty_stripes)
- return -ENOMEM;
+ goto out_free_stripe_sectors_dirty;
idx = ida_simple_get(&bcache_device_idx, 0,
BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
if (idx < 0)
- return idx;
+ goto out_free_full_dirty_stripes;
if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
- goto err;
+ goto out_ida_remove;
d->disk = alloc_disk(BCACHE_MINORS);
if (!d->disk)
- goto err;
+ goto out_bioset_exit;
set_capacity(d->disk, sectors);
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
@@ -860,8 +887,14 @@
return 0;
-err:
+out_bioset_exit:
+ bioset_exit(&d->bio_split);
+out_ida_remove:
ida_simple_remove(&bcache_device_idx, idx);
+out_free_full_dirty_stripes:
+ kvfree(d->full_dirty_stripes);
+out_free_stripe_sectors_dirty:
+ kvfree(d->stripe_sectors_dirty);
return -ENOMEM;
}
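
Before this change, a failure in ida_simple_get() or later leaked stripe_sectors_dirty and full_dirty_stripes, and the single err: label could not tell how far initialization had gotten. The rework adopts the usual kernel unwind ladder: one label per acquired resource, released in reverse order of acquisition, each label named after the first thing it frees. The skeleton of that pattern with hypothetical resources:

    #include <linux/slab.h>

    static int ladder_example(void **pa, void **pb)
    {
        void *a, *b;

        a = kmalloc(64, GFP_KERNEL);
        if (!a)
            return -ENOMEM;

        b = kmalloc(64, GFP_KERNEL);
        if (!b)
            goto out_free_a;

        *pa = a;                /* success: hand ownership to caller */
        *pb = b;
        return 0;

    out_free_a:
        kfree(a);
        return -ENOMEM;
    }
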
@@ -1251,6 +1284,9 @@
mutex_unlock(&bch_register_lock);
+ if (dc->sb_bio.bi_inline_vecs[0].bv_page)
+ put_page(bio_first_page_all(&dc->sb_bio));
+
if (!IS_ERR_OR_NULL(dc->bdev))
blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1725,7 +1761,7 @@
}
#define alloc_bucket_pages(gfp, c) \
- ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
+ ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c))))
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
@@ -1769,6 +1805,7 @@
sema_init(&c->sb_write_mutex, 1);
mutex_init(&c->bucket_lock);
init_waitqueue_head(&c->btree_cache_wait);
+ spin_lock_init(&c->btree_cannibalize_lock);
init_waitqueue_head(&c->bucket_wait);
init_waitqueue_head(&c->gc_wait);
sema_init(&c->uuid_write_mutex, 1);
@@ -1954,7 +1991,7 @@
mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i)
- bch_prio_write(ca);
+ bch_prio_write(ca, true);
mutex_unlock(&c->bucket_lock);
err = "cannot allocate new UUID bucket";
@@ -2062,7 +2099,14 @@
sysfs_create_link(&c->kobj, &ca->kobj, buf))
goto err;
- if (ca->sb.seq > c->sb.seq) {
+ /*
+ * A special case is when both ca->sb.seq and c->sb.seq are 0,
+ * which happens on a newly created cache device whose super
+ * block has never been flushed. In this case c->sb.version
+ * and other members should be updated too, otherwise we will
+ * end up with a wrong super block version in the cache set.
+ */
+ if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) {
c->sb.version = ca->sb.version;
memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
c->sb.flags = ca->sb.flags;
@@ -2346,29 +2390,35 @@
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
- ssize_t ret = -EINVAL;
- const char *err = "cannot allocate memory";
+ const char *err;
char *path = NULL;
- struct cache_sb *sb = NULL;
+ struct cache_sb *sb;
struct block_device *bdev = NULL;
- struct page *sb_page = NULL;
+ struct page *sb_page;
+ ssize_t ret;
+ ret = -EBUSY;
+ err = "failed to reference bcache module";
if (!try_module_get(THIS_MODULE))
- return -EBUSY;
+ goto out;
/* For latest state of bcache_is_reboot */
smp_mb();
+ err = "bcache is in reboot";
if (bcache_is_reboot)
- return -EBUSY;
+ goto out_module_put;
+ ret = -ENOMEM;
+ err = "cannot allocate memory";
path = kstrndup(buffer, size, GFP_KERNEL);
if (!path)
- goto err;
+ goto out_module_put;
sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
if (!sb)
- goto err;
+ goto out_free_path;
+ ret = -EINVAL;
err = "failed to open device";
bdev = blkdev_get_by_path(strim(path),
FMODE_READ|FMODE_WRITE|FMODE_EXCL,
@@ -2385,57 +2435,69 @@
if (!IS_ERR(bdev))
bdput(bdev);
if (attr == &ksysfs_register_quiet)
- goto quiet_out;
+ goto done;
}
- goto err;
+ goto out_free_sb;
}
err = "failed to set blocksize";
if (set_blocksize(bdev, 4096))
- goto err_close;
+ goto out_blkdev_put;
err = read_super(sb, bdev, &sb_page);
if (err)
- goto err_close;
+ goto out_blkdev_put;
err = "failed to register device";
if (SB_IS_BDEV(sb)) {
struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
if (!dc)
- goto err_close;
+ goto out_put_sb_page;
mutex_lock(&bch_register_lock);
ret = register_bdev(sb, sb_page, bdev, dc);
mutex_unlock(&bch_register_lock);
/* blkdev_put() will be called in cached_dev_free() */
- if (ret < 0)
- goto err;
+ if (ret < 0) {
+ bdev = NULL;
+ goto out_put_sb_page;
+ }
} else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
- goto err_close;
+ goto out_put_sb_page;
/* blkdev_put() will be called in bch_cache_release() */
- if (register_cache(sb, sb_page, bdev, ca) != 0)
- goto err;
+ if (register_cache(sb, sb_page, bdev, ca) != 0) {
+ bdev = NULL;
+ goto out_put_sb_page;
+ }
}
-quiet_out:
- ret = size;
-out:
- if (sb_page)
- put_page(sb_page);
+
+ put_page(sb_page);
+done:
kfree(sb);
kfree(path);
module_put(THIS_MODULE);
- return ret;
+ return size;
-err_close:
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-err:
- pr_info("error %s: %s", path, err);
- goto out;
+out_put_sb_page:
+ put_page(sb_page);
+out_blkdev_put:
+ if (bdev)
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+out_free_sb:
+ kfree(sb);
+out_free_path:
+ kfree(path);
+ path = NULL;
+out_module_put:
+ module_put(THIS_MODULE);
+out:
+ pr_info("error %s: %s", path ? path : "", err);
+ return ret;
}
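
Besides giving every failure a distinct err string and errno, the rework keeps the bdev reference singly owned: once register_bdev() or register_cache() has been entered, its release path is responsible for blkdev_put(), so the caller nulls its local bdev to turn the shared out_blkdev_put label into a no-op. The hand-off in miniature (register_backing_dev() is a hypothetical stand-in):

    ret = register_backing_dev(sb, sb_page, bdev, dc);
    if (ret < 0) {
        bdev = NULL;            /* callee's teardown calls blkdev_put() */
        goto out_put_sb_page;   /* so this unwind must not repeat it    */
    }
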
@@ -2597,6 +2659,9 @@
destroy_workqueue(bcache_wq);
if (bch_journal_wq)
destroy_workqueue(bch_journal_wq);
+ if (bch_flush_wq)
+ destroy_workqueue(bch_flush_wq);
+ bch_btree_exit();
if (bcache_major)
unregister_blkdev(bcache_major, "bcache");
@@ -2652,10 +2717,26 @@
return bcache_major;
}
+ if (bch_btree_init())
+ goto err;
+
bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
if (!bcache_wq)
goto err;
+ /*
+ * Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
+ *
+ * 1. It used `system_wq` before, which also does no memory reclaim.
+ * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
+ * reduced throughput can be observed.
+ *
+ * We still want to use our own queue so as not to congest the `system_wq`.
+ */
+ bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
+ if (!bch_flush_wq)
+ goto err;
+
bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
if (!bch_journal_wq)
goto err;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 627dcea..7f0fb4b 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -27,6 +27,12 @@
NULL
};
+static const char * const bch_reada_cache_policies[] = {
+ "all",
+ "meta-only",
+ NULL
+};
+
/* Default is 0 ("auto") */
static const char * const bch_stop_on_failure_modes[] = {
"auto",
@@ -100,6 +106,7 @@
rw_attribute(sequential_cutoff);
rw_attribute(data_csum);
rw_attribute(cache_mode);
+rw_attribute(readahead_cache_policy);
rw_attribute(stop_when_cache_set_failed);
rw_attribute(writeback_metadata);
rw_attribute(writeback_running);
@@ -167,6 +174,11 @@
bch_cache_modes,
BDEV_CACHE_MODE(&dc->sb));
+ if (attr == &sysfs_readahead_cache_policy)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_reada_cache_policies,
+ dc->cache_readahead_policy);
+
if (attr == &sysfs_stop_when_cache_set_failed)
return bch_snprint_string_list(buf, PAGE_SIZE,
bch_stop_on_failure_modes,
@@ -352,6 +364,15 @@
}
}
+ if (attr == &sysfs_readahead_cache_policy) {
+ v = __sysfs_match_string(bch_reada_cache_policies, -1, buf);
+ if (v < 0)
+ return v;
+
+ if ((unsigned int) v != dc->cache_readahead_policy)
+ dc->cache_readahead_policy = v;
+ }
+
if (attr == &sysfs_stop_when_cache_set_failed) {
v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
if (v < 0)
@@ -466,6 +487,7 @@
&sysfs_data_csum,
#endif
&sysfs_cache_mode,
+ &sysfs_readahead_cache_policy,
&sysfs_stop_when_cache_set_failed,
&sysfs_writeback_metadata,
&sysfs_writeback_running,
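
The store path relies on __sysfs_match_string(), which (with n == -1) scans a NULL-terminated array of names and returns the matching index or -EINVAL, so the sysfs file accepts exactly the strings "all" and "meta-only". A minimal sketch of the helper's use:

    #include <linux/string.h>

    static const char * const policies[] = { "all", "meta-only", NULL };

    /* returns 0 for "all", 1 for "meta-only", -EINVAL otherwise */
    static int parse_policy(const char *buf)
    {
        return __sysfs_match_string(policies, -1, buf);
    }

From userspace this would typically look like `echo meta-only > /sys/block/bcache0/bcache/readahead_cache_policy` (path assumed from the usual bcache backing-device sysfs layout).
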
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d60268f..0b02210 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -519,15 +519,19 @@
uint64_t offset, int nr_sectors)
{
struct bcache_device *d = c->devices[inode];
- unsigned int stripe_offset, stripe, sectors_dirty;
+ unsigned int stripe_offset, sectors_dirty;
+ int stripe;
if (!d)
return;
+ stripe = offset_to_stripe(d, offset);
+ if (stripe < 0)
+ return;
+
if (UUID_FLASH_ONLY(&c->uuids[inode]))
atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
- stripe = offset_to_stripe(d, offset);
stripe_offset = offset & (d->stripe_size - 1);
while (nr_sectors) {
@@ -567,12 +571,12 @@
static void refill_full_stripes(struct cached_dev *dc)
{
struct keybuf *buf = &dc->writeback_keys;
- unsigned int start_stripe, stripe, next_stripe;
+ unsigned int start_stripe, next_stripe;
+ int stripe;
bool wrapped = false;
stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
-
- if (stripe >= dc->disk.nr_stripes)
+ if (stripe < 0)
stripe = 0;
start_stripe = stripe;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 4e4c681..c4ff760 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -33,10 +33,22 @@
return ret;
}
-static inline unsigned int offset_to_stripe(struct bcache_device *d,
+static inline int offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
do_div(offset, d->stripe_size);
+
+ /* d->nr_stripes is in range [1, INT_MAX] */
+ if (unlikely(offset >= d->nr_stripes)) {
+ pr_err("Invalid stripe %llu (>= nr_stripes %d).\n",
+ offset, d->nr_stripes);
+ return -EINVAL;
+ }
+
+ /*
+ * Here offset is definitely smaller than INT_MAX, so
+ * returning it as an int will never overflow.
+ */
return offset;
}
@@ -44,7 +56,10 @@
uint64_t offset,
unsigned int nr_sectors)
{
- unsigned int stripe = offset_to_stripe(&dc->disk, offset);
+ int stripe = offset_to_stripe(&dc->disk, offset);
+
+ if (stripe < 0)
+ return false;
while (1) {
if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
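
Two details make the new offset_to_stripe() safe: do_div() divides its 64-bit first argument in place and returns the remainder, so after the call offset holds the stripe index; and since bcache_device_init() now clamps nr_stripes to [1, INT_MAX], any index that passes the range check fits in an int, leaving negative returns free to signal errors. A worked example with assumed values:

    #include <linux/types.h>
    #include <asm/div64.h>

    static int stripe_index_example(void)
    {
        uint64_t offset = (uint64_t)5 << 31;   /* sector offset        */
        uint32_t stripe_size = 1U << 31;       /* default stripe size  */
        uint32_t rem;

        rem = do_div(offset, stripe_size);     /* offset = 5, rem = 0  */

        /* index 5 is < nr_stripes (checked by the real function) and
         * certainly < INT_MAX, so returning it as an int is safe. */
        return (int)offset;
    }
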