Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index f6e0a8b..6dfa653 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BCACHE
tristate "Block device as cache"
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 7a28232..6f77682 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -327,10 +327,11 @@
* possibly issue discards to them, then we add the bucket to
* the free list:
*/
- while (!fifo_empty(&ca->free_inc)) {
+ while (1) {
long bucket;
- fifo_pop(&ca->free_inc, bucket);
+ if (!fifo_pop(&ca->free_inc, bucket))
+ break;
if (ca->discard) {
mutex_unlock(&ca->set->bucket_lock);
@@ -392,6 +393,11 @@
struct bucket *b;
long r;
+
+ /* No allocation if CACHE_SET_IO_DISABLE bit is set */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)))
+ return -1;
+
/* fastpath */
if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
fifo_pop(&ca->free[reserve], r))
@@ -483,8 +489,12 @@
{
int i;
+ /* No allocation if CACHE_SET_IO_DISABLE bit is set */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+ return -1;
+
lockdep_assert_held(&c->bucket_lock);
- BUG_ON(!n || n > c->caches_loaded || n > 8);
+ BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET);
bkey_init(k);
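
The alloc.c change above replaces a separate "is the fifo empty?" test plus an
unconditional fifo_pop() with a single pop whose return value decides whether to
keep looping. A minimal userspace sketch of that pattern, assuming a toy ring
buffer rather than bcache's fifo macros:

    #include <stdbool.h>
    #include <stdio.h>

    struct fifo {
        long data[16];
        unsigned int front, back;   /* front == back means empty */
    };

    static bool fifo_push(struct fifo *f, long v)
    {
        if (f->back - f->front == 16)
            return false;           /* full */
        f->data[f->back++ & 15] = v;
        return true;
    }

    /* Returns false instead of popping garbage when the fifo is empty. */
    static bool fifo_pop(struct fifo *f, long *out)
    {
        if (f->front == f->back)
            return false;
        *out = f->data[f->front++ & 15];
        return true;
    }

    int main(void)
    {
        struct fifo f = { .front = 0, .back = 0 };
        long bucket;

        fifo_push(&f, 3);
        fifo_push(&f, 5);

        /* The pop itself reports emptiness, so there is no window between
         * an emptiness check and the pop that consumes the element. */
        while (1) {
            if (!fifo_pop(&f, &bucket))
                break;
            printf("freed bucket %ld\n", bucket);
        }
        return 0;
    }
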
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 954dad2..013e35a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -627,6 +627,20 @@
struct bkey gc_done;
/*
+	 * For automatic garbage collection after writeback completes, this
+	 * variable is used as a bit field:
+	 * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
+	 * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback
+	 * This is an optimization for write requests that follow a finished
+	 * writeback, where the read hit rate drops because clean data on the
+	 * cache has been discarded. Unless the user explicitly enables it via
+	 * sysfs, it stays disabled.
+ */
+#define BCH_ENABLE_AUTO_GC 1
+#define BCH_DO_AUTO_GC 2
+ uint8_t gc_after_writeback;
+
+ /*
* The allocation code needs gc_mark in struct bucket to be correct, but
* it's not while a gc is in progress. Protected by bucket_lock.
*/
@@ -658,7 +672,11 @@
/*
* A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - have to dynamically allocate them
+ * on the stack - have to dynamically allocate them.
+	 * bch_cache_set_alloc() will make sure the pool can allocate iterators
+	 * with enough room to host (sb.bucket_size / sb.block_size)
+	 * btree_iter_sets, which is more than the static MAX_BSETS.
*/
mempool_t fill_iter;
@@ -687,8 +705,8 @@
atomic_long_t writeback_keys_failed;
atomic_long_t reclaim;
+ atomic_long_t reclaimed_journal_buckets;
atomic_long_t flush_write;
- atomic_long_t retry_flush_write;
enum {
ON_ERROR_UNREGISTER,
@@ -708,8 +726,6 @@
#define BUCKET_HASH_BITS 12
struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
-
- DECLARE_HEAP(struct btree *, flush_btree);
};
struct bbio {
@@ -988,7 +1004,7 @@
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
uint8_t *set_uuid);
void bch_cached_dev_detach(struct cached_dev *dc);
-void bch_cached_dev_run(struct cached_dev *dc);
+int bch_cached_dev_run(struct cached_dev *dc);
void bcache_device_stop(struct bcache_device *d);
void bch_cache_set_unregister(struct cache_set *c);
@@ -1004,7 +1020,7 @@
int bch_cache_allocator_start(struct cache *ca);
void bch_debug_exit(void);
-void bch_debug_init(struct kobject *kobj);
+void bch_debug_init(void);
void bch_request_exit(void);
int bch_request_init(void);
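
The gc_after_writeback comment above describes a two-bit handshake between a
sysfs switch and the writeback path. The sketch below is one plausible reading
of that lifecycle; the helper names and the exact transitions are illustrative
assumptions, not code from this patch:

    #include <stdint.h>
    #include <stdio.h>

    #define BCH_ENABLE_AUTO_GC  1   /* user opted in via sysfs       */
    #define BCH_DO_AUTO_GC      2   /* writeback armed a gc request  */

    static uint8_t gc_after_writeback;

    static void sysfs_enable_auto_gc(void)
    {
        gc_after_writeback |= BCH_ENABLE_AUTO_GC;
    }

    static void writeback_saw_dirty_data(void)
    {
        /* Only arm the gc request if the user enabled the feature. */
        if (gc_after_writeback & BCH_ENABLE_AUTO_GC)
            gc_after_writeback |= BCH_DO_AUTO_GC;
    }

    static void writeback_finished(void)
    {
        if (gc_after_writeback == (BCH_ENABLE_AUTO_GC | BCH_DO_AUTO_GC)) {
            /* Consume the request, keep the enable bit. */
            gc_after_writeback &= ~BCH_DO_AUTO_GC;
            printf("would wake up gc here\n");
        }
    }

    int main(void)
    {
        writeback_finished();        /* nothing happens: not enabled */
        sysfs_enable_auto_gc();
        writeback_saw_dirty_data();
        writeback_finished();        /* now gc would be kicked       */
        return 0;
    }
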
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 8f07fa6..0876879 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -347,22 +347,19 @@
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
bool *expensive_debug_checks)
{
- unsigned int i;
-
b->ops = ops;
b->expensive_debug_checks = expensive_debug_checks;
b->nsets = 0;
b->last_set_unwritten = 0;
- /* XXX: shouldn't be needed */
- for (i = 0; i < MAX_BSETS; i++)
- b->set[i].size = 0;
/*
- * Second loop starts at 1 because b->keys[0]->data is the memory we
- * allocated
+	 * struct btree_keys is embedded in struct btree, and struct
+	 * bset_tree is embedded in struct btree_keys. They are all
+	 * initialized to 0 by kzalloc() in mca_bucket_alloc(), and
+	 * b->set[0].data is allocated in bch_btree_keys_alloc(), so we
+	 * don't have to initialize b->set[].size and b->set[].data here
+	 * any more.
*/
- for (i = 1; i < MAX_BSETS; i++)
- b->set[i].data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_init);
@@ -887,12 +884,22 @@
struct bset *i = bset_tree_last(b)->data;
struct bkey *m, *prev = NULL;
struct btree_iter iter;
+ struct bkey preceding_key_on_stack = ZERO_KEY;
+ struct bkey *preceding_key_p = &preceding_key_on_stack;
BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
- m = bch_btree_iter_init(b, &iter, b->ops->is_extents
- ? PRECEDING_KEY(&START_KEY(k))
- : PRECEDING_KEY(k));
+ /*
+	 * If k has a preceding key, preceding_key_p will be set to the
+	 * address of k's preceding key; otherwise preceding_key_p will
+	 * be set to NULL inside preceding_key().
+ */
+ if (b->ops->is_extents)
+ preceding_key(&START_KEY(k), &preceding_key_p);
+ else
+ preceding_key(k, &preceding_key_p);
+
+ m = bch_btree_iter_init(b, &iter, preceding_key_p);
if (b->ops->insert_fixup(b, k, &iter, replace_key))
return status;
@@ -960,45 +967,25 @@
unsigned int inorder, j, n = 1;
do {
- /*
- * A bit trick here.
- * If p < t->size, (int)(p - t->size) is a minus value and
- * the most significant bit is set, right shifting 31 bits
- * gets 1. If p >= t->size, the most significant bit is
- * not set, right shifting 31 bits gets 0.
- * So the following 2 lines equals to
- * if (p >= t->size)
- * p = 0;
- * but a branch instruction is avoided.
- */
unsigned int p = n << 4;
- p &= ((int) (p - t->size)) >> 31;
-
- prefetch(&t->tree[p]);
+ if (p < t->size)
+ prefetch(&t->tree[p]);
j = n;
f = &t->tree[j];
- /*
- * Similar bit trick, use subtract operation to avoid a branch
- * instruction.
- *
- * n = (f->mantissa > bfloat_mantissa())
- * ? j * 2
- * : j * 2 + 1;
- *
- * We need to subtract 1 from f->mantissa for the sign bit trick
- * to work - that's done in make_bfloat()
- */
- if (likely(f->exponent != 127))
- n = j * 2 + (((unsigned int)
- (f->mantissa -
- bfloat_mantissa(search, f))) >> 31);
- else
- n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
- ? j * 2
- : j * 2 + 1;
+ if (likely(f->exponent != 127)) {
+ if (f->mantissa >= bfloat_mantissa(search, f))
+ n = j * 2;
+ else
+ n = j * 2 + 1;
+ } else {
+ if (bkey_cmp(tree_to_bkey(t, j), search) > 0)
+ n = j * 2;
+ else
+ n = j * 2 + 1;
+ }
} while (n < t->size);
inorder = to_inorder(j, t);
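
The bset.c hunk above trades two sign-bit tricks for explicit comparisons, as
the removed comments themselves spell out. The arithmetic identity behind the
descent step is easy to check in isolation; the sample below uses plain 16-bit
mantissa values and ignores the mantissa pre-adjustment done in make_bfloat():

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static unsigned int next_branchless(unsigned int j, uint16_t node_m,
                                        uint16_t search_m)
    {
        /* (node_m - search_m) is negative iff node_m < search_m; shifting
         * the sign bit down 31 places yields 0 or 1 without a branch. */
        return j * 2 + (((unsigned int)(node_m - search_m)) >> 31);
    }

    static unsigned int next_branchy(unsigned int j, uint16_t node_m,
                                     uint16_t search_m)
    {
        return (node_m >= search_m) ? j * 2 : j * 2 + 1;
    }

    int main(void)
    {
        uint16_t samples[] = { 0, 1, 7, 100, 30000, 65535 };
        unsigned int a, b, j;

        for (a = 0; a < 6; a++)
            for (b = 0; b < 6; b++)
                for (j = 1; j < 8; j++)
                    assert(next_branchless(j, samples[a], samples[b]) ==
                           next_branchy(j, samples[a], samples[b]));
        printf("both descent steps agree\n");
        return 0;
    }
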
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index bac76aa..c71365e 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -434,20 +434,26 @@
return __bch_cut_back(where, k);
}
-#define PRECEDING_KEY(_k) \
-({ \
- struct bkey *_ret = NULL; \
- \
- if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
- _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
- \
- if (!_ret->low) \
- _ret->high--; \
- _ret->low--; \
- } \
- \
- _ret; \
-})
+/*
+ * Pointer '*preceding_key_p' points to a memory object that stores the
+ * preceding key of k. If the preceding key does not exist, '*preceding_key_p'
+ * is set to NULL. The caller of preceding_key() therefore needs to take care
+ * of the memory '*preceding_key_p' points to before calling preceding_key().
+ * Currently the only caller of preceding_key() is bch_btree_insert_key(),
+ * which points it at an on-stack variable, so the memory release is handled
+ * by the stack frame itself.
+ */
+static inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p)
+{
+ if (KEY_INODE(k) || KEY_OFFSET(k)) {
+ (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0);
+ if (!(*preceding_key_p)->low)
+ (*preceding_key_p)->high--;
+ (*preceding_key_p)->low--;
+ } else {
+ (*preceding_key_p) = NULL;
+ }
+}
static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
{
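
preceding_key() above computes "key minus one" across two 64-bit halves with a
manual borrow. A userspace sketch of just that borrow, using a stand-in struct
rather than the real packed bkey layout:

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct mini_key {
        uint64_t high;  /* roughly where the inode bits live in a bkey */
        uint64_t low;   /* roughly where the offset lives in a bkey    */
    };

    /* Fill *prev with (k - 1) treated as a 128-bit value, or report that
     * no preceding key exists because k is already the zero key. */
    static bool mini_preceding_key(const struct mini_key *k,
                                   struct mini_key *prev)
    {
        if (!k->high && !k->low)
            return false;           /* the zero key has no predecessor */

        *prev = *k;
        if (!prev->low)
            prev->high--;           /* borrow from the high half */
        prev->low--;
        return true;
    }

    int main(void)
    {
        struct mini_key k = { .high = 1, .low = 0 }, prev;

        if (mini_preceding_key(&k, &prev))
            printf("prev = { %" PRIu64 ", %" PRIu64 " }\n",
                   prev.high, prev.low);
        return 0;
    }
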
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3f4211b..ba434d9 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -35,7 +35,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
#include <linux/rculist.h>
-
+#include <linux/delay.h>
#include <trace/events/bcache.h>
/*
@@ -207,6 +207,11 @@
struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
+ /*
+ * c->fill_iter can allocate an iterator with more memory space
+ * than static MAX_BSETS.
+	 * See the comment around cache_set->fill_iter.
+ */
iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
@@ -424,13 +429,14 @@
bset_sector_offset(&b->keys, i));
if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
- int j;
struct bio_vec *bv;
- void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+ void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, b->bio, j)
- memcpy(page_address(bv->bv_page),
- base + j * PAGE_SIZE, PAGE_SIZE);
+ bio_for_each_segment_all(bv, b->bio, iter_all) {
+ memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
+ addr += PAGE_SIZE;
+ }
bch_submit_bbio(b->bio, b->c, &k.key, 0);
@@ -607,6 +613,10 @@
static struct btree *mca_bucket_alloc(struct cache_set *c,
struct bkey *k, gfp_t gfp)
{
+ /*
+ * kzalloc() is necessary here for initialization,
+ * see code comments in bch_btree_keys_init().
+ */
struct btree *b = kzalloc(sizeof(struct btree), gfp);
if (!b)
@@ -649,7 +659,25 @@
up(&b->io_mutex);
}
+retry:
+ /*
+	 * BTREE_NODE_dirty might be cleared in btree_flush_write() by
+ * __bch_btree_node_write(). To avoid an extra flush, acquire
+ * b->write_lock before checking BTREE_NODE_dirty bit.
+ */
mutex_lock(&b->write_lock);
+ /*
+ * If this btree node is selected in btree_flush_write() by journal
+ * code, delay and retry until the node is flushed by journal code
+ * and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
+ */
+ if (btree_node_journal_flush(b)) {
+ pr_debug("bnode %p is flushing by journal, retry", b);
+ mutex_unlock(&b->write_lock);
+ udelay(1);
+ goto retry;
+ }
+
if (btree_node_dirty(b))
__bch_btree_node_write(b, &cl);
mutex_unlock(&b->write_lock);
@@ -772,10 +800,15 @@
while (!list_empty(&c->btree_cache)) {
b = list_first_entry(&c->btree_cache, struct btree, list);
- if (btree_node_dirty(b))
+ /*
+		 * This function is called by cache_set_free(); there is
+		 * no I/O request on the cache now, so it is unnecessary
+		 * to acquire b->write_lock before clearing BTREE_NODE_dirty.
+ */
+ if (btree_node_dirty(b)) {
btree_complete_write(b, btree_current_write(b));
- clear_bit(BTREE_NODE_dirty, &b->flags);
-
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+ }
mca_data_free(b);
}
@@ -1061,11 +1094,25 @@
BUG_ON(b == b->c->root);
+retry:
mutex_lock(&b->write_lock);
+ /*
+	 * If the btree node is selected and being flushed in
+	 * btree_flush_write(), delay and retry until the
+	 * BTREE_NODE_journal_flush bit is cleared; only then is it safe
+	 * to free the btree node here. Otherwise this btree node would
+	 * be involved in a race condition.
+ */
+ if (btree_node_journal_flush(b)) {
+ mutex_unlock(&b->write_lock);
+ pr_debug("bnode %p journal_flush set, retry", b);
+ udelay(1);
+ goto retry;
+ }
- if (btree_node_dirty(b))
+ if (btree_node_dirty(b)) {
btree_complete_write(b, btree_current_write(b));
- clear_bit(BTREE_NODE_dirty, &b->flags);
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+ }
mutex_unlock(&b->write_lock);
@@ -1470,11 +1517,11 @@
out_nocoalesce:
closure_sync(&cl);
- bch_keylist_free(&keylist);
while ((k = bch_keylist_pop(&keylist)))
if (!bkey_cmp(k, &ZERO_KEY))
atomic_dec(&b->c->prio_blocked);
+ bch_keylist_free(&keylist);
for (i = 0; i < nodes; i++)
if (!IS_ERR_OR_NULL(new_nodes[i])) {
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index a68d6c5..76cfd12 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -158,11 +158,13 @@
BTREE_NODE_io_error,
BTREE_NODE_dirty,
BTREE_NODE_write_idx,
+ BTREE_NODE_journal_flush,
};
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
+BTREE_FLAG(journal_flush);
static inline struct btree_write *btree_current_write(struct btree *b)
{
@@ -266,6 +268,24 @@
wake_up(&c->gc_wait);
}
+static inline void force_wake_up_gc(struct cache_set *c)
+{
+ /*
+	 * The garbage collection thread only works when sectors_to_gc < 0,
+	 * so calling wake_up_gc() won't start the gc thread if sectors_to_gc
+	 * is not a negative value.
+	 * Therefore sectors_to_gc is set to -1 here, before waking up the
+	 * gc thread by calling wake_up_gc(). Then gc_should_run() will give
+	 * the gc thread a chance to run. "A chance" means that before
+	 * gc_should_run() is reached, c->sectors_to_gc may still be set to
+	 * another positive value. So this routine does not guarantee that
+	 * the gc thread will be woken up to run.
+ */
+ atomic_set(&c->sectors_to_gc, -1);
+ wake_up_gc(c);
+}
+
#define MAP_DONE 0
#define MAP_CONTINUE 1
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 73f5319..c12cd80 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -105,8 +105,14 @@
static void closure_sync_fn(struct closure *cl)
{
- cl->s->done = 1;
- wake_up_process(cl->s->task);
+ struct closure_syncer *s = cl->s;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = READ_ONCE(s->task);
+ s->done = 1;
+ wake_up_process(p);
+ rcu_read_unlock();
}
void __sched __closure_sync(struct closure *cl)
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index eca0d49..c88cdc4 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -345,7 +345,8 @@
} while (0)
/**
- * closure_return - finish execution of a closure, with destructor
+ * closure_return_with_destructor - finish execution of a closure,
+ * with destructor
*
* Works like closure_return(), except @destructor will be called when all
* outstanding refs on @cl have been dropped; @destructor may be used to safely
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 06da66b..336f439 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -178,10 +178,9 @@
while (size) {
struct keybuf_key *w;
unsigned int bytes = min(i->bytes, size);
- int err = copy_to_user(buf, i->buf, bytes);
- if (err)
- return err;
+ if (copy_to_user(buf, i->buf, bytes))
+ return -EFAULT;
ret += bytes;
buf += bytes;
@@ -249,11 +248,10 @@
void bch_debug_exit(void)
{
- if (!IS_ERR_OR_NULL(bcache_debug))
- debugfs_remove_recursive(bcache_debug);
+ debugfs_remove_recursive(bcache_debug);
}
-void __init bch_debug_init(struct kobject *kobj)
+void __init bch_debug_init(void)
{
/*
* it is unnecessary to check return value of
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index c809724..8867100 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -538,6 +538,7 @@
{
struct btree *b = container_of(bk, struct btree, keys);
unsigned int i, stale;
+ char buf[80];
if (!KEY_PTRS(k) ||
bch_extent_invalid(bk, k))
@@ -547,19 +548,19 @@
if (!ptr_available(b->c, k, i))
return true;
- if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
- return false;
-
for (i = 0; i < KEY_PTRS(k); i++) {
stale = ptr_stale(b->c, k, i);
- btree_bug_on(stale > 96, b,
+ if (stale && KEY_DIRTY(k)) {
+ bch_extent_to_text(buf, sizeof(buf), k);
+ pr_info("stale dirty pointer, stale %u, key: %s",
+ stale, buf);
+ }
+
+ btree_bug_on(stale > BUCKET_GC_GEN_MAX, b,
"key too stale: %i, need_gc %u",
stale, b->c->need_gc);
- btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
- b, "stale dirty pointer");
-
if (stale)
return true;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index c250979..4d93f07 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -58,6 +58,18 @@
WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
+ /*
+	 * Read-ahead requests on a degraded and recovering md raid
+	 * (e.g. raid6) device might be failed immediately by the md
+	 * raid code, which is not a real hardware media failure. So
+	 * we shouldn't count a failed REQ_RAHEAD bio toward dc->io_errors.
+ */
+ if (bio->bi_opf & REQ_RAHEAD) {
+ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore",
+ dc->backing_dev_name);
+ return;
+ }
+
errors = atomic_add_return(1, &dc->io_errors);
if (errors < dc->error_limit)
pr_err("%s: IO error on backing device, unrecoverable",
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 522c742..be2a2a2 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -100,6 +100,20 @@
blocks = set_blocks(j, block_bytes(ca->set));
+ /*
+		 * Nodes in 'list' are in linearly increasing order of
+		 * i->j.seq; the node at the head has the smallest (oldest)
+		 * journal seq, and the node at the tail has the biggest
+		 * (latest) journal seq.
+ */
+
+ /*
+		 * Check from the oldest jset for last_seq. If
+		 * i->j.seq < j->last_seq, it means the oldest jset
+		 * in the list is expired and useless, so remove it
+		 * from this list. Otherwise, j is a candidate jset
+		 * for the following checks.
+ */
while (!list_empty(list)) {
i = list_first_entry(list,
struct journal_replay, list);
@@ -109,13 +123,22 @@
kfree(i);
}
+ /* iterate list in reverse order (from latest jset) */
list_for_each_entry_reverse(i, list, list) {
if (j->seq == i->j.seq)
goto next_set;
+ /*
+			 * If j->seq is less than any i->j.last_seq
+			 * in the list, j is an expired and useless jset.
+ */
if (j->seq < i->j.last_seq)
goto next_set;
+ /*
+			 * 'where' points to the first jset in the list
+			 * which is older than j.
+ */
if (j->seq > i->j.seq) {
where = &i->list;
goto add;
@@ -129,10 +152,12 @@
if (!i)
return -ENOMEM;
memcpy(&i->j, j, bytes);
+ /* Add to the location after 'where' points to */
list_add(&i->list, where);
ret = 1;
- ja->seq[bucket_index] = j->seq;
+ if (j->seq > ja->seq[bucket_index])
+ ja->seq[bucket_index] = j->seq;
next_set:
offset += blocks * ca->sb.block_size;
len -= blocks * ca->sb.block_size;
@@ -147,7 +172,7 @@
{
#define read_bucket(b) \
({ \
- int ret = journal_read_bucket(ca, list, b); \
+ ret = journal_read_bucket(ca, list, b); \
__set_bit(b, bitmap); \
if (ret < 0) \
return ret; \
@@ -156,6 +181,7 @@
struct cache *ca;
unsigned int iter;
+ int ret = 0;
for_each_cache(ca, c, iter) {
struct journal_device *ja = &ca->journal;
@@ -317,6 +343,18 @@
}
}
+static bool is_discard_enabled(struct cache_set *s)
+{
+ struct cache *ca;
+ unsigned int i;
+
+ for_each_cache(ca, s, i)
+ if (ca->discard)
+ return true;
+
+ return false;
+}
+
int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
int ret = 0, keys = 0, entries = 0;
@@ -330,9 +368,17 @@
list_for_each_entry(i, list, list) {
BUG_ON(i->pin && atomic_read(i->pin) != 1);
- cache_set_err_on(n != i->j.seq, s,
-"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
- n, i->j.seq - 1, start, end);
+ if (n != i->j.seq) {
+ if (n == start && is_discard_enabled(s))
+ pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)",
+ n, i->j.seq - 1, start, end);
+ else {
+ pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ n, i->j.seq - 1, start, end);
+ ret = -EIO;
+ goto err;
+ }
+ }
for (k = i->j.start;
k < bset_bkey_last(&i->j);
@@ -370,60 +416,90 @@
}
/* Journalling */
-#define journal_max_cmp(l, r) \
- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
-#define journal_min_cmp(l, r) \
- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
static void btree_flush_write(struct cache_set *c)
{
- /*
- * Try to find the btree node with that references the oldest journal
- * entry, best is our current candidate and is locked if non NULL:
- */
- struct btree *b;
- int i;
+ struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
+ unsigned int i, n;
+
+ if (c->journal.btree_flushing)
+ return;
+
+ spin_lock(&c->journal.flush_write_lock);
+ if (c->journal.btree_flushing) {
+ spin_unlock(&c->journal.flush_write_lock);
+ return;
+ }
+ c->journal.btree_flushing = true;
+ spin_unlock(&c->journal.flush_write_lock);
atomic_long_inc(&c->flush_write);
+ memset(btree_nodes, 0, sizeof(btree_nodes));
+ n = 0;
-retry:
- spin_lock(&c->journal.lock);
- if (heap_empty(&c->flush_btree)) {
- for_each_cached_btree(b, c, i)
- if (btree_current_write(b)->journal) {
- if (!heap_full(&c->flush_btree))
- heap_add(&c->flush_btree, b,
- journal_max_cmp);
- else if (journal_max_cmp(b,
- heap_peek(&c->flush_btree))) {
- c->flush_btree.data[0] = b;
- heap_sift(&c->flush_btree, 0,
- journal_max_cmp);
- }
- }
+ mutex_lock(&c->bucket_lock);
+ list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+ if (btree_node_journal_flush(b))
+ pr_err("BUG: flush_write bit should not be set here!");
- for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
- heap_sift(&c->flush_btree, i, journal_min_cmp);
- }
-
- b = NULL;
- heap_pop(&c->flush_btree, b, journal_min_cmp);
- spin_unlock(&c->journal.lock);
-
- if (b) {
mutex_lock(&b->write_lock);
+
+ if (!btree_node_dirty(b)) {
+ mutex_unlock(&b->write_lock);
+ continue;
+ }
+
if (!btree_current_write(b)->journal) {
mutex_unlock(&b->write_lock);
- /* We raced */
- atomic_long_inc(&c->retry_flush_write);
- goto retry;
+ continue;
+ }
+
+ set_btree_node_journal_flush(b);
+
+ mutex_unlock(&b->write_lock);
+
+ btree_nodes[n++] = b;
+ if (n == BTREE_FLUSH_NR)
+ break;
+ }
+ mutex_unlock(&c->bucket_lock);
+
+ for (i = 0; i < n; i++) {
+ b = btree_nodes[i];
+ if (!b) {
+ pr_err("BUG: btree_nodes[%d] is NULL", i);
+ continue;
+ }
+
+ /* safe to check without holding b->write_lock */
+ if (!btree_node_journal_flush(b)) {
+ pr_err("BUG: bnode %p: journal_flush bit cleaned", b);
+ continue;
+ }
+
+ mutex_lock(&b->write_lock);
+ if (!btree_current_write(b)->journal) {
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
+ mutex_unlock(&b->write_lock);
+ pr_debug("bnode %p: written by others", b);
+ continue;
+ }
+
+ if (!btree_node_dirty(b)) {
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
+ mutex_unlock(&b->write_lock);
+ pr_debug("bnode %p: dirty bit cleaned by others", b);
+ continue;
}
__bch_btree_node_write(b, NULL);
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
}
+
+ spin_lock(&c->journal.flush_write_lock);
+ c->journal.btree_flushing = false;
+ spin_unlock(&c->journal.flush_write_lock);
}
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
@@ -538,13 +614,14 @@
k->ptr[n++] = MAKE_PTR(0,
bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
ca->sb.nr_this_dev);
+ atomic_long_inc(&c->reclaimed_journal_buckets);
}
- bkey_init(k);
- SET_KEY_PTRS(k, n);
-
- if (n)
+ if (n) {
+ bkey_init(k);
+ SET_KEY_PTRS(k, n);
c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+ }
out:
if (!journal_full(&c->journal))
__closure_wake_up(&c->journal.wait);
@@ -663,7 +740,7 @@
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
bch_bio_map(bio, w->data);
- trace_bcache_journal_write(bio);
+ trace_bcache_journal_write(bio, w->data->keys);
bio_list_add(&list, bio);
SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
@@ -671,6 +748,9 @@
ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
}
+ /* If KEY_PTRS(k) == 0, this jset gets lost in air */
+ BUG_ON(i == 0);
+
atomic_dec_bug(&fifo_back(&c->journal.pin));
bch_journal_next(&c->journal);
journal_reclaim(c);
@@ -787,6 +867,10 @@
struct journal_write *w;
atomic_t *ret;
+ /* No journaling if CACHE_SET_IO_DISABLE set already */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+ return NULL;
+
if (!CACHE_SYNC(&c->sb))
return NULL;
@@ -831,7 +915,6 @@
free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
free_fifo(&c->journal.pin);
- free_heap(&c->flush_btree);
}
int bch_journal_alloc(struct cache_set *c)
@@ -839,6 +922,7 @@
struct journal *j = &c->journal;
spin_lock_init(&j->lock);
+ spin_lock_init(&j->flush_write_lock);
INIT_DELAYED_WORK(&j->work, journal_write_work);
c->journal_delay_ms = 100;
@@ -846,8 +930,7 @@
j->w[0].c = c;
j->w[1].c = c;
- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
- !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
return -ENOMEM;
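
The rewritten btree_flush_write() above collects up to BTREE_FLUSH_NR candidate
nodes under a lock, marks them, and only issues the writes once the lock is
dropped. A rough pthread sketch of that collect-then-flush shape, using a single
toy lock and toy types (the kernel uses bucket_lock plus each node's
write_lock):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define FLUSH_NR 8

    struct node {
        struct node *next;
        bool dirty;
        bool flushing;      /* analogous to BTREE_NODE_journal_flush */
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *cache_list;

    static void flush_some(void)
    {
        struct node *b, *picked[FLUSH_NR];
        unsigned int i, n = 0;

        pthread_mutex_lock(&list_lock);
        for (b = cache_list; b && n < FLUSH_NR; b = b->next) {
            if (!b->dirty || b->flushing)
                continue;
            b->flushing = true;     /* reserve it while the lock is held */
            picked[n++] = b;
        }
        pthread_mutex_unlock(&list_lock);

        /* Slow "I/O" happens without the list lock held. */
        for (i = 0; i < n; i++) {
            printf("flushing node %p\n", (void *)picked[i]);
            picked[i]->dirty = false;
            picked[i]->flushing = false;
        }
    }

    int main(void)
    {
        struct node nodes[3] = {
            { &nodes[1], true,  false },
            { &nodes[2], false, false },
            { NULL,      true,  false },
        };

        cache_list = &nodes[0];
        flush_some();
        return 0;
    }
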
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 66f0fac..f2ea34d 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -103,6 +103,8 @@
/* Embedded in struct cache_set */
struct journal {
spinlock_t lock;
+ spinlock_t flush_write_lock;
+ bool btree_flushing;
/* used when waiting because the journal was full */
struct closure_waitlist wait;
struct closure io;
@@ -154,6 +156,8 @@
struct bio_vec bv[8];
};
+#define BTREE_FLUSH_NR 8
+
#define journal_pin_cmp(c, l, r) \
(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 22944aa..41adcd1 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,11 +311,11 @@
* data is written it calls bch_journal, and after the keys have been added to
* the next journal write they're inserted into the btree.
*
- * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * It inserts the data in op->bio; bi_sector is used for the key offset,
* and op->inode is used for the key inode.
*
- * If s->bypass is true, instead of inserting the data it invalidates the
- * region of the cache represented by s->cache_bio and op->inode.
+ * If op->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
*/
void bch_data_insert(struct closure *cl)
{
@@ -329,12 +329,13 @@
bch_data_insert_start(cl);
}
-/* Congested? */
-
-unsigned int bch_get_congested(struct cache_set *c)
+/*
+ * Congested? Return 0 (not congested) or the limit (in sectors)
+ * beyond which we should bypass the cache due to congestion.
+ */
+unsigned int bch_get_congested(const struct cache_set *c)
{
int i;
- long rand;
if (!c->congested_read_threshold_us &&
!c->congested_write_threshold_us)
@@ -353,8 +354,7 @@
if (i > 0)
i = fract_exp_two(i, 6);
- rand = get_random_int();
- i -= bitmap_weight(&rand, BITS_PER_LONG);
+ i -= hweight32(get_random_u32());
return i > 0 ? i : 1;
}
@@ -376,7 +376,7 @@
{
struct cache_set *c = dc->disk.c;
unsigned int mode = cache_mode(dc);
- unsigned int sectors, congested = bch_get_congested(c);
+ unsigned int sectors, congested;
struct task_struct *task = current;
struct io *i;
@@ -392,10 +392,11 @@
/*
* Flag for bypass if the IO is for read-ahead or background,
- * unless the read-ahead request is for metadata (eg, for gfs2).
+ * unless the read-ahead request is for metadata
+ * (eg, for gfs2 or xfs).
*/
if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
- !(bio->bi_opf & REQ_META))
+ !(bio->bi_opf & (REQ_META|REQ_PRIO)))
goto skip;
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
@@ -411,6 +412,7 @@
goto rescale;
}
+ congested = bch_get_congested(c);
if (!congested && !dc->sequential_cutoff)
goto rescale;
@@ -705,14 +707,14 @@
{
struct search *s = container_of(cl, struct search, cl);
- atomic_dec(&s->d->c->search_inflight);
+ atomic_dec(&s->iop.c->search_inflight);
if (s->iop.bio)
bio_put(s->iop.bio);
bio_complete(s);
closure_debug_destroy(cl);
- mempool_free(s, &s->d->c->search);
+ mempool_free(s, &s->iop.c->search);
}
static inline struct search *search_alloc(struct bio *bio,
@@ -755,13 +757,13 @@
struct search *s = container_of(cl, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
- search_free(cl);
cached_dev_put(dc);
+ search_free(cl);
}
/* Process reads */
-static void cached_dev_cache_miss_done(struct closure *cl)
+static void cached_dev_read_error_done(struct closure *cl)
{
struct search *s = container_of(cl, struct search, cl);
@@ -799,7 +801,22 @@
closure_bio_submit(s->iop.c, bio, cl);
}
- continue_at(cl, cached_dev_cache_miss_done, NULL);
+ continue_at(cl, cached_dev_read_error_done, NULL);
+}
+
+static void cached_dev_cache_miss_done(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct bcache_device *d = s->d;
+
+ if (s->iop.replace_collision)
+ bch_mark_cache_miss_collision(s->iop.c, s->d);
+
+ if (s->iop.bio)
+ bio_free_pages(s->iop.bio);
+
+ cached_dev_bio_complete(cl);
+ closure_put(&d->cl);
}
static void cached_dev_read_done(struct closure *cl)
@@ -832,6 +849,7 @@
if (verify(dc) && s->recoverable && !s->read_dirty_data)
bch_data_verify(dc, s->orig_bio);
+ closure_get(&dc->disk.cl);
bio_complete(s);
if (s->iop.bio &&
@@ -877,7 +895,7 @@
}
if (!(bio->bi_opf & REQ_RAHEAD) &&
- !(bio->bi_opf & REQ_META) &&
+ !(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
reada = min_t(sector_t, dc->readahead >> 9,
get_capacity(bio->bi_disk) - bio_end_sector(bio));
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index aa055cf..c64dbd7 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -33,12 +33,12 @@
BKEY_PADDED(replace_key);
};
-unsigned int bch_get_congested(struct cache_set *c);
+unsigned int bch_get_congested(const struct cache_set *c);
void bch_data_insert(struct closure *cl);
void bch_cached_dev_request_init(struct cached_dev *dc);
void bch_flash_dev_request_init(struct bcache_device *d);
-extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
+extern struct kmem_cache *bch_search_cache;
#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 894410f..ba1c937 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -111,7 +111,7 @@
{
memset(&acc->total.cache_hits,
0,
- sizeof(unsigned long) * 7);
+ sizeof(struct cache_stats));
}
void bch_cache_accounting_destroy(struct cache_accounting *acc)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 03bb5ce..20ed838 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -25,8 +25,8 @@
#include <linux/reboot.h>
#include <linux/sysfs.h>
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+unsigned int bch_cutoff_writeback;
+unsigned int bch_cutoff_writeback_sync;
static const char bcache_magic[] = {
0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
@@ -40,6 +40,7 @@
static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
+bool bcache_is_reboot;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);
@@ -49,6 +50,7 @@
struct workqueue_struct *bcache_wq;
struct workqueue_struct *bch_journal_wq;
+
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
/* limitation of partitions number on single bcache device */
#define BCACHE_MINORS 128
@@ -197,7 +199,9 @@
static void write_bdev_super_endio(struct bio *bio)
{
struct cached_dev *dc = bio->bi_private;
- /* XXX: error checking */
+
+ if (bio->bi_status)
+ bch_count_backing_io_errors(dc, bio);
closure_put(&dc->sb_write);
}
@@ -418,6 +422,7 @@
{
BKEY_PADDED(key) k;
struct closure cl;
+ struct cache *ca;
closure_init_stack(&cl);
lockdep_assert_held(&bch_register_lock);
@@ -429,6 +434,10 @@
uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
closure_sync(&cl);
+ /* Only one bucket used for uuid write */
+ ca = PTR_CACHE(c, &k.key, 0);
+ atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
+
bkey_copy(&c->uuid_bucket, &k.key);
bkey_put(c, &k.key);
return 0;
@@ -657,6 +666,11 @@
void bcache_device_stop(struct bcache_device *d)
{
if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
+ /*
+ * closure_fn set to
+ * - cached device: cached_dev_flush()
+ * - flash dev: flash_dev_flush()
+ */
closure_queue(&d->cl);
}
@@ -681,6 +695,7 @@
{
unsigned int i;
struct cache *ca;
+ int ret;
for_each_cache(ca, d->c, i)
bd_link_disk_holder(ca->bdev, d->disk);
@@ -688,9 +703,13 @@
snprintf(d->name, BCACHEDEVNAME_SIZE,
"%s%u", name, d->id);
- WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
- sysfs_create_link(&c->kobj, &d->kobj, d->name),
- "Couldn't create device <-> cache set symlinks");
+ ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
+ if (ret < 0)
+ pr_err("Couldn't create device -> cache set symlink");
+
+ ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
+ if (ret < 0)
+ pr_err("Couldn't create cache set -> device symlink");
clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}
@@ -898,25 +917,33 @@
}
-void bch_cached_dev_run(struct cached_dev *dc)
+int bch_cached_dev_run(struct cached_dev *dc)
{
struct bcache_device *d = &dc->disk;
- char buf[SB_LABEL_SIZE + 1];
+ char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
char *env[] = {
"DRIVER=bcache",
kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
- NULL,
+ kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
NULL,
};
- memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
- buf[SB_LABEL_SIZE] = '\0';
- env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
+ if (dc->io_disable) {
+ pr_err("I/O disabled on cached dev %s",
+ dc->backing_dev_name);
+ kfree(env[1]);
+ kfree(env[2]);
+ kfree(buf);
+ return -EIO;
+ }
if (atomic_xchg(&dc->running, 1)) {
kfree(env[1]);
kfree(env[2]);
- return;
+ kfree(buf);
+ pr_info("cached dev %s is running already",
+ dc->backing_dev_name);
+ return -EBUSY;
}
if (!d->c &&
@@ -939,10 +966,14 @@
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
kfree(env[1]);
kfree(env[2]);
+ kfree(buf);
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
- sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
- pr_debug("error creating sysfs link");
+ sysfs_create_link(&disk_to_dev(d->disk)->kobj,
+ &d->kobj, "bcache")) {
+ pr_err("Couldn't create bcache dev <-> disk sysfs symlinks");
+ return -ENOMEM;
+ }
dc->status_update_thread = kthread_run(cached_dev_status_update,
dc, "bcache_status_update");
@@ -951,6 +982,8 @@
"continue to run without monitoring backing "
"device status");
}
+
+ return 0;
}
/*
@@ -988,7 +1021,6 @@
BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
BUG_ON(refcount_read(&dc->count));
- mutex_lock(&bch_register_lock);
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
@@ -1004,6 +1036,9 @@
bch_write_bdev_super(dc, &cl);
closure_sync(&cl);
+ mutex_lock(&bch_register_lock);
+
+ calc_cached_dev_sectors(dc->disk.c);
bcache_device_detach(&dc->disk);
list_move(&dc->list, &uncached_devices);
@@ -1045,6 +1080,7 @@
uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
struct uuid_entry *u;
struct cached_dev *exist_dc, *t;
+ int ret = 0;
if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
(!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
@@ -1144,6 +1180,8 @@
down_write(&dc->writeback_lock);
if (bch_cached_dev_writeback_start(dc)) {
up_write(&dc->writeback_lock);
+ pr_err("Couldn't start writeback facilities for %s",
+ dc->disk.disk->disk_name);
return -ENOMEM;
}
@@ -1154,7 +1192,22 @@
bch_sectors_dirty_init(&dc->disk);
- bch_cached_dev_run(dc);
+ ret = bch_cached_dev_run(dc);
+ if (ret && (ret != -EBUSY)) {
+ up_write(&dc->writeback_lock);
+ /*
+		 * bch_register_lock is held, so bcache_device_stop()
+		 * cannot be called directly. The kthread and kworker
+		 * created previously in bch_cached_dev_writeback_start()
+		 * have to be stopped manually here.
+ */
+ kthread_stop(dc->writeback_thread);
+ cancel_writeback_rate_update_dwork(dc);
+ pr_err("Couldn't run cached device %s",
+ dc->backing_dev_name);
+ return ret;
+ }
+
bcache_device_link(&dc->disk, c, "bdev");
atomic_inc(&c->attached_dev_nr);
@@ -1168,6 +1221,7 @@
return 0;
}
+/* when dc->disk.kobj released */
void bch_cached_dev_release(struct kobject *kobj)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -1180,18 +1234,16 @@
{
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
- mutex_lock(&bch_register_lock);
-
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
if (!IS_ERR_OR_NULL(dc->writeback_thread))
kthread_stop(dc->writeback_thread);
- if (dc->writeback_write_wq)
- destroy_workqueue(dc->writeback_write_wq);
if (!IS_ERR_OR_NULL(dc->status_update_thread))
kthread_stop(dc->status_update_thread);
+ mutex_lock(&bch_register_lock);
+
if (atomic_read(&dc->running))
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
bcache_device_free(&dc->disk);
@@ -1274,12 +1326,13 @@
/* Cached device - bcache superblock */
-static void register_bdev(struct cache_sb *sb, struct page *sb_page,
+static int register_bdev(struct cache_sb *sb, struct page *sb_page,
struct block_device *bdev,
struct cached_dev *dc)
{
const char *err = "cannot allocate memory";
struct cache_set *c;
+ int ret = -ENOMEM;
bdevname(bdev, dc->backing_dev_name);
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
@@ -1309,17 +1362,23 @@
bch_cached_dev_attach(dc, c, NULL);
if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
- BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
- bch_cached_dev_run(dc);
+ BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
+ err = "failed to run cached device";
+ ret = bch_cached_dev_run(dc);
+ if (ret)
+ goto err;
+ }
- return;
+ return 0;
err:
pr_notice("error %s: %s", dc->backing_dev_name, err);
bcache_device_stop(&dc->disk);
+ return ret;
}
/* Flash only volumes */
+/* When d->kobj released */
void bch_flash_dev_release(struct kobject *kobj)
{
struct bcache_device *d = container_of(kobj, struct bcache_device,
@@ -1425,8 +1484,6 @@
bool bch_cached_dev_error(struct cached_dev *dc)
{
- struct cache_set *c;
-
if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
return false;
@@ -1437,21 +1494,6 @@
pr_err("stop %s: too many IO errors on backing device %s\n",
dc->disk.disk->disk_name, dc->backing_dev_name);
- /*
- * If the cached device is still attached to a cache set,
- * even dc->io_disable is true and no more I/O requests
- * accepted, cache device internal I/O (writeback scan or
- * garbage collection) may still prevent bcache device from
- * being stopped. So here CACHE_SET_IO_DISABLE should be
- * set to c->flags too, to make the internal I/O to cache
- * device rejected and stopped immediately.
- * If c is NULL, that means the bcache device is not attached
- * to any cache set, then no CACHE_SET_IO_DISABLE bit to set.
- */
- c = dc->disk.c;
- if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
- pr_info("CACHE_SET_IO_DISABLE already set");
-
bcache_device_stop(&dc->disk);
return true;
}
@@ -1490,6 +1532,7 @@
return true;
}
+/* When c->kobj released */
void bch_cache_set_release(struct kobject *kobj)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
@@ -1504,13 +1547,13 @@
struct cache *ca;
unsigned int i;
- if (!IS_ERR_OR_NULL(c->debug))
- debugfs_remove(c->debug);
+ debugfs_remove(c->debug);
bch_open_buckets_free(c);
bch_btree_cache_free(c);
bch_journal_free(c);
+ mutex_lock(&bch_register_lock);
for_each_cache(ca, c, i)
if (ca) {
ca->set = NULL;
@@ -1529,7 +1572,6 @@
mempool_exit(&c->search);
kfree(c->devices);
- mutex_lock(&bch_register_lock);
list_del(&c->list);
mutex_unlock(&bch_register_lock);
@@ -1552,19 +1594,23 @@
kobject_put(&c->internal);
kobject_del(&c->kobj);
- if (c->gc_thread)
+ if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
if (!IS_ERR_OR_NULL(c->root))
list_add(&c->root->list, &c->btree_cache);
- /* Should skip this if we're unregistering because of an error */
- list_for_each_entry(b, &c->btree_cache, list) {
- mutex_lock(&b->write_lock);
- if (btree_node_dirty(b))
- __bch_btree_node_write(b, NULL);
- mutex_unlock(&b->write_lock);
- }
+ /*
+	 * Avoid flushing cached nodes if the cache set is retiring
+	 * due to too many I/O errors detected.
+ */
+ if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+ list_for_each_entry(b, &c->btree_cache, list) {
+ mutex_lock(&b->write_lock);
+ if (btree_node_dirty(b))
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
+ }
for_each_cache(ca, c, i)
if (ca->alloc_thread)
@@ -1610,21 +1656,21 @@
*/
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
d->disk->disk_name);
- /*
- * There might be a small time gap that cache set is
- * released but bcache device is not. Inside this time
- * gap, regular I/O requests will directly go into
- * backing device as no cache set attached to. This
- * behavior may also introduce potential inconsistence
- * data in writeback mode while cache is dirty.
- * Therefore before calling bcache_device_stop() due
- * to a broken cache device, dc->io_disable should be
- * explicitly set to true.
- */
- dc->io_disable = true;
- /* make others know io_disable is true earlier */
- smp_mb();
- bcache_device_stop(d);
+ /*
+			 * There might be a small time gap in which the
+			 * cache set is released but the bcache device is
+			 * not. Inside this gap, regular I/O requests will
+			 * go directly to the backing device as no cache
+			 * set is attached. In writeback mode, while the
+			 * cache is dirty, this behavior may also introduce
+			 * inconsistent data.
+			 * Therefore, before calling bcache_device_stop()
+			 * due to a broken cache device, dc->io_disable
+			 * should be explicitly set to true.
+ */
+ dc->io_disable = true;
+ /* make others know io_disable is true earlier */
+ smp_mb();
+ bcache_device_stop(d);
} else {
/*
* dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
@@ -1668,6 +1714,7 @@
void bch_cache_set_stop(struct cache_set *c)
{
if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
+ /* closure_fn set to __cache_set_unregister() */
closure_queue(&c->caching);
}
@@ -1770,13 +1817,15 @@
return NULL;
}
-static void run_cache_set(struct cache_set *c)
+static int run_cache_set(struct cache_set *c)
{
const char *err = "cannot allocate memory";
struct cached_dev *dc, *t;
struct cache *ca;
struct closure cl;
unsigned int i;
+ LIST_HEAD(journal);
+ struct journal_replay *l;
closure_init_stack(&cl);
@@ -1785,7 +1834,6 @@
set_gc_sectors(c);
if (CACHE_SYNC(&c->sb)) {
- LIST_HEAD(journal);
struct bkey *k;
struct jset *j;
@@ -1835,6 +1883,23 @@
if (bch_btree_check(c))
goto err;
+ /*
+		 * bch_btree_check() may occupy too much system memory,
+		 * which has a negative effect on user space application
+		 * (e.g. database) performance. Shrink the mca cache memory
+		 * proactively here to avoid competing for memory with user
+		 * space workloads.
+ */
+ if (!c->shrinker_disabled) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = c->btree_cache_used * c->btree_pages;
+ /* first run to clear b->accessed tag */
+ c->shrink.scan_objects(&c->shrink, &sc);
+ /* second run to reap non-accessed nodes */
+ c->shrink.scan_objects(&c->shrink, &sc);
+ }
+
bch_journal_mark(c, &journal);
bch_initial_gc_finish(c);
pr_debug("btree_check() done");
@@ -1864,7 +1929,9 @@
if (j->version < BCACHE_JSET_VERSION_UUID)
__uuid_write(c);
- bch_journal_replay(c, &journal);
+ err = "bcache: replay journal failed";
+ if (bch_journal_replay(c, &journal))
+ goto err;
} else {
pr_notice("invalidating existing data");
@@ -1932,11 +1999,19 @@
flash_devs_run(c);
set_bit(CACHE_SET_RUNNING, &c->flags);
- return;
+ return 0;
err:
+ while (!list_empty(&journal)) {
+ l = list_first_entry(&journal, struct journal_replay, list);
+ list_del(&l->list);
+ kfree(l);
+ }
+
closure_sync(&cl);
- /* XXX: test this, it's broken */
+
bch_cache_set_error(c, "%s", err);
+
+ return -EIO;
}
static bool can_attach_cache(struct cache *ca, struct cache_set *c)
@@ -2000,8 +2075,11 @@
ca->set->cache[ca->sb.nr_this_dev] = ca;
c->cache_by_alloc[c->caches_loaded++] = ca;
- if (c->caches_loaded == c->sb.nr_in_set)
- run_cache_set(c);
+ if (c->caches_loaded == c->sb.nr_in_set) {
+ err = "failed to run cache set";
+ if (run_cache_set(c) < 0)
+ goto err;
+ }
return NULL;
err:
@@ -2011,6 +2089,7 @@
/* Cache device */
+/* When ca->kobj released */
void bch_cache_release(struct kobject *kobj)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
@@ -2046,6 +2125,8 @@
size_t free;
size_t btree_buckets;
struct bucket *b;
+ int ret = -ENOMEM;
+ const char *err = NULL;
__module_get(THIS_MODULE);
kobject_init(&ca->kobj, &bch_cache_ktype);
@@ -2063,27 +2144,93 @@
*/
btree_buckets = ca->sb.njournal_buckets ?: 8;
free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
+ if (!free) {
+ ret = -EPERM;
+ err = "ca->sb.nbuckets is too small";
+ goto err_free;
+ }
- if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
- !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
- !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
- !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
- !(ca->buckets = vzalloc(array_size(sizeof(struct bucket),
- ca->sb.nbuckets))) ||
- !(ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
- prio_buckets(ca), 2),
- GFP_KERNEL)) ||
- !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
- return -ENOMEM;
+ if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
+ GFP_KERNEL)) {
+ err = "ca->free[RESERVE_BTREE] alloc failed";
+ goto err_btree_alloc;
+ }
+
+ if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
+ GFP_KERNEL)) {
+ err = "ca->free[RESERVE_PRIO] alloc failed";
+ goto err_prio_alloc;
+ }
+
+ if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
+ err = "ca->free[RESERVE_MOVINGGC] alloc failed";
+ goto err_movinggc_alloc;
+ }
+
+ if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
+ err = "ca->free[RESERVE_NONE] alloc failed";
+ goto err_none_alloc;
+ }
+
+ if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
+ err = "ca->free_inc alloc failed";
+ goto err_free_inc_alloc;
+ }
+
+ if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
+ err = "ca->heap alloc failed";
+ goto err_heap_alloc;
+ }
+
+ ca->buckets = vzalloc(array_size(sizeof(struct bucket),
+ ca->sb.nbuckets));
+ if (!ca->buckets) {
+ err = "ca->buckets alloc failed";
+ goto err_buckets_alloc;
+ }
+
+ ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
+ prio_buckets(ca), 2),
+ GFP_KERNEL);
+ if (!ca->prio_buckets) {
+ err = "ca->prio_buckets alloc failed";
+ goto err_prio_buckets_alloc;
+ }
+
+ ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
+ if (!ca->disk_buckets) {
+ err = "ca->disk_buckets alloc failed";
+ goto err_disk_buckets_alloc;
+ }
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
for_each_bucket(b, ca)
atomic_set(&b->pin, 0);
-
return 0;
+
+err_disk_buckets_alloc:
+ kfree(ca->prio_buckets);
+err_prio_buckets_alloc:
+ vfree(ca->buckets);
+err_buckets_alloc:
+ free_heap(&ca->heap);
+err_heap_alloc:
+ free_fifo(&ca->free_inc);
+err_free_inc_alloc:
+ free_fifo(&ca->free[RESERVE_NONE]);
+err_none_alloc:
+ free_fifo(&ca->free[RESERVE_MOVINGGC]);
+err_movinggc_alloc:
+ free_fifo(&ca->free[RESERVE_PRIO]);
+err_prio_alloc:
+ free_fifo(&ca->free[RESERVE_BTREE]);
+err_btree_alloc:
+err_free:
+ module_put(THIS_MODULE);
+ if (err)
+ pr_notice("error %s: %s", ca->cache_dev_name, err);
+ return ret;
}
static int register_cache(struct cache_sb *sb, struct page *sb_page,
@@ -2106,9 +2253,17 @@
ret = cache_alloc(ca);
if (ret != 0) {
+ /*
+		 * If we failed here, it means ca->kobj is not initialized
+		 * yet, kobject_put() won't be called and there is no chance
+		 * to call blkdev_put() on bdev in bch_cache_release(). So
+		 * we explicitly call blkdev_put() here.
+ */
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM";
+ else if (ret == -EPERM)
+ err = "cache_alloc(): cache device is too small";
else
err = "cache_alloc(): unknown error";
goto err;
@@ -2147,9 +2302,13 @@
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size);
+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
+ struct kobj_attribute *attr,
+ const char *buffer, size_t size);
kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);
+kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
static bool bch_is_open_backing(struct block_device *bdev)
{
@@ -2187,7 +2346,7 @@
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
- ssize_t ret = size;
+ ssize_t ret = -EINVAL;
const char *err = "cannot allocate memory";
char *path = NULL;
struct cache_sb *sb = NULL;
@@ -2197,6 +2356,11 @@
if (!try_module_get(THIS_MODULE))
return -EBUSY;
+ /* For latest state of bcache_is_reboot */
+ smp_mb();
+ if (bcache_is_reboot)
+ return -EBUSY;
+
path = kstrndup(buffer, size, GFP_KERNEL);
if (!path)
goto err;
@@ -2221,7 +2385,7 @@
if (!IS_ERR(bdev))
bdput(bdev);
if (attr == &ksysfs_register_quiet)
- goto out;
+ goto quiet_out;
}
goto err;
}
@@ -2242,17 +2406,23 @@
goto err_close;
mutex_lock(&bch_register_lock);
- register_bdev(sb, sb_page, bdev, dc);
+ ret = register_bdev(sb, sb_page, bdev, dc);
mutex_unlock(&bch_register_lock);
+ /* blkdev_put() will be called in cached_dev_free() */
+ if (ret < 0)
+ goto err;
} else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto err_close;
+ /* blkdev_put() will be called in bch_cache_release() */
if (register_cache(sb, sb_page, bdev, ca) != 0)
goto err;
}
+quiet_out:
+ ret = size;
out:
if (sb_page)
put_page(sb_page);
@@ -2265,12 +2435,64 @@
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
pr_info("error %s: %s", path, err);
- ret = -EINVAL;
goto out;
}
+
+struct pdev {
+ struct list_head list;
+ struct cached_dev *dc;
+};
+
+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
+ struct kobj_attribute *attr,
+ const char *buffer,
+ size_t size)
+{
+ LIST_HEAD(pending_devs);
+ ssize_t ret = size;
+ struct cached_dev *dc, *tdc;
+ struct pdev *pdev, *tpdev;
+ struct cache_set *c, *tc;
+
+ mutex_lock(&bch_register_lock);
+ list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
+ pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
+ if (!pdev)
+ break;
+ pdev->dc = dc;
+ list_add(&pdev->list, &pending_devs);
+ }
+
+ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
+ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
+ char *pdev_set_uuid = pdev->dc->sb.set_uuid;
+ char *set_uuid = c->sb.uuid;
+
+ if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
+ list_del(&pdev->list);
+ kfree(pdev);
+ break;
+ }
+ }
+ }
+ mutex_unlock(&bch_register_lock);
+
+ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
+ pr_info("delete pdev %p", pdev);
+ list_del(&pdev->list);
+ bcache_device_stop(&pdev->dc->disk);
+ kfree(pdev);
+ }
+
+ return ret;
+}
+
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
+ if (bcache_is_reboot)
+ return NOTIFY_DONE;
+
if (code == SYS_DOWN ||
code == SYS_HALT ||
code == SYS_POWER_OFF) {
@@ -2283,22 +2505,57 @@
mutex_lock(&bch_register_lock);
+ if (bcache_is_reboot)
+ goto out;
+
+		/* New registrations are rejected from now on */
+ bcache_is_reboot = true;
+ /*
+		 * Make a registering caller (if there is one) on another
+		 * CPU core aware that bcache_is_reboot is set to true
+ */
+ smp_mb();
+
if (list_empty(&bch_cache_sets) &&
list_empty(&uncached_devices))
goto out;
+ mutex_unlock(&bch_register_lock);
+
pr_info("Stopping all devices:");
+ /*
+		 * The reason bch_register_lock is not held while calling
+		 * bch_cache_set_stop() and bcache_device_stop() is to
+		 * avoid a potential deadlock during reboot, because the
+		 * cache set or bcache device stopping process will acquire
+		 * bch_register_lock too.
+		 *
+		 * We are safe here because bcache_is_reboot is already set
+		 * to true, so register_bcache() will reject new
+		 * registrations now. bcache_is_reboot also makes sure
+		 * bcache_reboot() won't be re-entered by another thread,
+		 * so there is no race in the following list iteration by
+		 * list_for_each_entry_safe().
+ */
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
+
+ /*
+ * Give an early chance for other kthreads and
+ * kworkers to stop themselves
+ */
+ schedule();
+
/* What's a condition variable? */
while (1) {
- long timeout = start + 2 * HZ - jiffies;
+ long timeout = start + 10 * HZ - jiffies;
+ mutex_lock(&bch_register_lock);
stopped = list_empty(&bch_cache_sets) &&
list_empty(&uncached_devices);
@@ -2310,7 +2567,6 @@
mutex_unlock(&bch_register_lock);
schedule_timeout(timeout);
- mutex_lock(&bch_register_lock);
}
finish_wait(&unregister_wait, &wait);
@@ -2348,14 +2604,43 @@
mutex_destroy(&bch_register_lock);
}
+/* Check and fixup module parameters */
+static void check_module_parameters(void)
+{
+ if (bch_cutoff_writeback_sync == 0)
+ bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
+ else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
+ pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
+ bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
+ bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
+ }
+
+ if (bch_cutoff_writeback == 0)
+ bch_cutoff_writeback = CUTOFF_WRITEBACK;
+ else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
+ pr_warn("set bch_cutoff_writeback (%u) to max value %u",
+ bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
+ bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
+ }
+
+ if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
+ pr_warn("set bch_cutoff_writeback (%u) to %u",
+ bch_cutoff_writeback, bch_cutoff_writeback_sync);
+ bch_cutoff_writeback = bch_cutoff_writeback_sync;
+ }
+}
+
static int __init bcache_init(void)
{
static const struct attribute *files[] = {
&ksysfs_register.attr,
&ksysfs_register_quiet.attr,
+ &ksysfs_pendings_cleanup.attr,
NULL
};
+ check_module_parameters();
+
mutex_init(&bch_register_lock);
init_waitqueue_head(&unregister_wait);
register_reboot_notifier(&reboot);
@@ -2383,14 +2668,29 @@
sysfs_create_files(bcache_kobj, files))
goto err;
- bch_debug_init(bcache_kobj);
+ bch_debug_init();
closure_debug_init();
+ bcache_is_reboot = false;
+
return 0;
err:
bcache_exit();
return -ENOMEM;
}
+/*
+ * Module hooks
+ */
module_exit(bcache_exit);
module_init(bcache_init);
+
+module_param(bch_cutoff_writeback, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
+
+module_param(bch_cutoff_writeback_sync, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
+
+MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_LICENSE("GPL");
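
cache_alloc() above is rewritten into staged allocations with a reverse-order
unwind through goto labels. A compact userspace sketch of the same pattern,
using plain malloc()/free() instead of the kernel fifo/heap helpers:

    #include <stdio.h>
    #include <stdlib.h>

    struct cache_like {
        void *free_lists;
        void *buckets;
        void *prio_buckets;
    };

    static int cache_like_alloc(struct cache_like *ca)
    {
        const char *err = NULL;

        ca->free_lists = malloc(64);
        if (!ca->free_lists) {
            err = "free lists alloc failed";
            goto err_free_lists;
        }

        ca->buckets = malloc(4096);
        if (!ca->buckets) {
            err = "buckets alloc failed";
            goto err_buckets;
        }

        ca->prio_buckets = malloc(256);
        if (!ca->prio_buckets) {
            err = "prio buckets alloc failed";
            goto err_prio_buckets;
        }

        return 0;

        /* Unwind strictly in reverse order of the allocations above. */
    err_prio_buckets:
        free(ca->buckets);
    err_buckets:
        free(ca->free_lists);
    err_free_lists:
        if (err)
            fprintf(stderr, "error: %s\n", err);
        return -1;
    }

    int main(void)
    {
        struct cache_like ca;

        if (cache_like_alloc(&ca))
            return 1;

        free(ca.prio_buckets);
        free(ca.buckets);
        free(ca.free_lists);
        return 0;
    }
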
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 26f035a..627dcea 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,7 +16,9 @@
#include <linux/sort.h>
#include <linux/sched/clock.h>
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
+extern bool bcache_is_reboot;
+
+/* Default is 0 ("writethrough") */
static const char * const bch_cache_modes[] = {
"writethrough",
"writeback",
@@ -25,7 +27,7 @@
NULL
};
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
+/* Default is 0 ("auto") */
static const char * const bch_stop_on_failure_modes[] = {
"auto",
"always",
@@ -67,6 +69,8 @@
read_attribute(btree_written);
read_attribute(metadata_written);
read_attribute(active_journal_entries);
+read_attribute(backing_dev_name);
+read_attribute(backing_dev_uuid);
sysfs_time_stats_attribute(btree_gc, sec, ms);
sysfs_time_stats_attribute(btree_split, sec, us);
@@ -82,12 +86,14 @@
read_attribute(state);
read_attribute(cache_read_races);
read_attribute(reclaim);
+read_attribute(reclaimed_journal_buckets);
read_attribute(flush_write);
-read_attribute(retry_flush_write);
read_attribute(writeback_keys_done);
read_attribute(writeback_keys_failed);
read_attribute(io_errors);
read_attribute(congested);
+read_attribute(cutoff_writeback);
+read_attribute(cutoff_writeback_sync);
rw_attribute(congested_read_threshold_us);
rw_attribute(congested_write_threshold_us);
@@ -128,6 +134,7 @@
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
+rw_attribute(gc_after_writeback);
rw_attribute(size);
static ssize_t bch_snprint_string_list(char *buf,
@@ -175,7 +182,7 @@
var_print(writeback_percent);
sysfs_hprint(writeback_rate,
wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
- sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
+ sysfs_printf(io_errors, "%i", atomic_read(&dc->io_errors));
sysfs_printf(io_error_limit, "%i", dc->error_limit);
sysfs_printf(io_disable, "%i", dc->io_disable);
var_print(writeback_rate_update_seconds);
@@ -240,6 +247,19 @@
return strlen(buf);
}
+ if (attr == &sysfs_backing_dev_name) {
+ snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name);
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+ if (attr == &sysfs_backing_dev_uuid) {
+ /* convert binary uuid into 36-byte string plus '\0' */
+ snprintf(buf, 36+1, "%pU", dc->sb.uuid);
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
#undef var
return 0;
}
@@ -253,18 +273,23 @@
struct cache_set *c;
struct kobj_uevent_env *env;
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
#define d_strtoul(var) sysfs_strtoul(var, dc->var)
#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
sysfs_strtoul(data_csum, dc->disk.data_csum);
d_strtoul(verify);
- d_strtoul(bypass_torture_test);
- d_strtoul(writeback_metadata);
- d_strtoul(writeback_running);
- d_strtoul(writeback_delay);
+ sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
+ sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
+ sysfs_strtoul_bool(writeback_running, dc->writeback_running);
+ sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX);
- sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+ sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
+ 0, bch_cutoff_writeback);
if (attr == &sysfs_writeback_rate) {
ssize_t ret;
@@ -283,9 +308,15 @@
sysfs_strtoul_clamp(writeback_rate_update_seconds,
dc->writeback_rate_update_seconds,
1, WRITEBACK_RATE_UPDATE_SECS_MAX);
- d_strtoul(writeback_rate_i_term_inverse);
- d_strtoul_nonzero(writeback_rate_p_term_inverse);
- d_strtoul_nonzero(writeback_rate_minimum);
+ sysfs_strtoul_clamp(writeback_rate_i_term_inverse,
+ dc->writeback_rate_i_term_inverse,
+ 1, UINT_MAX);
+ sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
+ dc->writeback_rate_p_term_inverse,
+ 1, UINT_MAX);
+ sysfs_strtoul_clamp(writeback_rate_minimum,
+ dc->writeback_rate_minimum,
+ 1, UINT_MAX);
sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX);
@@ -295,15 +326,20 @@
dc->io_disable = v ? 1 : 0;
}
- d_strtoi_h(sequential_cutoff);
+ sysfs_strtoul_clamp(sequential_cutoff,
+ dc->sequential_cutoff,
+ 0, UINT_MAX);
d_strtoi_h(readahead);
if (attr == &sysfs_clear_stats)
bch_cache_accounting_clear(&dc->accounting);
if (attr == &sysfs_running &&
- strtoul_or_return(buf))
- bch_cached_dev_run(dc);
+ strtoul_or_return(buf)) {
+ v = bch_cached_dev_run(dc);
+ if (v)
+ return v;
+ }
if (attr == &sysfs_cache_mode) {
v = __sysfs_match_string(bch_cache_modes, -1, buf);
@@ -381,14 +417,40 @@
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
mutex_lock(&bch_register_lock);
size = __cached_dev_store(kobj, attr, buf, size);
- if (attr == &sysfs_writeback_running)
- bch_writeback_queue(dc);
+ if (attr == &sysfs_writeback_running) {
+ /* dc->writeback_running changed in __cached_dev_store() */
+ if (IS_ERR_OR_NULL(dc->writeback_thread)) {
+ /*
+ * Reject setting it to 1 via sysfs if the writeback
+ * kthread has not been created yet.
+ */
+ if (dc->writeback_running) {
+ dc->writeback_running = false;
+ pr_err("%s: failed to run non-existent writeback thread",
+ dc->disk.disk->disk_name);
+ }
+ } else
+ /*
+ * The writeback kthread will check whether
+ * dc->writeback_running is true or false.
+ */
+ bch_writeback_queue(dc);
+ }
+ /*
+ * Only set BCACHE_DEV_WB_RUNNING when the cached device is
+ * attached to a cache set; otherwise it doesn't make sense.
+ */
if (attr == &sysfs_writeback_percent)
- if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+ if ((dc->disk.c != NULL) &&
+ (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)))
schedule_delayed_work(&dc->writeback_rate_update,
dc->writeback_rate_update_seconds * HZ);
@@ -415,7 +477,7 @@
&sysfs_writeback_rate_p_term_inverse,
&sysfs_writeback_rate_minimum,
&sysfs_writeback_rate_debug,
- &sysfs_errors,
+ &sysfs_io_errors,
&sysfs_io_error_limit,
&sysfs_io_disable,
&sysfs_dirty_data,
@@ -431,6 +493,8 @@
&sysfs_verify,
&sysfs_bypass_torture_test,
#endif
+ &sysfs_backing_dev_name,
+ &sysfs_backing_dev_uuid,
NULL
};
KTYPE(bch_cached_dev);
@@ -460,6 +524,10 @@
kobj);
struct uuid_entry *u = &d->c->uuids[d->id];
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
sysfs_strtoul(data_csum, d->data_csum);
if (attr == &sysfs_size) {
@@ -642,12 +710,12 @@
sysfs_print(reclaim,
atomic_long_read(&c->reclaim));
+ sysfs_print(reclaimed_journal_buckets,
+ atomic_long_read(&c->reclaimed_journal_buckets));
+
sysfs_print(flush_write,
atomic_long_read(&c->flush_write));
- sysfs_print(retry_flush_write,
- atomic_long_read(&c->retry_flush_write));
-
sysfs_print(writeback_keys_done,
atomic_long_read(&c->writeback_keys_done));
sysfs_print(writeback_keys_failed,
@@ -668,6 +736,9 @@
sysfs_print(congested_write_threshold_us,
c->congested_write_threshold_us);
+ sysfs_print(cutoff_writeback, bch_cutoff_writeback);
+ sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
+
sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
sysfs_printf(verify, "%i", c->verify);
sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
@@ -676,6 +747,7 @@
sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
sysfs_printf(io_disable, "%i",
test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -691,6 +763,10 @@
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
ssize_t v;
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
if (attr == &sysfs_unregister)
bch_cache_set_unregister(c);
@@ -725,21 +801,8 @@
bch_cache_accounting_clear(&c->accounting);
}
- if (attr == &sysfs_trigger_gc) {
- /*
- * Garbage collection thread only works when sectors_to_gc < 0,
- * when users write to sysfs entry trigger_gc, most of time
- * they want to forcibly triger gargage collection. Here -1 is
- * set to c->sectors_to_gc, to make gc_should_run() give a
- * chance to permit gc thread to run. "give a chance" means
- * before going into gc_should_run(), there is still chance
- * that c->sectors_to_gc being set to other positive value. So
- * writing sysfs entry trigger_gc won't always make sure gc
- * thread takes effect.
- */
- atomic_set(&c->sectors_to_gc, -1);
- wake_up_gc(c);
- }
+ if (attr == &sysfs_trigger_gc)
+ force_wake_up_gc(c);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
@@ -749,10 +812,12 @@
c->shrink.scan_objects(&c->shrink, &sc);
}
- sysfs_strtoul(congested_read_threshold_us,
- c->congested_read_threshold_us);
- sysfs_strtoul(congested_write_threshold_us,
- c->congested_write_threshold_us);
+ sysfs_strtoul_clamp(congested_read_threshold_us,
+ c->congested_read_threshold_us,
+ 0, UINT_MAX);
+ sysfs_strtoul_clamp(congested_write_threshold_us,
+ c->congested_write_threshold_us,
+ 0, UINT_MAX);
if (attr == &sysfs_errors) {
v = __sysfs_match_string(error_actions, -1, buf);
@@ -762,12 +827,20 @@
c->on_error = v;
}
- if (attr == &sysfs_io_error_limit)
- c->error_limit = strtoul_or_return(buf);
+ sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX);
/* See count_io_errors() for why 88 */
- if (attr == &sysfs_io_error_halflife)
- c->error_decay = strtoul_or_return(buf) / 88;
+ if (attr == &sysfs_io_error_halflife) {
+ unsigned long v = 0;
+ ssize_t ret;
+
+ ret = strtoul_safe_clamp(buf, v, 0, UINT_MAX);
+ if (!ret) {
+ c->error_decay = v / 88;
+ return size;
+ }
+ return ret;
+ }
if (attr == &sysfs_io_disable) {
v = strtoul_or_return(buf);
@@ -782,13 +855,21 @@
}
}
- sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
- sysfs_strtoul(verify, c->verify);
- sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
+ sysfs_strtoul_clamp(journal_delay_ms,
+ c->journal_delay_ms,
+ 0, USHRT_MAX);
+ sysfs_strtoul_bool(verify, c->verify);
+ sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled);
sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks);
- sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
- sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
- sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
+ sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite);
+ sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
+ sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled);
+ /*
+ * Writing gc_after_writeback here may overwrite an already set
+ * BCH_DO_AUTO_GC; that doesn't matter, because the flag will be
+ * set again at the next opportunity.
+ */
+ sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
return size;
}
@@ -805,6 +886,10 @@
{
struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
return bch_cache_set_store(&c->kobj, attr, buf, size);
}
@@ -854,8 +939,8 @@
&sysfs_bset_tree_stats,
&sysfs_cache_read_races,
&sysfs_reclaim,
+ &sysfs_reclaimed_journal_buckets,
&sysfs_flush_write,
- &sysfs_retry_flush_write,
&sysfs_writeback_keys_done,
&sysfs_writeback_keys_failed,
@@ -869,13 +954,17 @@
&sysfs_gc_always_rewrite,
&sysfs_btree_shrinker_disabled,
&sysfs_copy_gc_enabled,
+ &sysfs_gc_after_writeback,
&sysfs_io_disable,
+ &sysfs_cutoff_writeback,
+ &sysfs_cutoff_writeback_sync,
NULL
};
KTYPE(bch_cache_set_internal);
static int __bch_cache_cmp(const void *l, const void *r)
{
+ cond_resched();
return *((uint16_t *)r) - *((uint16_t *)l);
}
@@ -938,8 +1027,6 @@
!cached[n - 1])
--n;
- unused = ca->sb.nbuckets - n;
-
while (cached < p + n &&
*cached == BTREE_PRIO)
cached++, n--;
@@ -989,6 +1076,10 @@
struct cache *ca = container_of(kobj, struct cache, kobj);
ssize_t v;
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
index 3fe8242..215df32 100644
--- a/drivers/md/bcache/sysfs.h
+++ b/drivers/md/bcache/sysfs.h
@@ -79,11 +79,28 @@
return strtoul_safe(buf, var) ?: (ssize_t) size; \
} while (0)
+#define sysfs_strtoul_bool(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) { \
+ unsigned long v = strtoul_or_return(buf); \
+ \
+ var = v ? 1 : 0; \
+ return size; \
+ } \
+} while (0)
+
#define sysfs_strtoul_clamp(file, var, min, max) \
do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe_clamp(buf, var, min, max) \
- ?: (ssize_t) size; \
+ if (attr == &sysfs_ ## file) { \
+ unsigned long v = 0; \
+ ssize_t ret; \
+ ret = strtoul_safe_clamp(buf, v, min, max); \
+ if (!ret) { \
+ var = v; \
+ return size; \
+ } \
+ return ret; \
+ } \
} while (0)
#define strtoul_or_return(cp) \
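The reworked sysfs_strtoul_clamp() parses into a local variable first, so the target field is only updated when conversion and clamping both succeed, and the new sysfs_strtoul_bool() normalizes any non-zero input to 1. A rough user-space approximation of both behaviours (plain strtoul() stands in for the kernel's strtoul_safe_clamp()/strtoul_or_return(); variable names are illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Rough stand-in for the kernel's strtoul_safe_clamp(): parse, then clamp. */
static int parse_clamped(const char *buf, unsigned long *out,
			 unsigned long min, unsigned long max)
{
	char *end;
	unsigned long v;

	errno = 0;
	v = strtoul(buf, &end, 10);
	if (errno || end == buf)
		return -EINVAL;
	if (v < min)
		v = min;
	if (v > max)
		v = max;
	*out = v;
	return 0;
}

int main(void)
{
	unsigned int writeback_percent = 10;	/* existing value */
	unsigned long v = 0;

	/* Like sysfs_strtoul_clamp(writeback_percent, ..., 0, bch_cutoff_writeback)
	 * with the default cutoff of 40: on a parse error the stored value
	 * is left untouched. */
	if (!parse_clamped("garbage", &v, 0, 40))
		writeback_percent = v;
	printf("after bad input : %u\n", writeback_percent);	/* still 10 */

	if (!parse_clamped("95", &v, 0, 40))
		writeback_percent = v;
	printf("after clamping  : %u\n", writeback_percent);	/* 40 */

	/* Like sysfs_strtoul_bool(): any non-zero value becomes 1. */
	unsigned int verify = strtoul("42", NULL, 10) ? 1 : 0;
	printf("bool normalized : %u\n", verify);		/* 1 */
	return 0;
}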
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 20eddea..62fb917 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -270,7 +270,11 @@
int i;
struct bio_vec *bv;
- bio_for_each_segment_all(bv, bio, i) {
+ /*
+ * This is called on a freshly allocated bio, so it is safe to
+ * access the bvec table directly.
+ */
+ for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
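The open-coded loop above keeps the original error handling: on the first failed alloc_page() it walks backwards and frees every page allocated so far. A self-contained user-space sketch of that allocate-or-unwind pattern (hypothetical names; malloc()/free() stand in for alloc_page()/__free_page()):

#include <stdlib.h>

/* Allocate n fixed-size buffers into vec[]; on failure free everything
 * already allocated, mirroring the backwards walk in the hunk above. */
static int alloc_all_or_nothing(void **vec, int n, size_t sz)
{
	void **bv;
	int i;

	for (i = 0, bv = vec; i < n; bv++, i++) {
		*bv = malloc(sz);
		if (!*bv) {
			while (--bv >= vec)
				free(*bv);
			return -1;	/* the kernel code returns -ENOMEM */
		}
	}
	return 0;
}

int main(void)
{
	void *pages[8];

	if (alloc_all_or_nothing(pages, 8, 4096) == 0) {
		for (int i = 0; i < 8; i++)
			free(pages[i]);
	}
	return 0;
}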
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 00aab6a..c029f74 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -113,8 +113,6 @@
#define heap_full(h) ((h)->used == (h)->size)
-#define heap_empty(h) ((h)->used == 0)
-
#define DECLARE_FIFO(type, name) \
struct { \
size_t front, back, size, mask; \
@@ -560,17 +558,29 @@
return crc;
}
-/* Does linear interpolation between powers of two */
+/*
+ * A stepwise-linear pseudo-exponential. This returns 1 << (x >>
+ * fract_bits), with the less-significant bits filled in by linear
+ * interpolation.
+ *
+ * This can also be interpreted as a floating-point number format,
+ * where the low fract_bits are the mantissa (with implicit leading
+ * 1 bit), and the more significant bits are the exponent.
+ * The return value is 1.mantissa * 2^exponent.
+ *
+ * The way this is used, fract_bits is 6 and the largest possible
+ * input is CONGESTED_MAX-1 = 1023 (exponent 16, mantissa 0x1.fc),
+ * so the maximum output is 0x1fc00.
+ */
static inline unsigned int fract_exp_two(unsigned int x,
unsigned int fract_bits)
{
- unsigned int fract = x & ~(~0 << fract_bits);
+ unsigned int mantissa = 1 << fract_bits; /* Implicit bit */
- x >>= fract_bits;
- x = 1 << x;
- x += (x * fract) >> fract_bits;
-
- return x;
+ mantissa += x & (mantissa - 1);
+ x >>= fract_bits; /* The exponent */
+ /* Largest intermediate value 0x7f0000 */
+ return mantissa << x >> fract_bits;
}
void bch_bio_map(struct bio *bio, void *base);
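Since both the old and the new body of fract_exp_two() are visible in the hunk above, a quick user-space check (illustrative only) can confirm they agree over the whole input range bcache feeds them, while the new form keeps the mantissa and exponent explicit:

#include <assert.h>
#include <stdio.h>

/* Previous formulation (as removed above). */
static unsigned int fract_exp_two_old(unsigned int x, unsigned int fract_bits)
{
	unsigned int fract = x & ~(~0U << fract_bits);

	x >>= fract_bits;
	x = 1 << x;
	x += (x * fract) >> fract_bits;
	return x;
}

/* New formulation (as added above). */
static unsigned int fract_exp_two_new(unsigned int x, unsigned int fract_bits)
{
	unsigned int mantissa = 1 << fract_bits;	/* Implicit bit */

	mantissa += x & (mantissa - 1);
	x >>= fract_bits;				/* The exponent */
	return mantissa << x >> fract_bits;
}

int main(void)
{
	unsigned int x;

	/* fract_bits is 6 in bcache; inputs stay below CONGESTED_MAX. */
	for (x = 0; x < 1024; x++)
		assert(fract_exp_two_old(x, 6) == fract_exp_two_new(x, 6));

	/* e.g. x = 200: exponent 3, fraction 8/64 -> 8 + 1 = 9 */
	printf("fract_exp_two(200, 6) = %u\n", fract_exp_two_new(200, 6));
	return 0;
}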
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 08c3a9f..d60268f 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
+static void update_gc_after_writeback(struct cache_set *c)
+{
+ if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+ c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+ return;
+
+ c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
@@ -113,6 +122,9 @@
static bool set_at_max_writeback_rate(struct cache_set *c,
struct cached_dev *dc)
{
+ /* Don't set max writeback rate if gc is running */
+ if (!c->gc_mark_valid)
+ return false;
/*
* Idle_counter is increased everytime when update_writeback_rate() is
* called. If all backing devices attached to the same cache set have
@@ -191,6 +203,7 @@
if (!set_at_max_writeback_rate(c, dc)) {
down_read(&dc->writeback_lock);
__update_writeback_rate(dc);
+ update_gc_after_writeback(c);
up_read(&dc->writeback_lock);
}
}
@@ -689,6 +702,23 @@
up_write(&dc->writeback_lock);
break;
}
+
+ /*
+ * When the dirty data ratio is high (e.g. 50%+), buckets may
+ * be heavily fragmented after writeback finishes, which hurts
+ * subsequent write performance. Users who really care about
+ * write performance may set BCH_ENABLE_AUTO_GC via sysfs; then,
+ * once BCH_DO_AUTO_GC is set, the garbage collection thread is
+ * woken up here. After moving GC, the shrunk btree and the
+ * discarded free-bucket SSD space may help subsequent write
+ * requests.
+ */
+ if (c->gc_after_writeback ==
+ (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
+ c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
+ force_wake_up_gc(c);
+ }
}
up_write(&dc->writeback_lock);
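Combined with update_gc_after_writeback() earlier in this file, gc_after_writeback behaves as a small two-bit state machine: sysfs arms BCH_ENABLE_AUTO_GC, the writeback-rate update path latches BCH_DO_AUTO_GC once c->gc_stats.in_use reaches BCH_AUTO_GC_DIRTY_THRESHOLD, and the writeback thread fires GC here once dirty data drains. A compact user-space model of that flow (illustrative only; force_wake_up_gc() is stubbed, and the in_use argument stands in for c->gc_stats.in_use):

#include <stdio.h>

#define BCH_ENABLE_AUTO_GC		1
#define BCH_DO_AUTO_GC			2
#define BCH_AUTO_GC_DIRTY_THRESHOLD	50

static unsigned char gc_after_writeback;	/* the sysfs-visible byte */

static void force_wake_up_gc(void)		/* stub for the real wakeup */
{
	printf("GC thread woken up\n");
}

/* Mirrors update_gc_after_writeback(): latch the DO bit only if the
 * user enabled auto GC and the cache is dirty enough. */
static void update_gc_after_writeback(unsigned int in_use)
{
	if (gc_after_writeback != BCH_ENABLE_AUTO_GC ||
	    in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
		return;
	gc_after_writeback |= BCH_DO_AUTO_GC;
}

/* Mirrors the check in the writeback thread once dirty data drains. */
static void writeback_finished(void)
{
	if (gc_after_writeback == (BCH_ENABLE_AUTO_GC | BCH_DO_AUTO_GC)) {
		gc_after_writeback &= ~BCH_DO_AUTO_GC;
		force_wake_up_gc();
	}
}

int main(void)
{
	gc_after_writeback = BCH_ENABLE_AUTO_GC; /* echo 1 > .../gc_after_writeback */
	update_gc_after_writeback(60);		 /* 60% in use: DO bit latched   */
	writeback_finished();			 /* -> "GC thread woken up"      */
	return 0;
}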
@@ -708,6 +738,10 @@
}
}
+ if (dc->writeback_write_wq) {
+ flush_workqueue(dc->writeback_write_wq);
+ destroy_workqueue(dc->writeback_write_wq);
+ }
cached_dev_put(dc);
wait_for_kthread_stop();
@@ -777,7 +811,7 @@
bch_keybuf_init(&dc->writeback_keys);
dc->writeback_metadata = true;
- dc->writeback_running = true;
+ dc->writeback_running = false;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -803,8 +837,10 @@
"bcache_writeback");
if (IS_ERR(dc->writeback_thread)) {
cached_dev_put(dc);
+ destroy_workqueue(dc->writeback_write_wq);
return PTR_ERR(dc->writeback_thread);
}
+ dc->writeback_running = true;
WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
schedule_delayed_work(&dc->writeback_rate_update,
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index d2b9fdb..4e4c681 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,12 +5,17 @@
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
+#define CUTOFF_WRITEBACK_MAX 70
+#define CUTOFF_WRITEBACK_SYNC_MAX 90
+
#define MAX_WRITEBACKS_IN_PASS 5
#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
#define WRITEBACK_RATE_UPDATE_SECS_MAX 60
#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5
+#define BCH_AUTO_GC_DIRTY_THRESHOLD 50
+
/*
* 14 (16384ths) is chosen here as something that each backing device
* should be a reasonable fraction of the share, and not to blow up
@@ -53,6 +58,9 @@
}
}
+extern unsigned int bch_cutoff_writeback;
+extern unsigned int bch_cutoff_writeback_sync;
+
static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
unsigned int cache_mode, bool would_skip)
{
@@ -60,7 +68,10 @@
if (cache_mode != CACHE_MODE_WRITEBACK ||
test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
- in_use > CUTOFF_WRITEBACK_SYNC)
+ in_use > bch_cutoff_writeback_sync)
+ return false;
+
+ if (bio_op(bio) == REQ_OP_DISCARD)
return false;
if (dc->partial_stripes_expensive &&
@@ -73,7 +84,7 @@
return (op_is_sync(bio->bi_opf) ||
bio->bi_opf & (REQ_META|REQ_PRIO) ||
- in_use <= CUTOFF_WRITEBACK);
+ in_use <= bch_cutoff_writeback);
}
static inline void bch_writeback_queue(struct cached_dev *dc)
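With the cutoffs now module parameters, should_writeback() reduces to: never write back above bch_cutoff_writeback_sync or for discards; below bch_cutoff_writeback always write back; in between, only sync/REQ_META/REQ_PRIO bios (plus the full-stripe special case). A condensed user-space sketch of that decision, assuming writeback cache mode and omitting the detaching and partial-stripe checks (illustrative only):

#include <stdbool.h>
#include <stdio.h>

static unsigned int bch_cutoff_writeback = 40;		/* module param default */
static unsigned int bch_cutoff_writeback_sync = 70;	/* module param default */

/* Condensed decision: in_use is the cache's percentage in use, is_sync
 * covers op_is_sync() as well as REQ_META/REQ_PRIO bios, is_discard
 * covers REQ_OP_DISCARD. Stripe and detaching checks are omitted. */
static bool should_writeback(unsigned int in_use, bool is_sync, bool is_discard)
{
	if (in_use > bch_cutoff_writeback_sync || is_discard)
		return false;			/* too dirty, or a discard */
	return is_sync || in_use <= bch_cutoff_writeback;
}

int main(void)
{
	printf("%d\n", should_writeback(30, false, false));	/* 1: below soft cutoff */
	printf("%d\n", should_writeback(55, false, false));	/* 0: only sync I/O now */
	printf("%d\n", should_writeback(55, true,  false));	/* 1: sync still cached */
	printf("%d\n", should_writeback(80, true,  false));	/* 0: above hard cutoff */
	printf("%d\n", should_writeback(30, true,  true));	/* 0: discards bypass   */
	return 0;
}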