Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index cd2d5b9..2e8e6f9 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config CEPH_LIB
tristate "Ceph core library"
depends on INET
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index db09def..59d0ba2 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_CEPH_LIB) += libceph.o
libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
- mon_client.o \
+ mon_client.o decode.o \
cls_lock_client.o \
osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
striper.o \
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 87afb9e..2d56824 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/ceph/ceph_debug.h>
#include <linux/backing-dev.h>
@@ -12,6 +13,7 @@
#include <linux/nsproxy.h>
#include <linux/parser.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/statfs.h>
@@ -184,18 +186,34 @@
}
EXPORT_SYMBOL(ceph_compare_options);
+/*
+ * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are
+ * compatible with (a superset of) GFP_KERNEL. This is because while the
+ * actual pages are allocated with the specified flags, the page table pages
+ * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take
+ * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc().
+ *
+ * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO.
+ */
void *ceph_kvmalloc(size_t size, gfp_t flags)
{
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *ptr = kmalloc(size, flags | __GFP_NOWARN);
- if (ptr)
- return ptr;
+ void *p;
+
+ if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { /* both __GFP_IO and __GFP_FS set: caller's flags are passed through unchanged */
+ p = kvmalloc(size, flags);
+ } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { /* __GFP_IO set, __GFP_FS clear: GFP_KERNEL between memalloc_nofs_save()/restore() */
+ unsigned int nofs_flag = memalloc_nofs_save();
+ p = kvmalloc(size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ } else { /* __GFP_IO clear: GFP_KERNEL between memalloc_noio_save()/restore() */
+ unsigned int noio_flag = memalloc_noio_save();
+ p = kvmalloc(size, GFP_KERNEL);
+ memalloc_noio_restore(noio_flag);
}
- return __vmalloc(size, flags, PAGE_KERNEL);
+ return p; /* whatever kvmalloc() returned in the branch taken */
}
-
static int parse_fsid(const char *str, struct ceph_fsid *fsid)
{
int i = 0;
@@ -255,6 +273,7 @@
Opt_nocephx_sign_messages,
Opt_tcp_nodelay,
Opt_notcp_nodelay,
+ Opt_abort_on_full,
};
static match_table_t opt_tokens = {
@@ -280,6 +299,7 @@
{Opt_nocephx_sign_messages, "nocephx_sign_messages"},
{Opt_tcp_nodelay, "tcp_nodelay"},
{Opt_notcp_nodelay, "notcp_nodelay"},
+ {Opt_abort_on_full, "abort_on_full"},
{-1, NULL}
};
@@ -535,6 +555,10 @@
opt->flags &= ~CEPH_OPT_TCP_NODELAY;
break;
+ case Opt_abort_on_full:
+ opt->flags |= CEPH_OPT_ABORT_ON_FULL;
+ break;
+
default:
BUG_ON(token);
}
@@ -549,7 +573,8 @@
}
EXPORT_SYMBOL(ceph_parse_options);
-int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
+ bool show_all)
{
struct ceph_options *opt = client->options;
size_t pos = m->count;
@@ -574,6 +599,8 @@
seq_puts(m, "nocephx_sign_messages,");
if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
seq_puts(m, "notcp_nodelay,");
+ if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL))
+ seq_puts(m, "abort_on_full,");
if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
seq_printf(m, "mount_timeout=%d,",
@@ -684,6 +711,14 @@
}
EXPORT_SYMBOL(ceph_destroy_client);
+void ceph_reset_client_addr(struct ceph_client *client)
+{ /* invokes the reset/reopen helpers on client's msgr, monc and osdc, in that order */
+ ceph_messenger_reset_nonce(&client->msgr);
+ ceph_monc_reopen_session(&client->monc);
+ ceph_osdc_reopen_osds(&client->osdc);
+}
+EXPORT_SYMBOL(ceph_reset_client_addr);
+
/*
* true if we have the mon map (and have thus joined the cluster)
*/
@@ -729,7 +764,6 @@
}
EXPORT_SYMBOL(__ceph_open_session);
-
int ceph_open_session(struct ceph_client *client)
{
int ret;
@@ -745,14 +779,29 @@
}
EXPORT_SYMBOL(ceph_open_session);
+int ceph_wait_for_latest_osdmap(struct ceph_client *client,
+ unsigned long timeout)
+{ /* returns 0 if the local osdmap is already current, else the result of the helper that failed or waited */
+ u64 newest_epoch;
+ int ret;
+
+ ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); /* fills newest_epoch with the "osdmap" version */
+ if (ret)
+ return ret; /* nonzero result is handed back unchanged */
+
+ if (client->osdc.osdmap->epoch >= newest_epoch)
+ return 0; /* local map is at or past newest_epoch: nothing to wait for */
+
+ ceph_osdc_maybe_request_map(&client->osdc);
+ return ceph_monc_wait_osdmap(&client->monc, newest_epoch, timeout); /* timeout is forwarded as-is; its units are not visible here */
+}
+EXPORT_SYMBOL(ceph_wait_for_latest_osdmap);
static int __init init_ceph_lib(void)
{
int ret = 0;
- ret = ceph_debugfs_init();
- if (ret < 0)
- goto out;
+ ceph_debugfs_init();
ret = ceph_crypto_init();
if (ret < 0)
@@ -777,7 +826,6 @@
ceph_crypto_shutdown();
out_debugfs:
ceph_debugfs_cleanup();
-out:
return ret;
}
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index 2105a6e..17447c1 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -6,6 +6,7 @@
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
/**
* ceph_cls_lock - grab rados lock for object
@@ -264,14 +265,17 @@
return ret;
*p += sizeof(struct ceph_timespec); /* skip expiration */
- ceph_decode_copy(p, &locker->info.addr, sizeof(locker->info.addr));
- ceph_decode_addr(&locker->info.addr);
+
+ ret = ceph_decode_entity_addr(p, end, &locker->info.addr);
+ if (ret)
+ return ret;
+
len = ceph_decode_32(p);
*p += len; /* skip description */
dout("%s %s%llu cookie %s addr %s\n", __func__,
ENTITY_NAME(locker->id.name), locker->id.cookie,
- ceph_pr_addr(&locker->info.addr.in_addr));
+ ceph_pr_addr(&locker->info.addr));
return 0;
}
@@ -360,7 +364,7 @@
dout("%s lock_name %s\n", __func__, lock_name);
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
CEPH_OSD_FLAG_READ, get_info_op_page,
- get_info_op_buf_size, reply_page, &reply_len);
+ get_info_op_buf_size, &reply_page, &reply_len);
dout("%s: status %d\n", __func__, ret);
if (ret >= 0) {
@@ -375,3 +379,47 @@
return ret;
}
EXPORT_SYMBOL(ceph_cls_lock_info);
+
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+ char *lock_name, u8 type, char *cookie, char *tag)
+{ /* sets up op 'which' of req as a "lock"/"assert_locked" class call carrying lock_name, type, cookie and tag; returns 0 or an error */
+ int assert_op_buf_size;
+ int name_len = strlen(lock_name);
+ int cookie_len = strlen(cookie);
+ int tag_len = strlen(tag);
+ struct page **pages;
+ void *p, *end;
+ int ret;
+
+ assert_op_buf_size = name_len + sizeof(__le32) + /* each string is budgeted its length plus one __le32 */
+ cookie_len + sizeof(__le32) +
+ tag_len + sizeof(__le32) +
+ sizeof(u8) + CEPH_ENCODING_START_BLK_LEN; /* plus the type byte and the encoding start block */
+ if (assert_op_buf_size > PAGE_SIZE) /* the payload must fit in the single page allocated below */
+ return -E2BIG;
+
+ ret = osd_req_op_cls_init(req, which, "lock", "assert_locked");
+ if (ret)
+ return ret;
+
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ p = page_address(pages[0]);
+ end = p + assert_op_buf_size;
+
+ /* encode cls_lock_assert_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ assert_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_8(&p, type);
+ ceph_encode_string(&p, end, cookie, cookie_len);
+ ceph_encode_string(&p, end, tag, tag_len);
+ WARN_ON(p != end); /* the encoders should have consumed exactly assert_op_buf_size bytes */
+
+ osd_req_op_cls_request_data_pages(req, which, pages, assert_op_buf_size, /* pages is handed to the request here; this function does not free it */
+ 0, false, true);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_cls_assert_locked);
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 02172c4..4f75df4 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -46,9 +46,9 @@
goto fail;
}
- /* crypto_alloc_skcipher() allocates with GFP_KERNEL */
+ /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */
noio_flag = memalloc_noio_save();
- key->tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+ key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0);
memalloc_noio_restore(noio_flag);
if (IS_ERR(key->tfm)) {
ret = PTR_ERR(key->tfm);
@@ -56,7 +56,7 @@
goto fail;
}
- ret = crypto_skcipher_setkey(key->tfm, key->key, key->len);
+ ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len);
if (ret)
goto fail;
@@ -136,8 +136,10 @@
if (key) {
kfree(key->key);
key->key = NULL;
- crypto_free_skcipher(key->tfm);
- key->tfm = NULL;
+ if (key->tfm) {
+ crypto_free_sync_skcipher(key->tfm);
+ key->tfm = NULL;
+ }
}
}
@@ -216,7 +218,7 @@
static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt,
void *buf, int buf_len, int in_len, int *pout_len)
{
- SKCIPHER_REQUEST_ON_STACK(req, key->tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm);
struct sg_table sgt;
struct scatterlist prealloc_sg;
char iv[AES_BLOCK_SIZE] __aligned(8);
@@ -232,7 +234,7 @@
return ret;
memcpy(iv, aes_iv, AES_BLOCK_SIZE);
- skcipher_request_set_tfm(req, key->tfm);
+ skcipher_request_set_sync_tfm(req, key->tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv);
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index bb45c7d..96ef4d8 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -13,7 +13,7 @@
struct ceph_timespec created;
int len;
void *key;
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
};
int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 0295260..7cb992e 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -46,7 +46,7 @@
seq_printf(s, "\t%s%lld\t%s\n",
ENTITY_NAME(inst->name),
- ceph_pr_addr(&inst->addr.in_addr));
+ ceph_pr_addr(&inst->addr));
}
return 0;
}
@@ -82,7 +82,7 @@
char sb[64];
seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
- i, ceph_pr_addr(&addr->in_addr),
+ i, ceph_pr_addr(addr),
((map->osd_weight[i]*100) >> 16),
ceph_osdmap_state_str(sb, sizeof(sb), state),
((ceph_get_primary_affinity(map, i)*100) >> 16));
@@ -375,7 +375,7 @@
struct ceph_client *client = s->private;
int ret;
- ret = ceph_print_client_options(s, client);
+ ret = ceph_print_client_options(s, client, true);
if (ret)
return ret;
@@ -389,12 +389,9 @@
CEPH_DEFINE_SHOW_FUNC(osdc_show)
CEPH_DEFINE_SHOW_FUNC(client_options_show)
-int __init ceph_debugfs_init(void)
+void __init ceph_debugfs_init(void) /* no return value: the debugfs_create_dir() result is stored without being checked */
{
ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
- if (!ceph_debugfs_dir)
- return -ENOMEM;
- return 0;
}
void ceph_debugfs_cleanup(void)
@@ -402,9 +399,8 @@
debugfs_remove(ceph_debugfs_dir);
}
-int ceph_debugfs_client_init(struct ceph_client *client)
+void ceph_debugfs_client_init(struct ceph_client *client)
{
- int ret = -ENOMEM;
char name[80];
snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
@@ -412,56 +408,37 @@
dout("ceph_debugfs_client_init %p %s\n", client, name);
- BUG_ON(client->debugfs_dir);
client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
- if (!client->debugfs_dir)
- goto out;
client->monc.debugfs_file = debugfs_create_file("monc",
0400,
client->debugfs_dir,
client,
&monc_show_fops);
- if (!client->monc.debugfs_file)
- goto out;
client->osdc.debugfs_file = debugfs_create_file("osdc",
0400,
client->debugfs_dir,
client,
&osdc_show_fops);
- if (!client->osdc.debugfs_file)
- goto out;
client->debugfs_monmap = debugfs_create_file("monmap",
0400,
client->debugfs_dir,
client,
&monmap_show_fops);
- if (!client->debugfs_monmap)
- goto out;
client->debugfs_osdmap = debugfs_create_file("osdmap",
0400,
client->debugfs_dir,
client,
&osdmap_show_fops);
- if (!client->debugfs_osdmap)
- goto out;
client->debugfs_options = debugfs_create_file("client_options",
0400,
client->debugfs_dir,
client,
&client_options_show_fops);
- if (!client->debugfs_options)
- goto out;
-
- return 0;
-
-out:
- ceph_debugfs_client_cleanup(client);
- return ret;
}
void ceph_debugfs_client_cleanup(struct ceph_client *client)
@@ -477,18 +454,16 @@
#else /* CONFIG_DEBUG_FS */
-int __init ceph_debugfs_init(void)
+void __init ceph_debugfs_init(void)
{
- return 0;
}
void ceph_debugfs_cleanup(void)
{
}
-int ceph_debugfs_client_init(struct ceph_client *client)
+void ceph_debugfs_client_init(struct ceph_client *client)
{
- return 0;
}
void ceph_debugfs_client_cleanup(struct ceph_client *client)
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
new file mode 100644
index 0000000..eea5295
--- /dev/null
+++ b/net/ceph/decode.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/decode.h>
+
+static int
+ceph_decode_entity_addr_versioned(void **p, void *end,
+ struct ceph_entity_addr *addr)
+{ /* decodes an "entity_addr_t" that begins with a ceph_start_decoding() header into *addr; returns 0, -EINVAL, or the header-decoding error */
+ int ret;
+ u8 struct_v;
+ u32 struct_len, addr_len;
+ void *struct_end;
+
+ ret = ceph_start_decoding(p, end, 1, "entity_addr_t", &struct_v,
+ &struct_len);
+ if (ret)
+ goto bad;
+
+ ret = -EINVAL; /* result for every exit through bad: from here on */
+ struct_end = *p + struct_len; /* end of this struct as stated by its header */
+
+ ceph_decode_copy_safe(p, end, &addr->type, sizeof(addr->type), bad);
+
+ ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+
+ ceph_decode_32_safe(p, end, addr_len, bad);
+ if (addr_len > sizeof(addr->in_addr)) /* reject a length that would not fit in in_addr */
+ goto bad;
+
+ memset(&addr->in_addr, 0, sizeof(addr->in_addr)); /* zero first so bytes beyond addr_len stay cleared */
+ if (addr_len) {
+ ceph_decode_copy_safe(p, end, &addr->in_addr, addr_len, bad);
+
+ addr->in_addr.ss_family = /* ss_family is converted from little-endian in place */
+ le16_to_cpu((__force __le16)addr->in_addr.ss_family);
+ }
+
+ /* Advance past anything the client doesn't yet understand */
+ *p = struct_end;
+ ret = 0;
+bad:
+ return ret;
+}
+
+static int
+ceph_decode_entity_addr_legacy(void **p, void *end,
+ struct ceph_entity_addr *addr)
+{ /* decodes the fixed-size form: 3 skipped bytes, the nonce, then a full-size in_addr; returns 0 or -EINVAL */
+ int ret = -EINVAL;
+
+ /* Skip rest of type field */
+ ceph_decode_skip_n(p, end, 3, bad);
+
+ /*
+ * Clients that don't support ADDR2 always send TYPE_NONE, change it
+ * to TYPE_LEGACY for forward compatibility.
+ */
+ addr->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+ memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+ ceph_decode_copy_safe(p, end, &addr->in_addr,
+ sizeof(addr->in_addr), bad);
+ addr->in_addr.ss_family = /* ss_family is converted from big-endian in place */
+ be16_to_cpu((__force __be16)addr->in_addr.ss_family);
+ ret = 0;
+bad:
+ return ret;
+}
+
+int
+ceph_decode_entity_addr(void **p, void *end, struct ceph_entity_addr *addr)
+{ /* reads a one-byte marker and dispatches: 1 -> versioned decoder, 0 -> legacy decoder, anything else -> -EINVAL */
+ u8 marker;
+
+ ceph_decode_8_safe(p, end, marker, bad);
+ if (marker == 1)
+ return ceph_decode_entity_addr_versioned(p, end, addr);
+ else if (marker == 0)
+ return ceph_decode_entity_addr_legacy(p, end, addr);
+bad: /* also reached by fall-through when marker is neither 0 nor 1 */
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addr);
+
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9a1c27c..e4cb3db 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -156,7 +156,6 @@
/* Slab caches for frequently-allocated structures */
static struct kmem_cache *ceph_msg_cache;
-static struct kmem_cache *ceph_msg_data_cache;
/* static tag bytes (protocol control messages) */
static char tag_msg = CEPH_MSGR_TAG_MSG;
@@ -187,30 +186,33 @@
static struct page *zero_page; /* used in certain error cases */
-const char *ceph_pr_addr(const struct sockaddr_storage *ss)
+const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
{
int i;
char *s;
- struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
- struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
+ struct sockaddr_storage ss = addr->in_addr; /* align */
+ struct sockaddr_in *in4 = (struct sockaddr_in *)&ss;
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss;
i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
s = addr_str[i];
- switch (ss->ss_family) {
+ switch (ss.ss_family) {
case AF_INET:
- snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
+ snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu",
+ le32_to_cpu(addr->type), &in4->sin_addr,
ntohs(in4->sin_port));
break;
case AF_INET6:
- snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
+ snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu",
+ le32_to_cpu(addr->type), &in6->sin6_addr,
ntohs(in6->sin6_port));
break;
default:
snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
- ss->ss_family);
+ ss.ss_family);
}
return s;
@@ -220,7 +222,7 @@
static void encode_my_addr(struct ceph_messenger *msgr)
{
memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
- ceph_encode_addr(&msgr->my_enc_addr);
+ ceph_encode_banner_addr(&msgr->my_enc_addr); /* applied to the copy made above; inst.addr is not passed to the encoder */
}
/*
@@ -235,23 +237,11 @@
if (!ceph_msg_cache)
return -ENOMEM;
- BUG_ON(ceph_msg_data_cache);
- ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0);
- if (ceph_msg_data_cache)
- return 0;
-
- kmem_cache_destroy(ceph_msg_cache);
- ceph_msg_cache = NULL;
-
- return -ENOMEM;
+ return 0;
}
static void ceph_msgr_slab_exit(void)
{
- BUG_ON(!ceph_msg_data_cache);
- kmem_cache_destroy(ceph_msg_data_cache);
- ceph_msg_data_cache = NULL;
-
BUG_ON(!ceph_msg_cache);
kmem_cache_destroy(ceph_msg_cache);
ceph_msg_cache = NULL;
@@ -462,7 +452,7 @@
*/
static int ceph_tcp_connect(struct ceph_connection *con)
{
- struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
+ struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */
struct socket *sock;
unsigned int noio_flag;
int ret;
@@ -471,7 +461,7 @@
/* sock_create_kern() allocates with GFP_KERNEL */
noio_flag = memalloc_noio_save();
- ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family,
+ ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family,
SOCK_STREAM, IPPROTO_TCP, &sock);
memalloc_noio_restore(noio_flag);
if (ret)
@@ -484,18 +474,18 @@
set_sock_callbacks(sock, con);
- dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
+ dout("connect %s\n", ceph_pr_addr(&con->peer_addr));
con_sock_state_connecting(con);
- ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+ ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss),
O_NONBLOCK);
if (ret == -EINPROGRESS) {
dout("connect %s EINPROGRESS sk_state = %u\n",
- ceph_pr_addr(&con->peer_addr.in_addr),
+ ceph_pr_addr(&con->peer_addr),
sock->sk->sk_state);
} else if (ret < 0) {
pr_err("connect %s error %d\n",
- ceph_pr_addr(&con->peer_addr.in_addr), ret);
+ ceph_pr_addr(&con->peer_addr), ret);
sock_release(sock);
return ret;
}
@@ -526,7 +516,7 @@
if (!buf)
msg.msg_flags |= MSG_TRUNC;
- iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len);
+ iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len);
r = sock_recvmsg(sock, &msg, msg.msg_flags);
if (r == -EAGAIN)
r = 0;
@@ -545,7 +535,7 @@
int r;
BUG_ON(page_offset + length > PAGE_SIZE);
- iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, length);
+ iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length);
r = sock_recvmsg(sock, &msg, msg.msg_flags);
if (r == -EAGAIN)
r = 0;
@@ -557,7 +547,7 @@
* shortly.
*/
static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
- size_t kvlen, size_t len, int more)
+ size_t kvlen, size_t len, bool more)
{
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
int r;
@@ -573,24 +563,15 @@
return r;
}
-static int __ceph_tcp_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, bool more)
-{
- int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
- int ret;
-
- ret = kernel_sendpage(sock, page, offset, size, flags);
- if (ret == -EAGAIN)
- ret = 0;
-
- return ret;
-}
-
+/*
+ * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
+ */
static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, bool more)
+ int offset, size_t size, int more)
{
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- struct bio_vec bvec;
+ ssize_t (*sendpage)(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags);
+ int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
int ret;
/*
@@ -602,19 +583,11 @@
* triggers one of hardened usercopy checks.
*/
if (page_count(page) >= 1 && !PageSlab(page))
- return __ceph_tcp_sendpage(sock, page, offset, size, more);
-
- bvec.bv_page = page;
- bvec.bv_offset = offset;
- bvec.bv_len = size;
-
- if (more)
- msg.msg_flags |= MSG_MORE;
+ sendpage = sock->ops->sendpage;
else
- msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+ sendpage = sock_no_sendpage;
- iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size);
- ret = sock_sendmsg(sock, &msg);
+ ret = sendpage(sock, page, offset, size, flags);
if (ret == -EAGAIN)
ret = 0;
@@ -699,8 +672,7 @@
void ceph_con_close(struct ceph_connection *con)
{
mutex_lock(&con->mutex);
- dout("con_close %p peer %s\n", con,
- ceph_pr_addr(&con->peer_addr.in_addr));
+ dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
con->state = CON_STATE_CLOSED;
con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */
@@ -724,7 +696,7 @@
struct ceph_entity_addr *addr)
{
mutex_lock(&con->mutex);
- dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+ dout("con_open %p %s\n", con, ceph_pr_addr(addr));
WARN_ON(con->state != CON_STATE_CLOSED);
con->state = CON_STATE_PREOPEN;
@@ -870,6 +842,7 @@
size_t bytes)
{
struct ceph_bio_iter *it = &cursor->bio_iter;
+ struct page *page = bio_iter_page(it->bio, it->iter);
BUG_ON(bytes > cursor->resid);
BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
@@ -881,7 +854,8 @@
return false; /* no more data */
}
- if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done))
+ if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done &&
+ page == bio_iter_page(it->bio, it->iter)))
return false; /* more bytes to process in this segment */
if (!it->iter.bi_size) {
@@ -929,6 +903,7 @@
size_t bytes)
{
struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs;
+ struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter);
BUG_ON(bytes > cursor->resid);
BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter));
@@ -940,7 +915,8 @@
return false; /* no more data */
}
- if (!bytes || cursor->bvec_iter.bi_bvec_done)
+ if (!bytes || (cursor->bvec_iter.bi_bvec_done &&
+ page == bvec_iter_page(bvecs, cursor->bvec_iter)))
return false; /* more bytes to process in this segment */
BUG_ON(cursor->last_piece);
@@ -1147,16 +1123,13 @@
static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
{
struct ceph_msg_data_cursor *cursor = &msg->cursor;
- struct ceph_msg_data *data;
BUG_ON(!length);
BUG_ON(length > msg->data_length);
- BUG_ON(list_empty(&msg->data));
+ BUG_ON(!msg->num_data_items);
- cursor->data_head = &msg->data;
cursor->total_resid = length;
- data = list_first_entry(&msg->data, struct ceph_msg_data, links);
- cursor->data = data;
+ cursor->data = msg->data;
__ceph_msg_data_cursor_init(cursor);
}
@@ -1237,8 +1210,7 @@
if (!cursor->resid && cursor->total_resid) {
WARN_ON(!cursor->last_piece);
- BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
- cursor->data = list_next_entry(cursor->data, links);
+ cursor->data++;
__ceph_msg_data_cursor_init(cursor);
new_piece = true;
}
@@ -1254,9 +1226,6 @@
static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
{
- BUG_ON(!msg);
- BUG_ON(!data_len);
-
/* Initialize data cursor */
ceph_msg_data_cursor_init(msg, (size_t)data_len);
@@ -1592,11 +1561,12 @@
struct ceph_msg *msg = con->out_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor;
bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
u32 crc;
dout("%s %p msg %p\n", __func__, con, msg);
- if (list_empty(&msg->data))
+ if (!msg->num_data_items)
return -EINVAL;
/*
@@ -1612,7 +1582,6 @@
struct page *page;
size_t page_offset;
size_t length;
- bool last_piece;
int ret;
if (!cursor->resid) {
@@ -1620,10 +1589,11 @@
continue;
}
- page = ceph_msg_data_next(cursor, &page_offset, &length,
- &last_piece);
- ret = ceph_tcp_sendpage(con->sock, page, page_offset,
- length, !last_piece);
+ page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+ if (length == cursor->total_resid)
+ more = MSG_MORE;
+ ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
+ more);
if (ret <= 0) {
if (do_datacrc)
msg->footer.data_crc = cpu_to_le32(crc);
@@ -1653,13 +1623,16 @@
*/
static int write_partial_skip(struct ceph_connection *con)
{
+ int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
int ret;
dout("%s %p %d left\n", __func__, con, con->out_skip);
while (con->out_skip > 0) {
size_t size = min(con->out_skip, (int) PAGE_SIZE);
- ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
+ if (size == con->out_skip)
+ more = MSG_MORE;
+ ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more);
if (ret <= 0)
goto out;
con->out_skip -= ret;
@@ -1761,12 +1734,14 @@
ret = read_partial(con, end, size, &con->actual_peer_addr);
if (ret <= 0)
goto out;
+ ceph_decode_banner_addr(&con->actual_peer_addr);
size = sizeof (con->peer_addr_for_me);
end += size;
ret = read_partial(con, end, size, &con->peer_addr_for_me);
if (ret <= 0)
goto out;
+ ceph_decode_banner_addr(&con->peer_addr_for_me);
out:
return ret;
@@ -1817,21 +1792,22 @@
{
if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
pr_err("connect to %s got bad banner\n",
- ceph_pr_addr(&con->peer_addr.in_addr));
+ ceph_pr_addr(&con->peer_addr));
con->error_msg = "protocol error, bad banner";
return -1;
}
return 0;
}
-static bool addr_is_blank(struct sockaddr_storage *ss)
+static bool addr_is_blank(struct ceph_entity_addr *addr)
{
- struct in_addr *addr = &((struct sockaddr_in *)ss)->sin_addr;
- struct in6_addr *addr6 = &((struct sockaddr_in6 *)ss)->sin6_addr;
+ struct sockaddr_storage ss = addr->in_addr; /* align */
+ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr;
+ struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr;
- switch (ss->ss_family) {
+ switch (ss.ss_family) {
case AF_INET:
- return addr->s_addr == htonl(INADDR_ANY);
+ return addr4->s_addr == htonl(INADDR_ANY);
case AF_INET6:
return ipv6_addr_any(addr6);
default:
@@ -1839,25 +1815,25 @@
}
}
-static int addr_port(struct sockaddr_storage *ss)
+static int addr_port(struct ceph_entity_addr *addr) /* returns the port in host byte order, or 0 for any family other than AF_INET/AF_INET6 */
{
- switch (ss->ss_family) {
+ switch (get_unaligned(&addr->in_addr.ss_family)) { /* get_unaligned(): in_addr is presumably not naturally aligned inside ceph_entity_addr -- confirm against the struct layout */
case AF_INET:
- return ntohs(((struct sockaddr_in *)ss)->sin_port);
+ return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port));
case AF_INET6:
- return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+ return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port));
}
return 0;
}
-static void addr_set_port(struct sockaddr_storage *ss, int p)
+static void addr_set_port(struct ceph_entity_addr *addr, int p) /* stores p in network byte order; any other family leaves addr untouched */
{
- switch (ss->ss_family) {
+ switch (get_unaligned(&addr->in_addr.ss_family)) {
case AF_INET:
- ((struct sockaddr_in *)ss)->sin_port = htons(p);
+ put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port); /* written with put_unaligned(), mirroring the get_unaligned() read of ss_family */
break;
case AF_INET6:
- ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+ put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port);
break;
}
}
@@ -1865,21 +1841,18 @@
/*
* Unlike other *_pton function semantics, zero indicates success.
*/
-static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
+static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr,
char delim, const char **ipend)
{
- struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
- struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
+ memset(&addr->in_addr, 0, sizeof(addr->in_addr));
- memset(ss, 0, sizeof(*ss));
-
- if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) {
- ss->ss_family = AF_INET;
+ if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) {
+ put_unaligned(AF_INET, &addr->in_addr.ss_family);
return 0;
}
- if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) {
- ss->ss_family = AF_INET6;
+ if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) {
+ put_unaligned(AF_INET6, &addr->in_addr.ss_family);
return 0;
}
@@ -1891,7 +1864,7 @@
*/
#ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER
static int ceph_dns_resolve_name(const char *name, size_t namelen,
- struct sockaddr_storage *ss, char delim, const char **ipend)
+ struct ceph_entity_addr *addr, char delim, const char **ipend)
{
const char *end, *delim_p;
char *colon_p, *ip_addr = NULL;
@@ -1918,9 +1891,10 @@
return -EINVAL;
/* do dns_resolve upcall */
- ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL);
+ ip_len = dns_query(current->nsproxy->net_ns,
+ NULL, name, end - name, NULL, &ip_addr, NULL, false);
if (ip_len > 0)
- ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL);
+ ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL);
else
ret = -ESRCH;
@@ -1929,13 +1903,13 @@
*ipend = end;
pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name,
- ret, ret ? "failed" : ceph_pr_addr(ss));
+ ret, ret ? "failed" : ceph_pr_addr(addr));
return ret;
}
#else
static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
- struct sockaddr_storage *ss, char delim, const char **ipend)
+ struct ceph_entity_addr *addr, char delim, const char **ipend)
{
return -EINVAL;
}
@@ -1946,13 +1920,13 @@
* then try to extract a hostname to resolve using userspace DNS upcall.
*/
static int ceph_parse_server_name(const char *name, size_t namelen,
- struct sockaddr_storage *ss, char delim, const char **ipend)
+ struct ceph_entity_addr *addr, char delim, const char **ipend)
{
int ret;
- ret = ceph_pton(name, namelen, ss, delim, ipend);
+ ret = ceph_pton(name, namelen, addr, delim, ipend); /* try ceph_pton() first; a nonzero result triggers the fallback below */
if (ret)
- ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend);
+ ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend); /* fallback; its result replaces the ceph_pton() result */
return ret;
}
@@ -1971,7 +1945,6 @@
dout("parse_ips on '%.*s'\n", (int)(end-c), c);
for (i = 0; i < max_count; i++) {
const char *ipend;
- struct sockaddr_storage *ss = &addr[i].in_addr;
int port;
char delim = ',';
@@ -1980,7 +1953,7 @@
p++;
}
- ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend);
+ ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend);
if (ret)
goto bad;
ret = -EINVAL;
@@ -2011,9 +1984,10 @@
port = CEPH_MON_PORT;
}
- addr_set_port(ss, port);
+ addr_set_port(&addr[i], port);
+ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
- dout("parse_ips got %s\n", ceph_pr_addr(ss));
+ dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
if (p == end)
break;
@@ -2042,9 +2016,6 @@
if (verify_hello(con) < 0)
return -1;
- ceph_decode_addr(&con->actual_peer_addr);
- ceph_decode_addr(&con->peer_addr_for_me);
-
/*
* Make sure the other end is who we wanted. note that the other
* end may not yet know their ip address, so if it's 0.0.0.0, give
@@ -2052,12 +2023,12 @@
*/
if (memcmp(&con->peer_addr, &con->actual_peer_addr,
sizeof(con->peer_addr)) != 0 &&
- !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+ !(addr_is_blank(&con->actual_peer_addr) &&
con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
pr_warn("wrong peer, want %s/%d, got %s/%d\n",
- ceph_pr_addr(&con->peer_addr.in_addr),
+ ceph_pr_addr(&con->peer_addr),
(int)le32_to_cpu(con->peer_addr.nonce),
- ceph_pr_addr(&con->actual_peer_addr.in_addr),
+ ceph_pr_addr(&con->actual_peer_addr),
(int)le32_to_cpu(con->actual_peer_addr.nonce));
con->error_msg = "wrong peer at address";
return -1;
@@ -2066,16 +2037,16 @@
/*
* did we learn our address?
*/
- if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
- int port = addr_port(&con->msgr->inst.addr.in_addr);
+ if (addr_is_blank(&con->msgr->inst.addr)) {
+ int port = addr_port(&con->msgr->inst.addr);
memcpy(&con->msgr->inst.addr.in_addr,
&con->peer_addr_for_me.in_addr,
sizeof(con->peer_addr_for_me.in_addr));
- addr_set_port(&con->msgr->inst.addr.in_addr, port);
+ addr_set_port(&con->msgr->inst.addr, port);
encode_my_addr(con->msgr);
dout("process_banner learned my addr is %s\n",
- ceph_pr_addr(&con->msgr->inst.addr.in_addr));
+ ceph_pr_addr(&con->msgr->inst.addr));
}
return 0;
@@ -2091,6 +2062,8 @@
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
if (con->auth) {
+ int len = le32_to_cpu(con->in_reply.authorizer_len);
+
/*
* Any connection that defines ->get_authorizer()
* should also define ->add_authorizer_challenge() and
@@ -2100,8 +2073,7 @@
*/
if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
ret = con->ops->add_authorizer_challenge(
- con, con->auth->authorizer_reply_buf,
- le32_to_cpu(con->in_reply.authorizer_len));
+ con, con->auth->authorizer_reply_buf, len);
if (ret < 0)
return ret;
@@ -2111,10 +2083,12 @@
return 0;
}
- ret = con->ops->verify_authorizer_reply(con);
- if (ret < 0) {
- con->error_msg = "bad authorize reply";
- return ret;
+ if (len) {
+ ret = con->ops->verify_authorizer_reply(con);
+ if (ret < 0) {
+ con->error_msg = "bad authorize reply";
+ return ret;
+ }
}
}
@@ -2123,7 +2097,7 @@
pr_err("%s%lld %s feature set mismatch,"
" my %llx < server's %llx, missing %llx\n",
ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr),
+ ceph_pr_addr(&con->peer_addr),
sup_feat, server_feat, server_feat & ~sup_feat);
con->error_msg = "missing required protocol features";
reset_connection(con);
@@ -2133,7 +2107,7 @@
pr_err("%s%lld %s protocol version mismatch,"
" my %d != server's %d\n",
ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr),
+ ceph_pr_addr(&con->peer_addr),
le32_to_cpu(con->out_connect.protocol_version),
le32_to_cpu(con->in_reply.protocol_version));
con->error_msg = "protocol version mismatch";
@@ -2167,7 +2141,7 @@
le32_to_cpu(con->in_reply.connect_seq));
pr_err("%s%lld %s connection reset\n",
ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr));
+ ceph_pr_addr(&con->peer_addr));
reset_connection(con);
con_out_kvec_reset(con);
ret = prepare_write_connect(con);
@@ -2224,7 +2198,7 @@
pr_err("%s%lld %s protocol feature mismatch,"
" my required %llx > server's %llx, need %llx\n",
ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr),
+ ceph_pr_addr(&con->peer_addr),
req_feat, server_feat, req_feat & ~server_feat);
con->error_msg = "missing required protocol features";
reset_connection(con);
@@ -2353,8 +2327,7 @@
u32 crc = 0;
int ret;
- BUG_ON(!msg);
- if (list_empty(&msg->data))
+ if (!msg->num_data_items)
return -EIO;
if (do_datacrc)
@@ -2432,7 +2405,7 @@
if ((s64)seq - (s64)con->in_seq < 1) {
pr_info("skipping %s%lld %s seq %lld expected %lld\n",
ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr),
+ ceph_pr_addr(&con->peer_addr),
seq, con->in_seq + 1);
con->in_base_pos = -front_len - middle_len - data_len -
sizeof_footer(con);
@@ -3011,10 +2984,10 @@
static void con_fault(struct ceph_connection *con)
{
dout("fault %p state %lu to peer %s\n",
- con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+ con, con->state, ceph_pr_addr(&con->peer_addr));
pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+ ceph_pr_addr(&con->peer_addr), con->error_msg);
con->error_msg = NULL;
WARN_ON(con->state != CON_STATE_CONNECTING &&
@@ -3058,6 +3031,12 @@
}
+void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
+{
+ u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
+ msgr->inst.addr.nonce = cpu_to_le32(nonce);
+ encode_my_addr(msgr);
+}
/*
* initialize a new messenger instance
@@ -3240,9 +3219,10 @@
dout("con_keepalive %p\n", con);
mutex_lock(&con->mutex);
clear_standby(con);
+ con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING);
mutex_unlock(&con->mutex);
- if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 &&
- con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+
+ if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_keepalive);
@@ -3262,32 +3242,16 @@
return false;
}
-static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
+static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg)
{
- struct ceph_msg_data *data;
-
- if (WARN_ON(!ceph_msg_data_type_valid(type)))
- return NULL;
-
- data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS);
- if (!data)
- return NULL;
-
- data->type = type;
- INIT_LIST_HEAD(&data->links);
-
- return data;
+ BUG_ON(msg->num_data_items >= msg->max_data_items);
+ return &msg->data[msg->num_data_items++];
}
static void ceph_msg_data_destroy(struct ceph_msg_data *data)
{
- if (!data)
- return;
-
- WARN_ON(!list_empty(&data->links));
if (data->type == CEPH_MSG_DATA_PAGELIST)
ceph_pagelist_release(data->pagelist);
- kmem_cache_free(ceph_msg_data_cache, data);
}
void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
@@ -3298,13 +3262,12 @@
BUG_ON(!pages);
BUG_ON(!length);
- data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES);
- BUG_ON(!data);
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_PAGES;
data->pages = pages;
data->length = length;
data->alignment = alignment & ~PAGE_MASK;
- list_add_tail(&data->links, &msg->data);
msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_pages);
@@ -3317,11 +3280,11 @@
BUG_ON(!pagelist);
BUG_ON(!pagelist->length);
- data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST);
- BUG_ON(!data);
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_PAGELIST;
+ refcount_inc(&pagelist->refcnt);
data->pagelist = pagelist;
- list_add_tail(&data->links, &msg->data);
msg->data_length += pagelist->length;
}
EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
@@ -3332,12 +3295,11 @@
{
struct ceph_msg_data *data;
- data = ceph_msg_data_create(CEPH_MSG_DATA_BIO);
- BUG_ON(!data);
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_BIO;
data->bio_pos = *bio_pos;
data->bio_length = length;
- list_add_tail(&data->links, &msg->data);
msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_bio);
@@ -3348,11 +3310,10 @@
{
struct ceph_msg_data *data;
- data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS);
- BUG_ON(!data);
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_BVECS;
data->bvec_pos = *bvec_pos;
- list_add_tail(&data->links, &msg->data);
msg->data_length += bvec_pos->iter.bi_size;
}
EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
@@ -3361,8 +3322,8 @@
* construct a new message with given type, size
* the new msg has a ref count of 1.
*/
-struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
- bool can_fail)
+struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
+ gfp_t flags, bool can_fail)
{
struct ceph_msg *m;
@@ -3376,7 +3337,6 @@
INIT_LIST_HEAD(&m->list_head);
kref_init(&m->kref);
- INIT_LIST_HEAD(&m->data);
/* front */
if (front_len) {
@@ -3391,6 +3351,15 @@
}
m->front_alloc_len = m->front.iov_len = front_len;
+ if (max_data_items) {
+ m->data = kmalloc_array(max_data_items, sizeof(*m->data),
+ flags);
+ if (!m->data)
+ goto out2;
+
+ m->max_data_items = max_data_items;
+ }
+
dout("ceph_msg_new %p front %d\n", m, front_len);
return m;
@@ -3407,6 +3376,13 @@
}
return NULL;
}
+EXPORT_SYMBOL(ceph_msg_new2);
+
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
+ bool can_fail)
+{
+ return ceph_msg_new2(type, front_len, 0, flags, can_fail);
+}
EXPORT_SYMBOL(ceph_msg_new);
/*
@@ -3502,13 +3478,14 @@
{
dout("%s %p\n", __func__, m);
kvfree(m->front.iov_base);
+ kfree(m->data);
kmem_cache_free(ceph_msg_cache, m);
}
static void ceph_msg_release(struct kref *kref)
{
struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
- struct ceph_msg_data *data, *next;
+ int i;
dout("%s %p\n", __func__, m);
WARN_ON(!list_empty(&m->list_head));
@@ -3521,11 +3498,8 @@
m->middle = NULL;
}
- list_for_each_entry_safe(data, next, &m->data, links) {
- list_del_init(&data->links);
- ceph_msg_data_destroy(data);
- }
- m->data_length = 0;
+ for (i = 0; i < m->num_data_items; i++)
+ ceph_msg_data_destroy(&m->data[i]);
if (m->pool)
ceph_msgpool_put(m->pool, m);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 18deb3d..7256c40 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -39,7 +39,7 @@
/*
* Decode a monmap blob (e.g., during mount).
*/
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+static struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
{
struct ceph_monmap *m = NULL;
int i, err = -EINVAL;
@@ -50,7 +50,7 @@
ceph_decode_32_safe(&p, end, len, bad);
ceph_decode_need(&p, end, len, bad);
- dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+ dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p));
p += sizeof(u16); /* skip version */
ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
@@ -58,7 +58,6 @@
epoch = ceph_decode_32(&p);
num_mon = ceph_decode_32(&p);
- ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
if (num_mon > CEPH_MAX_MON)
goto bad;
@@ -68,17 +67,22 @@
m->fsid = fsid;
m->epoch = epoch;
m->num_mon = num_mon;
- ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
- for (i = 0; i < num_mon; i++)
- ceph_decode_addr(&m->mon_inst[i].addr);
+ for (i = 0; i < num_mon; ++i) {
+ struct ceph_entity_inst *inst = &m->mon_inst[i];
+ /* copy name portion */
+ ceph_decode_copy_safe(&p, end, &inst->name,
+ sizeof(inst->name), bad);
+ err = ceph_decode_entity_addr(&p, end, &inst->addr);
+ if (err)
+ goto bad;
+ }
dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
m->num_mon);
for (i = 0; i < m->num_mon; i++)
dout("monmap_decode mon%d is %s\n", i,
- ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
+ ceph_pr_addr(&m->mon_inst[i].addr));
return m;
-
bad:
dout("monmap_decode failed with %d\n", err);
kfree(m);
@@ -203,12 +207,19 @@
{
if (!monc->hunting)
pr_info("mon%d %s session lost, hunting for new mon\n",
- monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr));
+ monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr));
__close_session(monc);
__open_session(monc);
}
+void ceph_monc_reopen_session(struct ceph_mon_client *monc)
+{
+ mutex_lock(&monc->mutex);
+ reopen_session(monc);
+ mutex_unlock(&monc->mutex);
+}
+
static void un_backoff(struct ceph_mon_client *monc)
{
monc->hunt_mult /= 2; /* reduce by 50% */
@@ -469,6 +480,7 @@
if (IS_ERR(monmap)) {
pr_err("problem decoding monmap, %d\n",
(int)PTR_ERR(monmap));
+ ceph_msg_dump(msg);
goto out;
}
@@ -922,6 +934,15 @@
mutex_unlock(&monc->mutex);
ret = wait_generic_request(req);
+ if (!ret)
+ /*
+ * Make sure we have the osdmap that includes the blacklist
+ * entry. This is needed to ensure that the OSDs pick up the
+ * new blacklist before processing any future requests from
+ * this client.
+ */
+ ret = ceph_wait_for_latest_osdmap(monc->client, 0);
+
out:
put_generic_request(req);
return ret;
@@ -1169,7 +1190,7 @@
__resend_generic_request(monc);
pr_info("mon%d %s session established\n", monc->cur_mon,
- ceph_pr_addr(&monc->con.peer_addr.in_addr));
+ ceph_pr_addr(&monc->con.peer_addr));
}
out:
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index 7257153..e3ecb80 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -14,7 +14,8 @@
struct ceph_msgpool *pool = arg;
struct ceph_msg *msg;
- msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true);
+ msg = ceph_msg_new2(pool->type, pool->front_len, pool->max_data_items,
+ gfp_mask, true);
if (!msg) {
dout("msgpool_alloc %s failed\n", pool->name);
} else {
@@ -35,11 +36,13 @@
}
int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
- int front_len, int size, bool blocking, const char *name)
+ int front_len, int max_data_items, int size,
+ const char *name)
{
dout("msgpool %s init\n", name);
pool->type = type;
pool->front_len = front_len;
+ pool->max_data_items = max_data_items;
pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
if (!pool->pool)
return -ENOMEM;
@@ -53,18 +56,21 @@
mempool_destroy(pool->pool);
}
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
- int front_len)
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len,
+ int max_data_items)
{
struct ceph_msg *msg;
- if (front_len > pool->front_len) {
- dout("msgpool_get %s need front %d, pool size is %d\n",
- pool->name, front_len, pool->front_len);
- WARN_ON(1);
+ if (front_len > pool->front_len ||
+ max_data_items > pool->max_data_items) {
+ pr_warn_ratelimited("%s need %d/%d, pool %s has %d/%d\n",
+ __func__, front_len, max_data_items, pool->name,
+ pool->front_len, pool->max_data_items);
+ WARN_ON_ONCE(1);
/* try to alloc a fresh message */
- return ceph_msg_new(pool->type, front_len, GFP_NOFS, false);
+ return ceph_msg_new2(pool->type, front_len, max_data_items,
+ GFP_NOFS, false);
}
msg = mempool_alloc(pool->pool, GFP_NOFS);
@@ -80,6 +86,9 @@
msg->front.iov_len = pool->front_len;
msg->hdr.front_len = cpu_to_le32(pool->front_len);
+ msg->data_length = 0;
+ msg->num_data_items = 0;
+
kref_init(&msg->kref); /* retake single ref */
mempool_free(msg, pool->pool);
}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 60934bd..ba45b07 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -126,6 +126,9 @@
osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
}
+/*
+ * Consumes @pages if @own_pages is true.
+ */
static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
struct page **pages, u64 length, u32 alignment,
bool pages_from_pool, bool own_pages)
@@ -138,6 +141,9 @@
osd_data->own_pages = own_pages;
}
+/*
+ * Consumes a ref on @pagelist.
+ */
static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
struct ceph_pagelist *pagelist)
{
@@ -165,14 +171,6 @@
osd_data->num_bvecs = num_bvecs;
}
-#define osd_req_op_data(oreq, whch, typ, fld) \
-({ \
- struct ceph_osd_request *__oreq = (oreq); \
- unsigned int __whch = (whch); \
- BUG_ON(__whch >= __oreq->r_num_ops); \
- &__oreq->r_ops[__whch].typ.fld; \
-})
-
static struct ceph_osd_data *
osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
{
@@ -362,6 +360,8 @@
num_pages = calc_pages_for((u64)osd_data->alignment,
(u64)osd_data->length);
ceph_release_page_vector(osd_data->pages, num_pages);
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
+ ceph_pagelist_release(osd_data->pagelist);
}
ceph_osd_data_init(osd_data);
}
@@ -402,6 +402,9 @@
case CEPH_OSD_OP_LIST_WATCHERS:
ceph_osd_data_release(&op->list_watchers.response_data);
break;
+ case CEPH_OSD_OP_COPY_FROM:
+ ceph_osd_data_release(&op->copy_from.osd_data);
+ break;
default:
break;
}
@@ -467,7 +470,7 @@
{
WARN_ON(!RB_EMPTY_NODE(&req->r_node));
WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
- WARN_ON(!list_empty(&req->r_unsafe_item));
+ WARN_ON(!list_empty(&req->r_private_item));
WARN_ON(req->r_osd);
}
@@ -527,7 +530,7 @@
init_completion(&req->r_completion);
RB_CLEAR_NODE(&req->r_node);
RB_CLEAR_NODE(&req->r_mc_node);
- INIT_LIST_HEAD(&req->r_unsafe_item);
+ INIT_LIST_HEAD(&req->r_private_item);
target_init(&req->r_t);
}
@@ -606,12 +609,15 @@
return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
}
-int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp,
+ int num_request_data_items,
+ int num_reply_data_items)
{
struct ceph_osd_client *osdc = req->r_osdc;
struct ceph_msg *msg;
int msg_size;
+ WARN_ON(req->r_request || req->r_reply);
WARN_ON(ceph_oid_empty(&req->r_base_oid));
WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
@@ -633,9 +639,11 @@
msg_size += 4 + 8; /* retry_attempt, features */
if (req->r_mempool)
- msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+ msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size,
+ num_request_data_items);
else
- msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
+ msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size,
+ num_request_data_items, gfp, true);
if (!msg)
return -ENOMEM;
@@ -648,9 +656,11 @@
msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
if (req->r_mempool)
- msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size,
+ num_reply_data_items);
else
- msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
+ msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size,
+ num_reply_data_items, gfp, true);
if (!msg)
return -ENOMEM;
@@ -658,7 +668,6 @@
return 0;
}
-EXPORT_SYMBOL(ceph_osdc_alloc_messages);
static bool osd_req_opcode_valid(u16 opcode)
{
@@ -671,6 +680,65 @@
}
}
+static void get_num_data_items(struct ceph_osd_request *req,
+ int *num_request_data_items,
+ int *num_reply_data_items)
+{
+ struct ceph_osd_req_op *op;
+
+ *num_request_data_items = 0;
+ *num_reply_data_items = 0;
+
+ for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
+ switch (op->op) {
+ /* request */
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ case CEPH_OSD_OP_COPY_FROM:
+ *num_request_data_items += 1;
+ break;
+
+ /* reply */
+ case CEPH_OSD_OP_STAT:
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ *num_reply_data_items += 1;
+ break;
+
+ /* both */
+ case CEPH_OSD_OP_NOTIFY:
+ *num_request_data_items += 1;
+ *num_reply_data_items += 1;
+ break;
+ case CEPH_OSD_OP_CALL:
+ *num_request_data_items += 2;
+ *num_reply_data_items += 1;
+ break;
+
+ default:
+ WARN_ON(!osd_req_opcode_valid(op->op));
+ break;
+ }
+ }
+}
+
+/*
+ * oid, oloc and OSD op opcode(s) must be filled in before this function
+ * is called.
+ */
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+ int num_request_data_items, num_reply_data_items;
+
+ get_num_data_items(req, &num_request_data_items, &num_reply_data_items);
+ return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items,
+ num_reply_data_items);
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
+
/*
* This is an osd op init function for opcodes that have no data or
* other information associated with them. It also serves as a
@@ -767,40 +835,45 @@
EXPORT_SYMBOL(osd_req_op_extent_dup_last);
int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
- u16 opcode, const char *class, const char *method)
+ const char *class, const char *method)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
- opcode, 0);
+ struct ceph_osd_req_op *op;
struct ceph_pagelist *pagelist;
size_t payload_len = 0;
size_t size;
+ int ret;
- BUG_ON(opcode != CEPH_OSD_OP_CALL);
+ op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
- pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
return -ENOMEM;
- ceph_pagelist_init(pagelist);
-
op->cls.class_name = class;
size = strlen(class);
BUG_ON(size > (size_t) U8_MAX);
op->cls.class_len = size;
- ceph_pagelist_append(pagelist, class, size);
+ ret = ceph_pagelist_append(pagelist, class, size);
+ if (ret)
+ goto err_pagelist_free;
payload_len += size;
op->cls.method_name = method;
size = strlen(method);
BUG_ON(size > (size_t) U8_MAX);
op->cls.method_len = size;
- ceph_pagelist_append(pagelist, method, size);
+ ret = ceph_pagelist_append(pagelist, method, size);
+ if (ret)
+ goto err_pagelist_free;
payload_len += size;
osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
-
op->indata_len = payload_len;
return 0;
+
+err_pagelist_free:
+ ceph_pagelist_release(pagelist);
+ return ret;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -812,21 +885,24 @@
opcode, 0);
struct ceph_pagelist *pagelist;
size_t payload_len;
+ int ret;
BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
- pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
return -ENOMEM;
- ceph_pagelist_init(pagelist);
-
payload_len = strlen(name);
op->xattr.name_len = payload_len;
- ceph_pagelist_append(pagelist, name, payload_len);
+ ret = ceph_pagelist_append(pagelist, name, payload_len);
+ if (ret)
+ goto err_pagelist_free;
op->xattr.value_len = size;
- ceph_pagelist_append(pagelist, value, size);
+ ret = ceph_pagelist_append(pagelist, value, size);
+ if (ret)
+ goto err_pagelist_free;
payload_len += size;
op->xattr.cmp_op = cmp_op;
@@ -835,6 +911,10 @@
ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
op->indata_len = payload_len;
return 0;
+
+err_pagelist_free:
+ ceph_pagelist_release(pagelist);
+ return ret;
}
EXPORT_SYMBOL(osd_req_op_xattr_init);
@@ -900,12 +980,6 @@
static u32 osd_req_encode_op(struct ceph_osd_op *dst,
const struct ceph_osd_req_op *src)
{
- if (WARN_ON(!osd_req_opcode_valid(src->op))) {
- pr_err("unrecognized osd opcode %d\n", src->op);
-
- return 0;
- }
-
switch (src->op) {
case CEPH_OSD_OP_STAT:
break;
@@ -955,6 +1029,14 @@
case CEPH_OSD_OP_CREATE:
case CEPH_OSD_OP_DELETE:
break;
+ case CEPH_OSD_OP_COPY_FROM:
+ dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
+ dst->copy_from.src_version =
+ cpu_to_le64(src->copy_from.src_version);
+ dst->copy_from.flags = src->copy_from.flags;
+ dst->copy_from.src_fadvise_flags =
+ cpu_to_le32(src->copy_from.src_fadvise_flags);
+ break;
default:
pr_err("unsupported osd opcode %s\n",
ceph_osd_op_name(src->op));
@@ -1038,7 +1120,15 @@
if (flags & CEPH_OSD_FLAG_WRITE)
req->r_data_offset = off;
- r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ if (num_ops > 1)
+ /*
+ * This is a special case for ceph_writepages_start(), but it
+ * also covers ceph_uninline_data(). If more multi-op request
+ * use cases emerge, we will need a separate helper.
+ */
+ r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_ops, 0);
+ else
+ r = ceph_osdc_alloc_messages(req, GFP_NOFS);
if (r)
goto fail;
@@ -1415,7 +1505,6 @@
static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
struct ceph_osd_request_target *t,
- struct ceph_connection *con,
bool any_change)
{
struct ceph_pg_pool_info *pi;
@@ -1423,7 +1512,7 @@
struct ceph_osds up, acting;
bool force_resend = false;
bool unpaused = false;
- bool legacy_change;
+ bool legacy_change = false;
bool split = false;
bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
bool recovery_deletes = ceph_osdmap_flag(osdc,
@@ -1511,15 +1600,14 @@
t->osd = acting.primary;
}
- if (unpaused || legacy_change || force_resend ||
- (split && con && CEPH_HAVE_FEATURE(con->peer_features,
- RESEND_ON_SPLIT)))
+ if (unpaused || legacy_change || force_resend || split)
ct_res = CALC_TARGET_NEED_RESEND;
else
ct_res = CALC_TARGET_NO_ACTION;
out:
- dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+ dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused,
+ legacy_change, force_resend, split, ct_res, t->osd);
return ct_res;
}
@@ -1845,48 +1933,55 @@
return true;
}
-static void setup_request_data(struct ceph_osd_request *req,
- struct ceph_msg *msg)
+/*
+ * Keep get_num_data_items() in sync with this function.
+ */
+static void setup_request_data(struct ceph_osd_request *req)
{
- u32 data_len = 0;
- int i;
+ struct ceph_msg *request_msg = req->r_request;
+ struct ceph_msg *reply_msg = req->r_reply;
+ struct ceph_osd_req_op *op;
- if (!list_empty(&msg->data))
+ if (req->r_request->num_data_items || req->r_reply->num_data_items)
return;
- WARN_ON(msg->data_length);
- for (i = 0; i < req->r_num_ops; i++) {
- struct ceph_osd_req_op *op = &req->r_ops[i];
-
+ WARN_ON(request_msg->data_length || reply_msg->data_length);
+ for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
switch (op->op) {
/* request */
case CEPH_OSD_OP_WRITE:
case CEPH_OSD_OP_WRITEFULL:
WARN_ON(op->indata_len != op->extent.length);
- ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->extent.osd_data);
break;
case CEPH_OSD_OP_SETXATTR:
case CEPH_OSD_OP_CMPXATTR:
WARN_ON(op->indata_len != op->xattr.name_len +
op->xattr.value_len);
- ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->xattr.osd_data);
break;
case CEPH_OSD_OP_NOTIFY_ACK:
- ceph_osdc_msg_data_add(msg,
+ ceph_osdc_msg_data_add(request_msg,
&op->notify_ack.request_data);
break;
+ case CEPH_OSD_OP_COPY_FROM:
+ ceph_osdc_msg_data_add(request_msg,
+ &op->copy_from.osd_data);
+ break;
/* reply */
case CEPH_OSD_OP_STAT:
- ceph_osdc_msg_data_add(req->r_reply,
+ ceph_osdc_msg_data_add(reply_msg,
&op->raw_data_in);
break;
case CEPH_OSD_OP_READ:
- ceph_osdc_msg_data_add(req->r_reply,
+ ceph_osdc_msg_data_add(reply_msg,
&op->extent.osd_data);
break;
case CEPH_OSD_OP_LIST_WATCHERS:
- ceph_osdc_msg_data_add(req->r_reply,
+ ceph_osdc_msg_data_add(reply_msg,
&op->list_watchers.response_data);
break;
@@ -1895,25 +1990,23 @@
WARN_ON(op->indata_len != op->cls.class_len +
op->cls.method_len +
op->cls.indata_len);
- ceph_osdc_msg_data_add(msg, &op->cls.request_info);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->cls.request_info);
/* optional, can be NONE */
- ceph_osdc_msg_data_add(msg, &op->cls.request_data);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->cls.request_data);
/* optional, can be NONE */
- ceph_osdc_msg_data_add(req->r_reply,
+ ceph_osdc_msg_data_add(reply_msg,
&op->cls.response_data);
break;
case CEPH_OSD_OP_NOTIFY:
- ceph_osdc_msg_data_add(msg,
+ ceph_osdc_msg_data_add(request_msg,
&op->notify.request_data);
- ceph_osdc_msg_data_add(req->r_reply,
+ ceph_osdc_msg_data_add(reply_msg,
&op->notify.response_data);
break;
}
-
- data_len += op->indata_len;
}
-
- WARN_ON(data_len != msg->data_length);
}
static void encode_pgid(void **p, const struct ceph_pg *pgid)
@@ -1961,7 +2054,7 @@
req->r_data_offset || req->r_snapc);
}
- setup_request_data(req, msg);
+ setup_request_data(req);
encode_spgid(&p, &req->r_t.spgid); /* actual spg */
ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
@@ -2195,7 +2288,7 @@
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
again:
- ct_res = calc_target(osdc, &req->r_t, NULL, false);
+ ct_res = calc_target(osdc, &req->r_t, false);
if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
goto promote;
@@ -2229,7 +2322,7 @@
(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
pool_full(osdc, req->r_t.base_oloc.pool))) {
dout("req %p full/pool_full\n", req);
- if (osdc->abort_on_full) {
+ if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) {
err = -ENOSPC;
} else {
pr_warn_ratelimited("FULL or reached pool quota\n");
@@ -2312,7 +2405,7 @@
static void __complete_request(struct ceph_osd_request *req)
{
- dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
+ dout("%s req %p tid %llu cb %ps result %d\n", __func__, req,
req->r_tid, req->r_callback, req->r_result);
if (req->r_callback)
@@ -2399,6 +2492,14 @@
}
EXPORT_SYMBOL(ceph_osdc_abort_requests);
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
+{
+ down_write(&osdc->lock);
+ osdc->abort_err = 0;
+ up_write(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
+
static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
{
if (likely(eb > osdc->epoch_barrier)) {
@@ -2459,7 +2560,7 @@
{
bool victims = false;
- if (osdc->abort_on_full &&
+ if (ceph_test_opt(osdc->client, ABORT_ON_FULL) &&
(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc)))
for_each_request(osdc, abort_on_full_fn, &victims);
}
@@ -3001,11 +3102,21 @@
struct ceph_osd_client *osdc = lreq->osdc;
struct ceph_osd *osd;
- calc_target(osdc, &lreq->t, NULL, false);
+ down_write(&osdc->lock);
+ linger_register(lreq);
+ if (lreq->is_watch) {
+ lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id;
+ lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id;
+ } else {
+ lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
+ }
+
+ calc_target(osdc, &lreq->t, false);
osd = lookup_create_osd(osdc, lreq->t.osd, true);
link_linger(osd, lreq);
send_linger(lreq);
+ up_write(&osdc->lock);
}
static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
@@ -3617,7 +3728,7 @@
struct ceph_osd_client *osdc = lreq->osdc;
enum calc_target_result ct_res;
- ct_res = calc_target(osdc, &lreq->t, NULL, true);
+ ct_res = calc_target(osdc, &lreq->t, true);
if (ct_res == CALC_TARGET_NEED_RESEND) {
struct ceph_osd *osd;
@@ -3689,8 +3800,7 @@
n = rb_next(n); /* unlink_request(), check_pool_dne() */
dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
- ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
- false);
+ ct_res = calc_target(osdc, &req->r_t, false);
switch (ct_res) {
case CALC_TARGET_NO_ACTION:
force_resend_writes = cleared_full ||
@@ -3799,7 +3909,7 @@
n = rb_next(n);
if (req->r_t.epoch < osdc->osdmap->epoch) {
- ct_res = calc_target(osdc, &req->r_t, NULL, false);
+ ct_res = calc_target(osdc, &req->r_t, false);
if (ct_res == CALC_TARGET_POOL_DNE) {
erase_request(need_resend, req);
check_pool_dne(req);
@@ -4318,9 +4428,7 @@
lreq->notify_id, notify_id);
} else if (!completion_done(&lreq->notify_finish_wait)) {
struct ceph_msg_data *data =
- list_first_entry_or_null(&msg->data,
- struct ceph_msg_data,
- links);
+ msg->num_data_items ? &msg->data[0] : NULL;
if (data) {
if (lreq->preply_pages) {
@@ -4476,6 +4584,23 @@
ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+ return req;
+}
+
+static struct ceph_osd_request *
+alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode)
+{
+ struct ceph_osd_request *req;
+
+ req = alloc_linger_request(lreq);
+ if (!req)
+ return NULL;
+
+ /*
+ * Pass 0 for cookie because we don't know it yet, it will be
+ * filled in by linger_submit().
+ */
+ osd_req_op_watch_init(req, 0, 0, watch_opcode);
if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
ceph_osdc_put_request(req);
@@ -4514,27 +4639,19 @@
lreq->t.flags = CEPH_OSD_FLAG_WRITE;
ktime_get_real_ts64(&lreq->mtime);
- lreq->reg_req = alloc_linger_request(lreq);
+ lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH);
if (!lreq->reg_req) {
ret = -ENOMEM;
goto err_put_lreq;
}
- lreq->ping_req = alloc_linger_request(lreq);
+ lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING);
if (!lreq->ping_req) {
ret = -ENOMEM;
goto err_put_lreq;
}
- down_write(&osdc->lock);
- linger_register(lreq); /* before osd_req_op_* */
- osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
- CEPH_OSD_WATCH_OP_WATCH);
- osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
- CEPH_OSD_WATCH_OP_PING);
linger_submit(lreq);
- up_write(&osdc->lock);
-
ret = linger_reg_commit_wait(lreq);
if (ret) {
linger_cancel(lreq);
@@ -4599,11 +4716,10 @@
op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
- pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ pl = ceph_pagelist_alloc(GFP_NOIO);
if (!pl)
return -ENOMEM;
- ceph_pagelist_init(pl);
ret = ceph_pagelist_encode_64(pl, notify_id);
ret |= ceph_pagelist_encode_64(pl, cookie);
if (payload) {
@@ -4641,12 +4757,12 @@
ceph_oloc_copy(&req->r_base_oloc, oloc);
req->r_flags = CEPH_OSD_FLAG_READ;
- ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+ payload_len);
if (ret)
goto out_put_req;
- ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
- payload_len);
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
if (ret)
goto out_put_req;
@@ -4670,11 +4786,10 @@
op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
op->notify.cookie = cookie;
- pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ pl = ceph_pagelist_alloc(GFP_NOIO);
if (!pl)
return -ENOMEM;
- ceph_pagelist_init(pl);
ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
ret |= ceph_pagelist_encode_32(pl, timeout);
ret |= ceph_pagelist_encode_32(pl, payload_len);
@@ -4733,29 +4848,30 @@
goto out_put_lreq;
}
+ /*
+ * Pass 0 for cookie because we don't know it yet, it will be
+ * filled in by linger_submit().
+ */
+ ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout,
+ payload, payload_len);
+ if (ret)
+ goto out_put_lreq;
+
/* for notify_id */
pages = ceph_alloc_page_vector(1, GFP_NOIO);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out_put_lreq;
}
-
- down_write(&osdc->lock);
- linger_register(lreq); /* before osd_req_op_* */
- ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
- timeout, payload, payload_len);
- if (ret) {
- linger_unregister(lreq);
- up_write(&osdc->lock);
- ceph_release_page_vector(pages, 1);
- goto out_put_lreq;
- }
ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
response_data),
pages, PAGE_SIZE, 0, false, true);
- linger_submit(lreq);
- up_write(&osdc->lock);
+ ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO);
+ if (ret)
+ goto out_put_lreq;
+
+ linger_submit(lreq);
ret = linger_reg_commit_wait(lreq);
if (!ret)
ret = linger_notify_finish_wait(lreq);
@@ -4812,20 +4928,26 @@
ret = ceph_start_decoding(p, end, 2, "watch_item_t",
&struct_v, &struct_len);
if (ret)
- return ret;
+ goto bad;
- ceph_decode_copy(p, &item->name, sizeof(item->name));
- item->cookie = ceph_decode_64(p);
- *p += 4; /* skip timeout_seconds */
+ ret = -EINVAL;
+ ceph_decode_copy_safe(p, end, &item->name, sizeof(item->name), bad);
+ ceph_decode_64_safe(p, end, item->cookie, bad);
+ ceph_decode_skip_32(p, end, bad); /* skip timeout seconds */
+
if (struct_v >= 2) {
- ceph_decode_copy(p, &item->addr, sizeof(item->addr));
- ceph_decode_addr(&item->addr);
+ ret = ceph_decode_entity_addr(p, end, &item->addr);
+ if (ret)
+ goto bad;
+ } else {
+ ret = 0;
}
dout("%s %s%llu cookie %llu addr %s\n", __func__,
ENTITY_NAME(item->name), item->cookie,
- ceph_pr_addr(&item->addr.in_addr));
- return 0;
+ ceph_pr_addr(&item->addr));
+bad:
+ return ret;
}
static int decode_watchers(void **p, void *end,
@@ -4881,10 +5003,6 @@
ceph_oloc_copy(&req->r_base_oloc, oloc);
req->r_flags = CEPH_OSD_FLAG_READ;
- ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
- if (ret)
- goto out_put_req;
-
pages = ceph_alloc_page_vector(1, GFP_NOIO);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
@@ -4896,6 +5014,10 @@
response_data),
pages, PAGE_SIZE, 0, false, true);
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
ceph_osdc_start_request(osdc, req, false);
ret = ceph_osdc_wait_request(osdc, req);
if (ret >= 0) {
@@ -4942,12 +5064,12 @@
const char *class, const char *method,
unsigned int flags,
struct page *req_page, size_t req_len,
- struct page *resp_page, size_t *resp_len)
+ struct page **resp_pages, size_t *resp_len)
{
struct ceph_osd_request *req;
int ret;
- if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+ if (req_len > PAGE_SIZE)
return -E2BIG;
req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
@@ -4958,26 +5080,26 @@
ceph_oloc_copy(&req->r_base_oloc, oloc);
req->r_flags = flags;
- ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
- if (ret)
- goto out_put_req;
-
- ret = osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method);
+ ret = osd_req_op_cls_init(req, 0, class, method);
if (ret)
goto out_put_req;
if (req_page)
osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
0, false, false);
- if (resp_page)
- osd_req_op_cls_response_data_pages(req, 0, &resp_page,
+ if (resp_pages)
+ osd_req_op_cls_response_data_pages(req, 0, resp_pages,
*resp_len, 0, false, false);
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
ceph_osdc_start_request(osdc, req, false);
ret = ceph_osdc_wait_request(osdc, req);
if (ret >= 0) {
ret = req->r_ops[0].rval;
- if (resp_page)
+ if (resp_pages)
*resp_len = req->r_ops[0].outdata_len;
}
@@ -4988,6 +5110,24 @@
EXPORT_SYMBOL(ceph_osdc_call);
/*
+ * reset all osd connections
+ */
+void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+
+ down_write(&osdc->lock);
+ for (n = rb_first(&osdc->osds); n; ) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ n = rb_next(n);
+ if (!reopen_osd(osd))
+ kick_osd_requests(osd);
+ }
+ up_write(&osdc->lock);
+}
+
+/*
* init, shutdown
*/
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
@@ -5021,11 +5161,12 @@
goto out_map;
err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
- PAGE_SIZE, 10, true, "osd_op");
+ PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op");
if (err < 0)
goto out_mempool;
err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
- PAGE_SIZE, 10, true, "osd_op_reply");
+ PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10,
+ "osd_op_reply");
if (err < 0)
goto out_msgpool;
@@ -5168,6 +5309,80 @@
}
EXPORT_SYMBOL(ceph_osdc_writepages);
+static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
+ u64 src_snapid, u64 src_version,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ u32 src_fadvise_flags,
+ u32 dst_fadvise_flags,
+ u8 copy_from_flags)
+{
+ struct ceph_osd_req_op *op;
+ struct page **pages;
+ void *p, *end;
+
+ pages = ceph_alloc_page_vector(1, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags);
+ op->copy_from.snapid = src_snapid;
+ op->copy_from.src_version = src_version;
+ op->copy_from.flags = copy_from_flags;
+ op->copy_from.src_fadvise_flags = src_fadvise_flags;
+
+ p = page_address(pages[0]);
+ end = p + PAGE_SIZE;
+ ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
+ encode_oloc(&p, end, src_oloc);
+ op->indata_len = PAGE_SIZE - (end - p);
+
+ ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
+ op->indata_len, 0, false, true);
+ return 0;
+}
+
+int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
+ u64 src_snapid, u64 src_version,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ u32 src_fadvise_flags,
+ struct ceph_object_id *dst_oid,
+ struct ceph_object_locator *dst_oloc,
+ u32 dst_fadvise_flags,
+ u8 copy_from_flags)
+{
+ struct ceph_osd_request *req;
+ int ret;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->r_flags = CEPH_OSD_FLAG_WRITE;
+
+ ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
+ ceph_oid_copy(&req->r_t.base_oid, dst_oid);
+
+ ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid,
+ src_oloc, src_fadvise_flags,
+ dst_fadvise_flags, copy_from_flags);
+ if (ret)
+ goto out;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+out:
+ ceph_osdc_put_request(req);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_copy_from);
+
int __init ceph_osdc_setup(void)
{
size_t size = sizeof(struct ceph_osd_request) +
@@ -5295,7 +5510,7 @@
u32 front_len = le32_to_cpu(hdr->front_len);
u32 data_len = le32_to_cpu(hdr->data_len);
- m = ceph_msg_new(type, front_len, GFP_NOIO, false);
+ m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false);
if (!m)
return NULL;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 98c0ff3..4e0de14 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -495,9 +495,8 @@
/ sizeof(struct crush_rule_step))
goto bad;
#endif
- r = c->rules[i] = kmalloc(sizeof(*r) +
- yes*sizeof(struct crush_rule_step),
- GFP_NOFS);
+ r = kmalloc(struct_size(r, steps, yes), GFP_NOFS);
+ c->rules[i] = r;
if (r == NULL)
goto badmem;
dout(" rule %d is at %p\n", i, r);
@@ -974,11 +973,11 @@
struct ceph_pg_pool_info, node);
__remove_pg_pool(&map->pg_pools, pi);
}
- kfree(map->osd_state);
- kfree(map->osd_weight);
- kfree(map->osd_addr);
- kfree(map->osd_primary_affinity);
- kfree(map->crush_workspace);
+ kvfree(map->osd_state);
+ kvfree(map->osd_weight);
+ kvfree(map->osd_addr);
+ kvfree(map->osd_primary_affinity);
+ kvfree(map->crush_workspace);
kfree(map);
}
@@ -987,28 +986,41 @@
*
* The new elements are properly initialized.
*/
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
{
u32 *state;
u32 *weight;
struct ceph_entity_addr *addr;
+ u32 to_copy;
int i;
- state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
- if (!state)
+ dout("%s old %u new %u\n", __func__, map->max_osd, max);
+ if (max == map->max_osd)
+ return 0;
+
+ state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
+ weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
+ addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
+ if (!state || !weight || !addr) {
+ kvfree(state);
+ kvfree(weight);
+ kvfree(addr);
return -ENOMEM;
+ }
+
+ to_copy = min(map->max_osd, max);
+ if (map->osd_state) {
+ memcpy(state, map->osd_state, to_copy * sizeof(*state));
+ memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
+ memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
+ kvfree(map->osd_state);
+ kvfree(map->osd_weight);
+ kvfree(map->osd_addr);
+ }
+
map->osd_state = state;
-
- weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
- if (!weight)
- return -ENOMEM;
map->osd_weight = weight;
-
- addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
- if (!addr)
- return -ENOMEM;
map->osd_addr = addr;
-
for (i = map->max_osd; i < max; i++) {
map->osd_state[i] = 0;
map->osd_weight[i] = CEPH_OSD_OUT;
@@ -1018,12 +1030,16 @@
if (map->osd_primary_affinity) {
u32 *affinity;
- affinity = krealloc(map->osd_primary_affinity,
- max*sizeof(*affinity), GFP_NOFS);
+ affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
+ GFP_NOFS);
if (!affinity)
return -ENOMEM;
- map->osd_primary_affinity = affinity;
+ memcpy(affinity, map->osd_primary_affinity,
+ to_copy * sizeof(*affinity));
+ kvfree(map->osd_primary_affinity);
+
+ map->osd_primary_affinity = affinity;
for (i = map->max_osd; i < max; i++)
map->osd_primary_affinity[i] =
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
@@ -1044,7 +1060,7 @@
work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
dout("%s work_size %zu bytes\n", __func__, work_size);
- workspace = kmalloc(work_size, GFP_NOIO);
+ workspace = ceph_kvmalloc(work_size, GFP_NOIO);
if (!workspace) {
crush_destroy(crush);
return -ENOMEM;
@@ -1053,7 +1069,7 @@
if (map->crush)
crush_destroy(map->crush);
- kfree(map->crush_workspace);
+ kvfree(map->crush_workspace);
map->crush = crush;
map->crush_workspace = workspace;
return 0;
@@ -1299,9 +1315,9 @@
if (!map->osd_primary_affinity) {
int i;
- map->osd_primary_affinity = kmalloc_array(map->max_osd,
- sizeof(u32),
- GFP_NOFS);
+ map->osd_primary_affinity = ceph_kvmalloc(
+ array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
+ GFP_NOFS);
if (!map->osd_primary_affinity)
return -ENOMEM;
@@ -1322,7 +1338,7 @@
ceph_decode_32_safe(p, end, len, e_inval);
if (len == 0) {
- kfree(map->osd_primary_affinity);
+ kvfree(map->osd_primary_affinity);
map->osd_primary_affinity = NULL;
return 0;
}
@@ -1490,11 +1506,9 @@
/* osd_state, osd_weight, osd_addrs->client_addr */
ceph_decode_need(p, end, 3*sizeof(u32) +
- map->max_osd*((struct_v >= 5 ? sizeof(u32) :
- sizeof(u8)) +
- sizeof(*map->osd_weight) +
- sizeof(*map->osd_addr)), e_inval);
-
+ map->max_osd*(struct_v >= 5 ? sizeof(u32) :
+ sizeof(u8)) +
+ sizeof(*map->osd_weight), e_inval);
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
@@ -1515,9 +1529,11 @@
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
- ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
- for (i = 0; i < map->max_osd; i++)
- ceph_decode_addr(&map->osd_addr[i]);
+ for (i = 0; i < map->max_osd; i++) {
+ err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]);
+ if (err)
+ goto bad;
+ }
/* pg_temp */
err = decode_pg_temp(p, end, map);
@@ -1619,12 +1635,17 @@
void *new_state;
void *new_weight_end;
u32 len;
+ int i;
new_up_client = *p;
ceph_decode_32_safe(p, end, len, e_inval);
- len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
- ceph_decode_need(p, end, len, e_inval);
- *p += len;
+ for (i = 0; i < len; ++i) {
+ struct ceph_entity_addr addr;
+
+ ceph_decode_skip_32(p, end, e_inval);
+ if (ceph_decode_entity_addr(p, end, &addr))
+ goto e_inval;
+ }
new_state = *p;
ceph_decode_32_safe(p, end, len, e_inval);
@@ -1700,9 +1721,9 @@
struct ceph_entity_addr addr;
osd = ceph_decode_32(p);
- ceph_decode_copy(p, &addr, sizeof(addr));
- ceph_decode_addr(&addr);
BUG_ON(osd >= map->max_osd);
+ if (ceph_decode_entity_addr(p, end, &addr))
+ goto e_inval;
pr_info("osd%d up\n", osd);
map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
map->osd_addr[osd] = addr;
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 2ea0564..65e34f7 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -6,6 +6,26 @@
#include <linux/highmem.h>
#include <linux/ceph/pagelist.h>
+struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags)
+{
+ struct ceph_pagelist *pl;
+
+ pl = kmalloc(sizeof(*pl), gfp_flags);
+ if (!pl)
+ return NULL;
+
+ INIT_LIST_HEAD(&pl->head);
+ pl->mapped_tail = NULL;
+ pl->length = 0;
+ pl->room = 0;
+ INIT_LIST_HEAD(&pl->free_list);
+ pl->num_pages_free = 0;
+ refcount_set(&pl->refcnt, 1);
+
+ return pl;
+}
+EXPORT_SYMBOL(ceph_pagelist_alloc);
+
static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
{
if (pl->mapped_tail) {
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index d3736f5..64305e7 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -10,39 +10,6 @@
#include <linux/ceph/libceph.h>
-/*
- * build a vector of user pages
- */
-struct page **ceph_get_direct_page_vector(const void __user *data,
- int num_pages, bool write_page)
-{
- struct page **pages;
- int got = 0;
- int rc = 0;
-
- pages = kmalloc_array(num_pages, sizeof(*pages), GFP_NOFS);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- while (got < num_pages) {
- rc = get_user_pages_fast(
- (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
- num_pages - got, write_page, pages + got);
- if (rc < 0)
- break;
- BUG_ON(rc == 0);
- got += rc;
- }
- if (rc < 0)
- goto fail;
- return pages;
-
-fail:
- ceph_put_page_vector(pages, got, false);
- return ERR_PTR(rc);
-}
-EXPORT_SYMBOL(ceph_get_direct_page_vector);
-
void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
{
int i;
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
index e14a5d0..e243159 100644
--- a/net/ceph/snapshot.c
+++ b/net/ceph/snapshot.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* snapshot.c Ceph snapshot context utility routines (part of libceph)
*
* Copyright (C) 2013 Inktank Storage, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
*/
#include <linux/types.h>
diff --git a/net/ceph/striper.c b/net/ceph/striper.c
index c36462d..3b3fa75 100644
--- a/net/ceph/striper.c
+++ b/net/ceph/striper.c
@@ -259,3 +259,20 @@
return 0;
}
EXPORT_SYMBOL(ceph_extent_to_file);
+
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size)
+{
+ u64 period = (u64)l->stripe_count * l->object_size;
+ u64 num_periods = DIV64_U64_ROUND_UP(size, period);
+ u64 remainder_bytes;
+ u64 remainder_objs = 0;
+
+ div64_u64_rem(size, period, &remainder_bytes);
+ if (remainder_bytes > 0 &&
+ remainder_bytes < (u64)l->stripe_count * l->stripe_unit)
+ remainder_objs = l->stripe_count -
+ DIV_ROUND_UP_ULL(remainder_bytes, l->stripe_unit);
+
+ return num_periods * l->stripe_count - remainder_objs;
+}
+EXPORT_SYMBOL(ceph_get_num_objects);