Blame - net/ceph/osd_client.c - hafnium/third_party/linux

blob: a8481da37f1ad5a99bb9d6dbf14d973a6ceb37ab [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
				3	#include <linux/ceph/ceph_debug.h>
				4
				5	#include <linux/module.h>
				6	#include <linux/err.h>
				7	#include <linux/highmem.h>
				8	#include <linux/mm.h>
				9	#include <linux/pagemap.h>
				10	#include <linux/slab.h>
				11	#include <linux/uaccess.h>
				12	#ifdef CONFIG_BLOCK
				13	#include <linux/bio.h>
				14	#endif
				15
				16	#include <linux/ceph/ceph_features.h>
				17	#include <linux/ceph/libceph.h>
				18	#include <linux/ceph/osd_client.h>
				19	#include <linux/ceph/messenger.h>
				20	#include <linux/ceph/decode.h>
				21	#include <linux/ceph/auth.h>
				22	#include <linux/ceph/pagelist.h>
				23	#include <linux/ceph/striper.h>
				24
				25	#define OSD_OPREPLY_FRONT_LEN 512
				26
				27	static struct kmem_cache *ceph_osd_request_cache;
				28
				29	static const struct ceph_connection_operations osd_con_ops;
				30
				31	/*
				32	* Implement client access to distributed object storage cluster.
				33	*
				34	* All data objects are stored within a cluster/cloud of OSDs, or
				35	* "object storage devices." (Note that Ceph OSDs have _nothing_ to
				36	* do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
				37	* remote daemons serving up and coordinating consistent and safe
				38	* access to storage.
				39	*
				40	* Cluster membership and the mapping of data objects onto storage devices
				41	* are described by the osd map.
				42	*
				43	* We keep track of pending OSD requests (read, write), resubmit
				44	* requests to different OSDs when the cluster topology/data layout
				45	* change, or retry the affected requests when the communications
				46	* channel with an OSD is reset.
				47	*/
				48
				49	static void link_request(struct ceph_osd osd, struct ceph_osd_request req);
				50	static void unlink_request(struct ceph_osd osd, struct ceph_osd_request req);
				51	static void link_linger(struct ceph_osd *osd,
				52	struct ceph_osd_linger_request *lreq);
				53	static void unlink_linger(struct ceph_osd *osd,
				54	struct ceph_osd_linger_request *lreq);
				55	static void clear_backoffs(struct ceph_osd *osd);
				56
				57	#if 1
				58	static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
				59	{
				60	bool wrlocked = true;
				61
				62	if (unlikely(down_read_trylock(sem))) {
				63	wrlocked = false;
				64	up_read(sem);
				65	}
				66
				67	return wrlocked;
				68	}
				69	static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
				70	{
				71	WARN_ON(!rwsem_is_locked(&osdc->lock));
				72	}
				73	static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
				74	{
				75	WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
				76	}
				77	static inline void verify_osd_locked(struct ceph_osd *osd)
				78	{
				79	struct ceph_osd_client *osdc = osd->o_osdc;
				80
				81	WARN_ON(!(mutex_is_locked(&osd->lock) &&
				82	rwsem_is_locked(&osdc->lock)) &&
				83	!rwsem_is_wrlocked(&osdc->lock));
				84	}
				85	static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
				86	{
				87	WARN_ON(!mutex_is_locked(&lreq->lock));
				88	}
				89	#else
				90	static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
				91	static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
				92	static inline void verify_osd_locked(struct ceph_osd *osd) { }
				93	static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
				94	#endif
				95
				96	/*
				97	* calculate the mapping of a file extent onto an object, and fill out the
				98	* request accordingly. shorten extent as necessary if it crosses an
				99	* object boundary.
				100	*
				101	* fill osd op in request message.
				102	*/
				103	static int calc_layout(struct ceph_file_layout layout, u64 off, u64 plen,
				104	u64 objnum, u64 objoff, u64 *objlen)
				105	{
				106	u64 orig_len = *plen;
				107	u32 xlen;
				108
				109	/* object extent? */
				110	ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
				111	objoff, &xlen);
				112	*objlen = xlen;
				113	if (*objlen < orig_len) {
				114	plen = objlen;
				115	dout(" skipping last %llu, final file extent %llu~%llu\n",
				116	orig_len - plen, off, plen);
				117	}
				118
				119	dout("calc_layout objnum=%llx %llu~%llu\n", objnum, objoff, *objlen);
				120	return 0;
				121	}
				122
				123	static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
				124	{
				125	memset(osd_data, 0, sizeof (*osd_data));
				126	osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
				127	}
				128
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	129	/*
				130	* Consumes @pages if @own_pages is true.
				131	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	132	static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
				133	struct page **pages, u64 length, u32 alignment,
				134	bool pages_from_pool, bool own_pages)
				135	{
				136	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
				137	osd_data->pages = pages;
				138	osd_data->length = length;
				139	osd_data->alignment = alignment;
				140	osd_data->pages_from_pool = pages_from_pool;
				141	osd_data->own_pages = own_pages;
				142	}
				143
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	144	/*
				145	* Consumes a ref on @pagelist.
				146	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	147	static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
				148	struct ceph_pagelist *pagelist)
				149	{
				150	osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
				151	osd_data->pagelist = pagelist;
				152	}
				153
				154	#ifdef CONFIG_BLOCK
				155	static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
				156	struct ceph_bio_iter *bio_pos,
				157	u32 bio_length)
				158	{
				159	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
				160	osd_data->bio_pos = *bio_pos;
				161	osd_data->bio_length = bio_length;
				162	}
				163	#endif /* CONFIG_BLOCK */
				164
				165	static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
				166	struct ceph_bvec_iter *bvec_pos,
				167	u32 num_bvecs)
				168	{
				169	osd_data->type = CEPH_OSD_DATA_TYPE_BVECS;
				170	osd_data->bvec_pos = *bvec_pos;
				171	osd_data->num_bvecs = num_bvecs;
				172	}
				173
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	174	static struct ceph_osd_data *
				175	osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
				176	{
				177	BUG_ON(which >= osd_req->r_num_ops);
				178
				179	return &osd_req->r_ops[which].raw_data_in;
				180	}
				181
				182	struct ceph_osd_data *
				183	osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
				184	unsigned int which)
				185	{
				186	return osd_req_op_data(osd_req, which, extent, osd_data);
				187	}
				188	EXPORT_SYMBOL(osd_req_op_extent_osd_data);
				189
				190	void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
				191	unsigned int which, struct page **pages,
				192	u64 length, u32 alignment,
				193	bool pages_from_pool, bool own_pages)
				194	{
				195	struct ceph_osd_data *osd_data;
				196
				197	osd_data = osd_req_op_raw_data_in(osd_req, which);
				198	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				199	pages_from_pool, own_pages);
				200	}
				201	EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
				202
				203	void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
				204	unsigned int which, struct page **pages,
				205	u64 length, u32 alignment,
				206	bool pages_from_pool, bool own_pages)
				207	{
				208	struct ceph_osd_data *osd_data;
				209
				210	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				211	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				212	pages_from_pool, own_pages);
				213	}
				214	EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
				215
				216	void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
				217	unsigned int which, struct ceph_pagelist *pagelist)
				218	{
				219	struct ceph_osd_data *osd_data;
				220
				221	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				222	ceph_osd_data_pagelist_init(osd_data, pagelist);
				223	}
				224	EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
				225
				226	#ifdef CONFIG_BLOCK
				227	void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
				228	unsigned int which,
				229	struct ceph_bio_iter *bio_pos,
				230	u32 bio_length)
				231	{
				232	struct ceph_osd_data *osd_data;
				233
				234	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				235	ceph_osd_data_bio_init(osd_data, bio_pos, bio_length);
				236	}
				237	EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
				238	#endif /* CONFIG_BLOCK */
				239
				240	void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
				241	unsigned int which,
				242	struct bio_vec *bvecs, u32 num_bvecs,
				243	u32 bytes)
				244	{
				245	struct ceph_osd_data *osd_data;
				246	struct ceph_bvec_iter it = {
				247	.bvecs = bvecs,
				248	.iter = { .bi_size = bytes },
				249	};
				250
				251	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				252	ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
				253	}
				254	EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvecs);
				255
				256	void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
				257	unsigned int which,
				258	struct ceph_bvec_iter *bvec_pos)
				259	{
				260	struct ceph_osd_data *osd_data;
				261
				262	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
				263	ceph_osd_data_bvecs_init(osd_data, bvec_pos, 0);
				264	}
				265	EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
				266
				267	static void osd_req_op_cls_request_info_pagelist(
				268	struct ceph_osd_request *osd_req,
				269	unsigned int which, struct ceph_pagelist *pagelist)
				270	{
				271	struct ceph_osd_data *osd_data;
				272
				273	osd_data = osd_req_op_data(osd_req, which, cls, request_info);
				274	ceph_osd_data_pagelist_init(osd_data, pagelist);
				275	}
				276
				277	void osd_req_op_cls_request_data_pagelist(
				278	struct ceph_osd_request *osd_req,
				279	unsigned int which, struct ceph_pagelist *pagelist)
				280	{
				281	struct ceph_osd_data *osd_data;
				282
				283	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				284	ceph_osd_data_pagelist_init(osd_data, pagelist);
				285	osd_req->r_ops[which].cls.indata_len += pagelist->length;
				286	osd_req->r_ops[which].indata_len += pagelist->length;
				287	}
				288	EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
				289
				290	void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
				291	unsigned int which, struct page **pages, u64 length,
				292	u32 alignment, bool pages_from_pool, bool own_pages)
				293	{
				294	struct ceph_osd_data *osd_data;
				295
				296	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				297	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				298	pages_from_pool, own_pages);
				299	osd_req->r_ops[which].cls.indata_len += length;
				300	osd_req->r_ops[which].indata_len += length;
				301	}
				302	EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
				303
				304	void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
				305	unsigned int which,
				306	struct bio_vec *bvecs, u32 num_bvecs,
				307	u32 bytes)
				308	{
				309	struct ceph_osd_data *osd_data;
				310	struct ceph_bvec_iter it = {
				311	.bvecs = bvecs,
				312	.iter = { .bi_size = bytes },
				313	};
				314
				315	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
				316	ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs);
				317	osd_req->r_ops[which].cls.indata_len += bytes;
				318	osd_req->r_ops[which].indata_len += bytes;
				319	}
				320	EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs);
				321
				322	void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
				323	unsigned int which, struct page **pages, u64 length,
				324	u32 alignment, bool pages_from_pool, bool own_pages)
				325	{
				326	struct ceph_osd_data *osd_data;
				327
				328	osd_data = osd_req_op_data(osd_req, which, cls, response_data);
				329	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				330	pages_from_pool, own_pages);
				331	}
				332	EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
				333
				334	static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
				335	{
				336	switch (osd_data->type) {
				337	case CEPH_OSD_DATA_TYPE_NONE:
				338	return 0;
				339	case CEPH_OSD_DATA_TYPE_PAGES:
				340	return osd_data->length;
				341	case CEPH_OSD_DATA_TYPE_PAGELIST:
				342	return (u64)osd_data->pagelist->length;
				343	#ifdef CONFIG_BLOCK
				344	case CEPH_OSD_DATA_TYPE_BIO:
				345	return (u64)osd_data->bio_length;
				346	#endif /* CONFIG_BLOCK */
				347	case CEPH_OSD_DATA_TYPE_BVECS:
				348	return osd_data->bvec_pos.iter.bi_size;
				349	default:
				350	WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
				351	return 0;
				352	}
				353	}
				354
				355	static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
				356	{
				357	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
				358	int num_pages;
				359
				360	num_pages = calc_pages_for((u64)osd_data->alignment,
				361	(u64)osd_data->length);
				362	ceph_release_page_vector(osd_data->pages, num_pages);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	363	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
				364	ceph_pagelist_release(osd_data->pagelist);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	365	}
				366	ceph_osd_data_init(osd_data);
				367	}
				368
				369	static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
				370	unsigned int which)
				371	{
				372	struct ceph_osd_req_op *op;
				373
				374	BUG_ON(which >= osd_req->r_num_ops);
				375	op = &osd_req->r_ops[which];
				376
				377	switch (op->op) {
				378	case CEPH_OSD_OP_READ:
				379	case CEPH_OSD_OP_WRITE:
				380	case CEPH_OSD_OP_WRITEFULL:
				381	ceph_osd_data_release(&op->extent.osd_data);
				382	break;
				383	case CEPH_OSD_OP_CALL:
				384	ceph_osd_data_release(&op->cls.request_info);
				385	ceph_osd_data_release(&op->cls.request_data);
				386	ceph_osd_data_release(&op->cls.response_data);
				387	break;
				388	case CEPH_OSD_OP_SETXATTR:
				389	case CEPH_OSD_OP_CMPXATTR:
				390	ceph_osd_data_release(&op->xattr.osd_data);
				391	break;
				392	case CEPH_OSD_OP_STAT:
				393	ceph_osd_data_release(&op->raw_data_in);
				394	break;
				395	case CEPH_OSD_OP_NOTIFY_ACK:
				396	ceph_osd_data_release(&op->notify_ack.request_data);
				397	break;
				398	case CEPH_OSD_OP_NOTIFY:
				399	ceph_osd_data_release(&op->notify.request_data);
				400	ceph_osd_data_release(&op->notify.response_data);
				401	break;
				402	case CEPH_OSD_OP_LIST_WATCHERS:
				403	ceph_osd_data_release(&op->list_watchers.response_data);
				404	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	405	case CEPH_OSD_OP_COPY_FROM:
				406	ceph_osd_data_release(&op->copy_from.osd_data);
				407	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	408	default:
				409	break;
				410	}
				411	}
				412
				413	/*
				414	* Assumes @t is zero-initialized.
				415	*/
				416	static void target_init(struct ceph_osd_request_target *t)
				417	{
				418	ceph_oid_init(&t->base_oid);
				419	ceph_oloc_init(&t->base_oloc);
				420	ceph_oid_init(&t->target_oid);
				421	ceph_oloc_init(&t->target_oloc);
				422
				423	ceph_osds_init(&t->acting);
				424	ceph_osds_init(&t->up);
				425	t->size = -1;
				426	t->min_size = -1;
				427
				428	t->osd = CEPH_HOMELESS_OSD;
				429	}
				430
				431	static void target_copy(struct ceph_osd_request_target *dest,
				432	const struct ceph_osd_request_target *src)
				433	{
				434	ceph_oid_copy(&dest->base_oid, &src->base_oid);
				435	ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
				436	ceph_oid_copy(&dest->target_oid, &src->target_oid);
				437	ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
				438
				439	dest->pgid = src->pgid; /* struct */
				440	dest->spgid = src->spgid; /* struct */
				441	dest->pg_num = src->pg_num;
				442	dest->pg_num_mask = src->pg_num_mask;
				443	ceph_osds_copy(&dest->acting, &src->acting);
				444	ceph_osds_copy(&dest->up, &src->up);
				445	dest->size = src->size;
				446	dest->min_size = src->min_size;
				447	dest->sort_bitwise = src->sort_bitwise;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	448	dest->recovery_deletes = src->recovery_deletes;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	449
				450	dest->flags = src->flags;
				451	dest->paused = src->paused;
				452
				453	dest->epoch = src->epoch;
				454	dest->last_force_resend = src->last_force_resend;
				455
				456	dest->osd = src->osd;
				457	}
				458
				459	static void target_destroy(struct ceph_osd_request_target *t)
				460	{
				461	ceph_oid_destroy(&t->base_oid);
				462	ceph_oloc_destroy(&t->base_oloc);
				463	ceph_oid_destroy(&t->target_oid);
				464	ceph_oloc_destroy(&t->target_oloc);
				465	}
				466
				467	/*
				468	* requests
				469	*/
				470	static void request_release_checks(struct ceph_osd_request *req)
				471	{
				472	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
				473	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	474	WARN_ON(!list_empty(&req->r_private_item));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	475	WARN_ON(req->r_osd);
				476	}
				477
				478	static void ceph_osdc_release_request(struct kref *kref)
				479	{
				480	struct ceph_osd_request *req = container_of(kref,
				481	struct ceph_osd_request, r_kref);
				482	unsigned int which;
				483
				484	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
				485	req->r_request, req->r_reply);
				486	request_release_checks(req);
				487
				488	if (req->r_request)
				489	ceph_msg_put(req->r_request);
				490	if (req->r_reply)
				491	ceph_msg_put(req->r_reply);
				492
				493	for (which = 0; which < req->r_num_ops; which++)
				494	osd_req_op_data_release(req, which);
				495
				496	target_destroy(&req->r_t);
				497	ceph_put_snap_context(req->r_snapc);
				498
				499	if (req->r_mempool)
				500	mempool_free(req, req->r_osdc->req_mempool);
				501	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
				502	kmem_cache_free(ceph_osd_request_cache, req);
				503	else
				504	kfree(req);
				505	}
				506
				507	void ceph_osdc_get_request(struct ceph_osd_request *req)
				508	{
				509	dout("%s %p (was %d)\n", __func__, req,
				510	kref_read(&req->r_kref));
				511	kref_get(&req->r_kref);
				512	}
				513	EXPORT_SYMBOL(ceph_osdc_get_request);
				514
				515	void ceph_osdc_put_request(struct ceph_osd_request *req)
				516	{
				517	if (req) {
				518	dout("%s %p (was %d)\n", __func__, req,
				519	kref_read(&req->r_kref));
				520	kref_put(&req->r_kref, ceph_osdc_release_request);
				521	}
				522	}
				523	EXPORT_SYMBOL(ceph_osdc_put_request);
				524
				525	static void request_init(struct ceph_osd_request *req)
				526	{
				527	/* req only, each op is zeroed in _osd_req_op_init() */
				528	memset(req, 0, sizeof(*req));
				529
				530	kref_init(&req->r_kref);
				531	init_completion(&req->r_completion);
				532	RB_CLEAR_NODE(&req->r_node);
				533	RB_CLEAR_NODE(&req->r_mc_node);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	534	INIT_LIST_HEAD(&req->r_private_item);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	535
				536	target_init(&req->r_t);
				537	}
				538
				539	/*
				540	* This is ugly, but it allows us to reuse linger registration and ping
				541	* requests, keeping the structure of the code around send_linger{_ping}()
				542	* reasonable. Setting up a min_nr=2 mempool for each linger request
				543	* and dealing with copying ops (this blasts req only, watch op remains
				544	* intact) isn't any better.
				545	*/
				546	static void request_reinit(struct ceph_osd_request *req)
				547	{
				548	struct ceph_osd_client *osdc = req->r_osdc;
				549	bool mempool = req->r_mempool;
				550	unsigned int num_ops = req->r_num_ops;
				551	u64 snapid = req->r_snapid;
				552	struct ceph_snap_context *snapc = req->r_snapc;
				553	bool linger = req->r_linger;
				554	struct ceph_msg *request_msg = req->r_request;
				555	struct ceph_msg *reply_msg = req->r_reply;
				556
				557	dout("%s req %p\n", __func__, req);
				558	WARN_ON(kref_read(&req->r_kref) != 1);
				559	request_release_checks(req);
				560
				561	WARN_ON(kref_read(&request_msg->kref) != 1);
				562	WARN_ON(kref_read(&reply_msg->kref) != 1);
				563	target_destroy(&req->r_t);
				564
				565	request_init(req);
				566	req->r_osdc = osdc;
				567	req->r_mempool = mempool;
				568	req->r_num_ops = num_ops;
				569	req->r_snapid = snapid;
				570	req->r_snapc = snapc;
				571	req->r_linger = linger;
				572	req->r_request = request_msg;
				573	req->r_reply = reply_msg;
				574	}
				575
				576	struct ceph_osd_request ceph_osdc_alloc_request(struct ceph_osd_client osdc,
				577	struct ceph_snap_context *snapc,
				578	unsigned int num_ops,
				579	bool use_mempool,
				580	gfp_t gfp_flags)
				581	{
				582	struct ceph_osd_request *req;
				583
				584	if (use_mempool) {
				585	BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
				586	req = mempool_alloc(osdc->req_mempool, gfp_flags);
				587	} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
				588	req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
				589	} else {
				590	BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
				591	req = kmalloc(struct_size(req, r_ops, num_ops), gfp_flags);
				592	}
				593	if (unlikely(!req))
				594	return NULL;
				595
				596	request_init(req);
				597	req->r_osdc = osdc;
				598	req->r_mempool = use_mempool;
				599	req->r_num_ops = num_ops;
				600	req->r_snapid = CEPH_NOSNAP;
				601	req->r_snapc = ceph_get_snap_context(snapc);
				602
				603	dout("%s req %p\n", __func__, req);
				604	return req;
				605	}
				606	EXPORT_SYMBOL(ceph_osdc_alloc_request);
				607
				608	static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
				609	{
				610	return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
				611	}
				612
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	613	static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp,
				614	int num_request_data_items,
				615	int num_reply_data_items)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	616	{
				617	struct ceph_osd_client *osdc = req->r_osdc;
				618	struct ceph_msg *msg;
				619	int msg_size;
				620
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	621	WARN_ON(req->r_request \|\| req->r_reply);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	622	WARN_ON(ceph_oid_empty(&req->r_base_oid));
				623	WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
				624
				625	/* create request message */
				626	msg_size = CEPH_ENCODING_START_BLK_LEN +
				627	CEPH_PGID_ENCODING_LEN + 1; /* spgid */
				628	msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
				629	msg_size += CEPH_ENCODING_START_BLK_LEN +
				630	sizeof(struct ceph_osd_reqid); /* reqid */
				631	msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
				632	msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
				633	msg_size += CEPH_ENCODING_START_BLK_LEN +
				634	ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
				635	msg_size += 4 + req->r_base_oid.name_len; /* oid */
				636	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
				637	msg_size += 8; /* snapid */
				638	msg_size += 8; /* snap_seq */
				639	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
				640	msg_size += 4 + 8; /* retry_attempt, features */
				641
				642	if (req->r_mempool)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	643	msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size,
				644	num_request_data_items);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	645	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	646	msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size,
				647	num_request_data_items, gfp, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	648	if (!msg)
				649	return -ENOMEM;
				650
				651	memset(msg->front.iov_base, 0, msg->front.iov_len);
				652	req->r_request = msg;
				653
				654	/* create reply message */
				655	msg_size = OSD_OPREPLY_FRONT_LEN;
				656	msg_size += req->r_base_oid.name_len;
				657	msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
				658
				659	if (req->r_mempool)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	660	msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size,
				661	num_reply_data_items);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	662	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	663	msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size,
				664	num_reply_data_items, gfp, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	665	if (!msg)
				666	return -ENOMEM;
				667
				668	req->r_reply = msg;
				669
				670	return 0;
				671	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	672
				673	static bool osd_req_opcode_valid(u16 opcode)
				674	{
				675	switch (opcode) {
				676	#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return true;
				677	__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
				678	#undef GENERATE_CASE
				679	default:
				680	return false;
				681	}
				682	}
				683
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	684	static void get_num_data_items(struct ceph_osd_request *req,
				685	int *num_request_data_items,
				686	int *num_reply_data_items)
				687	{
				688	struct ceph_osd_req_op *op;
				689
				690	*num_request_data_items = 0;
				691	*num_reply_data_items = 0;
				692
				693	for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
				694	switch (op->op) {
				695	/* request */
				696	case CEPH_OSD_OP_WRITE:
				697	case CEPH_OSD_OP_WRITEFULL:
				698	case CEPH_OSD_OP_SETXATTR:
				699	case CEPH_OSD_OP_CMPXATTR:
				700	case CEPH_OSD_OP_NOTIFY_ACK:
				701	case CEPH_OSD_OP_COPY_FROM:
				702	*num_request_data_items += 1;
				703	break;
				704
				705	/* reply */
				706	case CEPH_OSD_OP_STAT:
				707	case CEPH_OSD_OP_READ:
				708	case CEPH_OSD_OP_LIST_WATCHERS:
				709	*num_reply_data_items += 1;
				710	break;
				711
				712	/* both */
				713	case CEPH_OSD_OP_NOTIFY:
				714	*num_request_data_items += 1;
				715	*num_reply_data_items += 1;
				716	break;
				717	case CEPH_OSD_OP_CALL:
				718	*num_request_data_items += 2;
				719	*num_reply_data_items += 1;
				720	break;
				721
				722	default:
				723	WARN_ON(!osd_req_opcode_valid(op->op));
				724	break;
				725	}
				726	}
				727	}
				728
				729	/*
				730	* oid, oloc and OSD op opcode(s) must be filled in before this function
				731	* is called.
				732	*/
				733	int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
				734	{
				735	int num_request_data_items, num_reply_data_items;
				736
				737	get_num_data_items(req, &num_request_data_items, &num_reply_data_items);
				738	return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items,
				739	num_reply_data_items);
				740	}
				741	EXPORT_SYMBOL(ceph_osdc_alloc_messages);
				742
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	743	/*
				744	* This is an osd op init function for opcodes that have no data or
				745	* other information associated with them. It also serves as a
				746	* common init routine for all the other init functions, below.
				747	*/
				748	static struct ceph_osd_req_op *
				749	_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
				750	u16 opcode, u32 flags)
				751	{
				752	struct ceph_osd_req_op *op;
				753
				754	BUG_ON(which >= osd_req->r_num_ops);
				755	BUG_ON(!osd_req_opcode_valid(opcode));
				756
				757	op = &osd_req->r_ops[which];
				758	memset(op, 0, sizeof (*op));
				759	op->op = opcode;
				760	op->flags = flags;
				761
				762	return op;
				763	}
				764
				765	void osd_req_op_init(struct ceph_osd_request *osd_req,
				766	unsigned int which, u16 opcode, u32 flags)
				767	{
				768	(void)_osd_req_op_init(osd_req, which, opcode, flags);
				769	}
				770	EXPORT_SYMBOL(osd_req_op_init);
				771
				772	void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
				773	unsigned int which, u16 opcode,
				774	u64 offset, u64 length,
				775	u64 truncate_size, u32 truncate_seq)
				776	{
				777	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				778	opcode, 0);
				779	size_t payload_len = 0;
				780
				781	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
				782	opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
				783	opcode != CEPH_OSD_OP_TRUNCATE);
				784
				785	op->extent.offset = offset;
				786	op->extent.length = length;
				787	op->extent.truncate_size = truncate_size;
				788	op->extent.truncate_seq = truncate_seq;
				789	if (opcode == CEPH_OSD_OP_WRITE \|\| opcode == CEPH_OSD_OP_WRITEFULL)
				790	payload_len += length;
				791
				792	op->indata_len = payload_len;
				793	}
				794	EXPORT_SYMBOL(osd_req_op_extent_init);
				795
				796	void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
				797	unsigned int which, u64 length)
				798	{
				799	struct ceph_osd_req_op *op;
				800	u64 previous;
				801
				802	BUG_ON(which >= osd_req->r_num_ops);
				803	op = &osd_req->r_ops[which];
				804	previous = op->extent.length;
				805
				806	if (length == previous)
				807	return; /* Nothing to do */
				808	BUG_ON(length > previous);
				809
				810	op->extent.length = length;
				811	if (op->op == CEPH_OSD_OP_WRITE \|\| op->op == CEPH_OSD_OP_WRITEFULL)
				812	op->indata_len -= previous - length;
				813	}
				814	EXPORT_SYMBOL(osd_req_op_extent_update);
				815
				816	void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
				817	unsigned int which, u64 offset_inc)
				818	{
				819	struct ceph_osd_req_op op, prev_op;
				820
				821	BUG_ON(which + 1 >= osd_req->r_num_ops);
				822
				823	prev_op = &osd_req->r_ops[which];
				824	op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
				825	/* dup previous one */
				826	op->indata_len = prev_op->indata_len;
				827	op->outdata_len = prev_op->outdata_len;
				828	op->extent = prev_op->extent;
				829	/* adjust offset */
				830	op->extent.offset += offset_inc;
				831	op->extent.length -= offset_inc;
				832
				833	if (op->op == CEPH_OSD_OP_WRITE \|\| op->op == CEPH_OSD_OP_WRITEFULL)
				834	op->indata_len -= offset_inc;
				835	}
				836	EXPORT_SYMBOL(osd_req_op_extent_dup_last);
				837
				838	int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	839	const char class, const char method)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	840	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	841	struct ceph_osd_req_op *op;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	842	struct ceph_pagelist *pagelist;
				843	size_t payload_len = 0;
				844	size_t size;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	845	int ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	846
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	847	op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	848
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	849	pagelist = ceph_pagelist_alloc(GFP_NOFS);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	850	if (!pagelist)
				851	return -ENOMEM;
				852
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	853	op->cls.class_name = class;
				854	size = strlen(class);
				855	BUG_ON(size > (size_t) U8_MAX);
				856	op->cls.class_len = size;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	857	ret = ceph_pagelist_append(pagelist, class, size);
				858	if (ret)
				859	goto err_pagelist_free;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	860	payload_len += size;
				861
				862	op->cls.method_name = method;
				863	size = strlen(method);
				864	BUG_ON(size > (size_t) U8_MAX);
				865	op->cls.method_len = size;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	866	ret = ceph_pagelist_append(pagelist, method, size);
				867	if (ret)
				868	goto err_pagelist_free;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	869	payload_len += size;
				870
				871	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	872	op->indata_len = payload_len;
				873	return 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	874
				875	err_pagelist_free:
				876	ceph_pagelist_release(pagelist);
				877	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	878	}
				879	EXPORT_SYMBOL(osd_req_op_cls_init);
				880
				881	int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
				882	u16 opcode, const char name, const void value,
				883	size_t size, u8 cmp_op, u8 cmp_mode)
				884	{
				885	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				886	opcode, 0);
				887	struct ceph_pagelist *pagelist;
				888	size_t payload_len;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	889	int ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	890
				891	BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
				892
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	893	pagelist = ceph_pagelist_alloc(GFP_NOFS);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	894	if (!pagelist)
				895	return -ENOMEM;
				896
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	897	payload_len = strlen(name);
				898	op->xattr.name_len = payload_len;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	899	ret = ceph_pagelist_append(pagelist, name, payload_len);
				900	if (ret)
				901	goto err_pagelist_free;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	902
				903	op->xattr.value_len = size;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	904	ret = ceph_pagelist_append(pagelist, value, size);
				905	if (ret)
				906	goto err_pagelist_free;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	907	payload_len += size;
				908
				909	op->xattr.cmp_op = cmp_op;
				910	op->xattr.cmp_mode = cmp_mode;
				911
				912	ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
				913	op->indata_len = payload_len;
				914	return 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	915
				916	err_pagelist_free:
				917	ceph_pagelist_release(pagelist);
				918	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	919	}
				920	EXPORT_SYMBOL(osd_req_op_xattr_init);
				921
				922	/*
				923	* @watch_opcode: CEPH_OSD_WATCH_OP_*
				924	*/
				925	static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
				926	u64 cookie, u8 watch_opcode)
				927	{
				928	struct ceph_osd_req_op *op;
				929
				930	op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
				931	op->watch.cookie = cookie;
				932	op->watch.op = watch_opcode;
				933	op->watch.gen = 0;
				934	}
				935
				936	void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
				937	unsigned int which,
				938	u64 expected_object_size,
				939	u64 expected_write_size)
				940	{
				941	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
				942	CEPH_OSD_OP_SETALLOCHINT,
				943	0);
				944
				945	op->alloc_hint.expected_object_size = expected_object_size;
				946	op->alloc_hint.expected_write_size = expected_write_size;
				947
				948	/*
				949	* CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
				950	* not worth a feature bit. Set FAILOK per-op flag to make
				951	* sure older osds don't trip over an unsupported opcode.
				952	*/
				953	op->flags \|= CEPH_OSD_OP_FLAG_FAILOK;
				954	}
				955	EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
				956
				957	static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
				958	struct ceph_osd_data *osd_data)
				959	{
				960	u64 length = ceph_osd_data_length(osd_data);
				961
				962	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
				963	BUG_ON(length > (u64) SIZE_MAX);
				964	if (length)
				965	ceph_msg_data_add_pages(msg, osd_data->pages,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	966	length, osd_data->alignment, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	967	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
				968	BUG_ON(!length);
				969	ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
				970	#ifdef CONFIG_BLOCK
				971	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
				972	ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length);
				973	#endif
				974	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
				975	ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
				976	} else {
				977	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
				978	}
				979	}
				980
				981	static u32 osd_req_encode_op(struct ceph_osd_op *dst,
				982	const struct ceph_osd_req_op *src)
				983	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	984	switch (src->op) {
				985	case CEPH_OSD_OP_STAT:
				986	break;
				987	case CEPH_OSD_OP_READ:
				988	case CEPH_OSD_OP_WRITE:
				989	case CEPH_OSD_OP_WRITEFULL:
				990	case CEPH_OSD_OP_ZERO:
				991	case CEPH_OSD_OP_TRUNCATE:
				992	dst->extent.offset = cpu_to_le64(src->extent.offset);
				993	dst->extent.length = cpu_to_le64(src->extent.length);
				994	dst->extent.truncate_size =
				995	cpu_to_le64(src->extent.truncate_size);
				996	dst->extent.truncate_seq =
				997	cpu_to_le32(src->extent.truncate_seq);
				998	break;
				999	case CEPH_OSD_OP_CALL:
				1000	dst->cls.class_len = src->cls.class_len;
				1001	dst->cls.method_len = src->cls.method_len;
				1002	dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
				1003	break;
				1004	case CEPH_OSD_OP_WATCH:
				1005	dst->watch.cookie = cpu_to_le64(src->watch.cookie);
				1006	dst->watch.ver = cpu_to_le64(0);
				1007	dst->watch.op = src->watch.op;
				1008	dst->watch.gen = cpu_to_le32(src->watch.gen);
				1009	break;
				1010	case CEPH_OSD_OP_NOTIFY_ACK:
				1011	break;
				1012	case CEPH_OSD_OP_NOTIFY:
				1013	dst->notify.cookie = cpu_to_le64(src->notify.cookie);
				1014	break;
				1015	case CEPH_OSD_OP_LIST_WATCHERS:
				1016	break;
				1017	case CEPH_OSD_OP_SETALLOCHINT:
				1018	dst->alloc_hint.expected_object_size =
				1019	cpu_to_le64(src->alloc_hint.expected_object_size);
				1020	dst->alloc_hint.expected_write_size =
				1021	cpu_to_le64(src->alloc_hint.expected_write_size);
				1022	break;
				1023	case CEPH_OSD_OP_SETXATTR:
				1024	case CEPH_OSD_OP_CMPXATTR:
				1025	dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
				1026	dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
				1027	dst->xattr.cmp_op = src->xattr.cmp_op;
				1028	dst->xattr.cmp_mode = src->xattr.cmp_mode;
				1029	break;
				1030	case CEPH_OSD_OP_CREATE:
				1031	case CEPH_OSD_OP_DELETE:
				1032	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1033	case CEPH_OSD_OP_COPY_FROM:
				1034	dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
				1035	dst->copy_from.src_version =
				1036	cpu_to_le64(src->copy_from.src_version);
				1037	dst->copy_from.flags = src->copy_from.flags;
				1038	dst->copy_from.src_fadvise_flags =
				1039	cpu_to_le32(src->copy_from.src_fadvise_flags);
				1040	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1041	default:
				1042	pr_err("unsupported osd opcode %s\n",
				1043	ceph_osd_op_name(src->op));
				1044	WARN_ON(1);
				1045
				1046	return 0;
				1047	}
				1048
				1049	dst->op = cpu_to_le16(src->op);
				1050	dst->flags = cpu_to_le32(src->flags);
				1051	dst->payload_len = cpu_to_le32(src->indata_len);
				1052
				1053	return src->indata_len;
				1054	}
				1055
				1056	/*
				1057	* build new request AND message, calculate layout, and adjust file
				1058	* extent as needed.
				1059	*
				1060	* if the file was recently truncated, we include information about its
				1061	* old and new size so that the object can be updated appropriately. (we
				1062	* avoid synchronously deleting truncated objects because it's slow.)
				1063	*/
				1064	struct ceph_osd_request ceph_osdc_new_request(struct ceph_osd_client osdc,
				1065	struct ceph_file_layout *layout,
				1066	struct ceph_vino vino,
				1067	u64 off, u64 *plen,
				1068	unsigned int which, int num_ops,
				1069	int opcode, int flags,
				1070	struct ceph_snap_context *snapc,
				1071	u32 truncate_seq,
				1072	u64 truncate_size,
				1073	bool use_mempool)
				1074	{
				1075	struct ceph_osd_request *req;
				1076	u64 objnum = 0;
				1077	u64 objoff = 0;
				1078	u64 objlen = 0;
				1079	int r;
				1080
				1081	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
				1082	opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
				1083	opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
				1084
				1085	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
				1086	GFP_NOFS);
				1087	if (!req) {
				1088	r = -ENOMEM;
				1089	goto fail;
				1090	}
				1091
				1092	/* calculate max write size */
				1093	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
				1094	if (r)
				1095	goto fail;
				1096
				1097	if (opcode == CEPH_OSD_OP_CREATE \|\| opcode == CEPH_OSD_OP_DELETE) {
				1098	osd_req_op_init(req, which, opcode, 0);
				1099	} else {
				1100	u32 object_size = layout->object_size;
				1101	u32 object_base = off - objoff;
				1102	if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
				1103	if (truncate_size <= object_base) {
				1104	truncate_size = 0;
				1105	} else {
				1106	truncate_size -= object_base;
				1107	if (truncate_size > object_size)
				1108	truncate_size = object_size;
				1109	}
				1110	}
				1111	osd_req_op_extent_init(req, which, opcode, objoff, objlen,
				1112	truncate_size, truncate_seq);
				1113	}
				1114
				1115	req->r_flags = flags;
				1116	req->r_base_oloc.pool = layout->pool_id;
				1117	req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
				1118	ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
				1119
				1120	req->r_snapid = vino.snap;
				1121	if (flags & CEPH_OSD_FLAG_WRITE)
				1122	req->r_data_offset = off;
				1123
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1124	if (num_ops > 1)
				1125	/*
				1126	* This is a special case for ceph_writepages_start(), but it
				1127	* also covers ceph_uninline_data(). If more multi-op request
				1128	* use cases emerge, we will need a separate helper.
				1129	*/
				1130	r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_ops, 0);
				1131	else
				1132	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1133	if (r)
				1134	goto fail;
				1135
				1136	return req;
				1137
				1138	fail:
				1139	ceph_osdc_put_request(req);
				1140	return ERR_PTR(r);
				1141	}
				1142	EXPORT_SYMBOL(ceph_osdc_new_request);
				1143
				1144	/*
				1145	* We keep osd requests in an rbtree, sorted by ->r_tid.
				1146	*/
				1147	DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
				1148	DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
				1149
				1150	/*
				1151	* Call @fn on each OSD request as long as @fn returns 0.
				1152	*/
				1153	static void for_each_request(struct ceph_osd_client *osdc,
				1154	int (fn)(struct ceph_osd_request req, void *arg),
				1155	void *arg)
				1156	{
				1157	struct rb_node n, p;
				1158
				1159	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				1160	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				1161
				1162	for (p = rb_first(&osd->o_requests); p; ) {
				1163	struct ceph_osd_request *req =
				1164	rb_entry(p, struct ceph_osd_request, r_node);
				1165
				1166	p = rb_next(p);
				1167	if (fn(req, arg))
				1168	return;
				1169	}
				1170	}
				1171
				1172	for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
				1173	struct ceph_osd_request *req =
				1174	rb_entry(p, struct ceph_osd_request, r_node);
				1175
				1176	p = rb_next(p);
				1177	if (fn(req, arg))
				1178	return;
				1179	}
				1180	}
				1181
				1182	static bool osd_homeless(struct ceph_osd *osd)
				1183	{
				1184	return osd->o_osd == CEPH_HOMELESS_OSD;
				1185	}
				1186
				1187	static bool osd_registered(struct ceph_osd *osd)
				1188	{
				1189	verify_osdc_locked(osd->o_osdc);
				1190
				1191	return !RB_EMPTY_NODE(&osd->o_node);
				1192	}
				1193
				1194	/*
				1195	* Assumes @osd is zero-initialized.
				1196	*/
				1197	static void osd_init(struct ceph_osd *osd)
				1198	{
				1199	refcount_set(&osd->o_ref, 1);
				1200	RB_CLEAR_NODE(&osd->o_node);
				1201	osd->o_requests = RB_ROOT;
				1202	osd->o_linger_requests = RB_ROOT;
				1203	osd->o_backoff_mappings = RB_ROOT;
				1204	osd->o_backoffs_by_id = RB_ROOT;
				1205	INIT_LIST_HEAD(&osd->o_osd_lru);
				1206	INIT_LIST_HEAD(&osd->o_keepalive_item);
				1207	osd->o_incarnation = 1;
				1208	mutex_init(&osd->lock);
				1209	}
				1210
				1211	static void osd_cleanup(struct ceph_osd *osd)
				1212	{
				1213	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
				1214	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
				1215	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
				1216	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
				1217	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
				1218	WARN_ON(!list_empty(&osd->o_osd_lru));
				1219	WARN_ON(!list_empty(&osd->o_keepalive_item));
				1220
				1221	if (osd->o_auth.authorizer) {
				1222	WARN_ON(osd_homeless(osd));
				1223	ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
				1224	}
				1225	}
				1226
				1227	/*
				1228	* Track open sessions with osds.
				1229	*/
				1230	static struct ceph_osd create_osd(struct ceph_osd_client osdc, int onum)
				1231	{
				1232	struct ceph_osd *osd;
				1233
				1234	WARN_ON(onum == CEPH_HOMELESS_OSD);
				1235
				1236	osd = kzalloc(sizeof(*osd), GFP_NOIO \| __GFP_NOFAIL);
				1237	osd_init(osd);
				1238	osd->o_osdc = osdc;
				1239	osd->o_osd = onum;
				1240
				1241	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
				1242
				1243	return osd;
				1244	}
				1245
				1246	static struct ceph_osd get_osd(struct ceph_osd osd)
				1247	{
				1248	if (refcount_inc_not_zero(&osd->o_ref)) {
				1249	dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1,
				1250	refcount_read(&osd->o_ref));
				1251	return osd;
				1252	} else {
				1253	dout("get_osd %p FAIL\n", osd);
				1254	return NULL;
				1255	}
				1256	}
				1257
				1258	static void put_osd(struct ceph_osd *osd)
				1259	{
				1260	dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref),
				1261	refcount_read(&osd->o_ref) - 1);
				1262	if (refcount_dec_and_test(&osd->o_ref)) {
				1263	osd_cleanup(osd);
				1264	kfree(osd);
				1265	}
				1266	}
				1267
				1268	DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
				1269
				1270	static void __move_osd_to_lru(struct ceph_osd *osd)
				1271	{
				1272	struct ceph_osd_client *osdc = osd->o_osdc;
				1273
				1274	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1275	BUG_ON(!list_empty(&osd->o_osd_lru));
				1276
				1277	spin_lock(&osdc->osd_lru_lock);
				1278	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
				1279	spin_unlock(&osdc->osd_lru_lock);
				1280
				1281	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
				1282	}
				1283
				1284	static void maybe_move_osd_to_lru(struct ceph_osd *osd)
				1285	{
				1286	if (RB_EMPTY_ROOT(&osd->o_requests) &&
				1287	RB_EMPTY_ROOT(&osd->o_linger_requests))
				1288	__move_osd_to_lru(osd);
				1289	}
				1290
				1291	static void __remove_osd_from_lru(struct ceph_osd *osd)
				1292	{
				1293	struct ceph_osd_client *osdc = osd->o_osdc;
				1294
				1295	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1296
				1297	spin_lock(&osdc->osd_lru_lock);
				1298	if (!list_empty(&osd->o_osd_lru))
				1299	list_del_init(&osd->o_osd_lru);
				1300	spin_unlock(&osdc->osd_lru_lock);
				1301	}
				1302
				1303	/*
				1304	* Close the connection and assign any leftover requests to the
				1305	* homeless session.
				1306	*/
				1307	static void close_osd(struct ceph_osd *osd)
				1308	{
				1309	struct ceph_osd_client *osdc = osd->o_osdc;
				1310	struct rb_node *n;
				1311
				1312	verify_osdc_wrlocked(osdc);
				1313	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1314
				1315	ceph_con_close(&osd->o_con);
				1316
				1317	for (n = rb_first(&osd->o_requests); n; ) {
				1318	struct ceph_osd_request *req =
				1319	rb_entry(n, struct ceph_osd_request, r_node);
				1320
				1321	n = rb_next(n); /* unlink_request() */
				1322
				1323	dout(" reassigning req %p tid %llu\n", req, req->r_tid);
				1324	unlink_request(osd, req);
				1325	link_request(&osdc->homeless_osd, req);
				1326	}
				1327	for (n = rb_first(&osd->o_linger_requests); n; ) {
				1328	struct ceph_osd_linger_request *lreq =
				1329	rb_entry(n, struct ceph_osd_linger_request, node);
				1330
				1331	n = rb_next(n); /* unlink_linger() */
				1332
				1333	dout(" reassigning lreq %p linger_id %llu\n", lreq,
				1334	lreq->linger_id);
				1335	unlink_linger(osd, lreq);
				1336	link_linger(&osdc->homeless_osd, lreq);
				1337	}
				1338	clear_backoffs(osd);
				1339
				1340	__remove_osd_from_lru(osd);
				1341	erase_osd(&osdc->osds, osd);
				1342	put_osd(osd);
				1343	}
				1344
				1345	/*
				1346	* reset osd connect
				1347	*/
				1348	static int reopen_osd(struct ceph_osd *osd)
				1349	{
				1350	struct ceph_entity_addr *peer_addr;
				1351
				1352	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				1353
				1354	if (RB_EMPTY_ROOT(&osd->o_requests) &&
				1355	RB_EMPTY_ROOT(&osd->o_linger_requests)) {
				1356	close_osd(osd);
				1357	return -ENODEV;
				1358	}
				1359
				1360	peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
				1361	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
				1362	!ceph_con_opened(&osd->o_con)) {
				1363	struct rb_node *n;
				1364
				1365	dout("osd addr hasn't changed and connection never opened, "
				1366	"letting msgr retry\n");
				1367	/* touch each r_stamp for handle_timeout()'s benfit */
				1368	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
				1369	struct ceph_osd_request *req =
				1370	rb_entry(n, struct ceph_osd_request, r_node);
				1371	req->r_stamp = jiffies;
				1372	}
				1373
				1374	return -EAGAIN;
				1375	}
				1376
				1377	ceph_con_close(&osd->o_con);
				1378	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
				1379	osd->o_incarnation++;
				1380
				1381	return 0;
				1382	}
				1383
				1384	static struct ceph_osd lookup_create_osd(struct ceph_osd_client osdc, int o,
				1385	bool wrlocked)
				1386	{
				1387	struct ceph_osd *osd;
				1388
				1389	if (wrlocked)
				1390	verify_osdc_wrlocked(osdc);
				1391	else
				1392	verify_osdc_locked(osdc);
				1393
				1394	if (o != CEPH_HOMELESS_OSD)
				1395	osd = lookup_osd(&osdc->osds, o);
				1396	else
				1397	osd = &osdc->homeless_osd;
				1398	if (!osd) {
				1399	if (!wrlocked)
				1400	return ERR_PTR(-EAGAIN);
				1401
				1402	osd = create_osd(osdc, o);
				1403	insert_osd(&osdc->osds, osd);
				1404	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
				1405	&osdc->osdmap->osd_addr[osd->o_osd]);
				1406	}
				1407
				1408	dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
				1409	return osd;
				1410	}
				1411
				1412	/*
				1413	* Create request <-> OSD session relation.
				1414	*
				1415	* @req has to be assigned a tid, @osd may be homeless.
				1416	*/
				1417	static void link_request(struct ceph_osd osd, struct ceph_osd_request req)
				1418	{
				1419	verify_osd_locked(osd);
				1420	WARN_ON(!req->r_tid \|\| req->r_osd);
				1421	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
				1422	req, req->r_tid);
				1423
				1424	if (!osd_homeless(osd))
				1425	__remove_osd_from_lru(osd);
				1426	else
				1427	atomic_inc(&osd->o_osdc->num_homeless);
				1428
				1429	get_osd(osd);
				1430	insert_request(&osd->o_requests, req);
				1431	req->r_osd = osd;
				1432	}
				1433
				1434	static void unlink_request(struct ceph_osd osd, struct ceph_osd_request req)
				1435	{
				1436	verify_osd_locked(osd);
				1437	WARN_ON(req->r_osd != osd);
				1438	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
				1439	req, req->r_tid);
				1440
				1441	req->r_osd = NULL;
				1442	erase_request(&osd->o_requests, req);
				1443	put_osd(osd);
				1444
				1445	if (!osd_homeless(osd))
				1446	maybe_move_osd_to_lru(osd);
				1447	else
				1448	atomic_dec(&osd->o_osdc->num_homeless);
				1449	}
				1450
				1451	static bool __pool_full(struct ceph_pg_pool_info *pi)
				1452	{
				1453	return pi->flags & CEPH_POOL_FLAG_FULL;
				1454	}
				1455
				1456	static bool have_pool_full(struct ceph_osd_client *osdc)
				1457	{
				1458	struct rb_node *n;
				1459
				1460	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
				1461	struct ceph_pg_pool_info *pi =
				1462	rb_entry(n, struct ceph_pg_pool_info, node);
				1463
				1464	if (__pool_full(pi))
				1465	return true;
				1466	}
				1467
				1468	return false;
				1469	}
				1470
				1471	static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
				1472	{
				1473	struct ceph_pg_pool_info *pi;
				1474
				1475	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
				1476	if (!pi)
				1477	return false;
				1478
				1479	return __pool_full(pi);
				1480	}
				1481
				1482	/*
				1483	* Returns whether a request should be blocked from being sent
				1484	* based on the current osdmap and osd_client settings.
				1485	*/
				1486	static bool target_should_be_paused(struct ceph_osd_client *osdc,
				1487	const struct ceph_osd_request_target *t,
				1488	struct ceph_pg_pool_info *pi)
				1489	{
				1490	bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
				1491	bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) \|\|
				1492	ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				1493	__pool_full(pi);
				1494
				1495	WARN_ON(pi->id != t->target_oloc.pool);
				1496	return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) \|\|
				1497	((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) \|\|
				1498	(osdc->osdmap->epoch < osdc->epoch_barrier);
				1499	}
				1500
				1501	enum calc_target_result {
				1502	CALC_TARGET_NO_ACTION = 0,
				1503	CALC_TARGET_NEED_RESEND,
				1504	CALC_TARGET_POOL_DNE,
				1505	};
				1506
				1507	static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
				1508	struct ceph_osd_request_target *t,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1509	bool any_change)
				1510	{
				1511	struct ceph_pg_pool_info *pi;
				1512	struct ceph_pg pgid, last_pgid;
				1513	struct ceph_osds up, acting;
				1514	bool force_resend = false;
				1515	bool unpaused = false;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1516	bool legacy_change = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1517	bool split = false;
				1518	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
				1519	bool recovery_deletes = ceph_osdmap_flag(osdc,
				1520	CEPH_OSDMAP_RECOVERY_DELETES);
				1521	enum calc_target_result ct_res;
				1522
				1523	t->epoch = osdc->osdmap->epoch;
				1524	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
				1525	if (!pi) {
				1526	t->osd = CEPH_HOMELESS_OSD;
				1527	ct_res = CALC_TARGET_POOL_DNE;
				1528	goto out;
				1529	}
				1530
				1531	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
				1532	if (t->last_force_resend < pi->last_force_request_resend) {
				1533	t->last_force_resend = pi->last_force_request_resend;
				1534	force_resend = true;
				1535	} else if (t->last_force_resend == 0) {
				1536	force_resend = true;
				1537	}
				1538	}
				1539
				1540	/* apply tiering */
				1541	ceph_oid_copy(&t->target_oid, &t->base_oid);
				1542	ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
				1543	if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
				1544	if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
				1545	t->target_oloc.pool = pi->read_tier;
				1546	if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
				1547	t->target_oloc.pool = pi->write_tier;
				1548
				1549	pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
				1550	if (!pi) {
				1551	t->osd = CEPH_HOMELESS_OSD;
				1552	ct_res = CALC_TARGET_POOL_DNE;
				1553	goto out;
				1554	}
				1555	}
				1556
				1557	__ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, &pgid);
				1558	last_pgid.pool = pgid.pool;
				1559	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
				1560
				1561	ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting);
				1562	if (any_change &&
				1563	ceph_is_new_interval(&t->acting,
				1564	&acting,
				1565	&t->up,
				1566	&up,
				1567	t->size,
				1568	pi->size,
				1569	t->min_size,
				1570	pi->min_size,
				1571	t->pg_num,
				1572	pi->pg_num,
				1573	t->sort_bitwise,
				1574	sort_bitwise,
				1575	t->recovery_deletes,
				1576	recovery_deletes,
				1577	&last_pgid))
				1578	force_resend = true;
				1579
				1580	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
				1581	t->paused = false;
				1582	unpaused = true;
				1583	}
				1584	legacy_change = ceph_pg_compare(&t->pgid, &pgid) \|\|
				1585	ceph_osds_changed(&t->acting, &acting, any_change);
				1586	if (t->pg_num)
				1587	split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
				1588
				1589	if (legacy_change \|\| force_resend \|\| split) {
				1590	t->pgid = pgid; /* struct */
				1591	ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid);
				1592	ceph_osds_copy(&t->acting, &acting);
				1593	ceph_osds_copy(&t->up, &up);
				1594	t->size = pi->size;
				1595	t->min_size = pi->min_size;
				1596	t->pg_num = pi->pg_num;
				1597	t->pg_num_mask = pi->pg_num_mask;
				1598	t->sort_bitwise = sort_bitwise;
				1599	t->recovery_deletes = recovery_deletes;
				1600
				1601	t->osd = acting.primary;
				1602	}
				1603
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1604	if (unpaused \|\| legacy_change \|\| force_resend \|\| split)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1605	ct_res = CALC_TARGET_NEED_RESEND;
				1606	else
				1607	ct_res = CALC_TARGET_NO_ACTION;
				1608
				1609	out:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1610	dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused,
				1611	legacy_change, force_resend, split, ct_res, t->osd);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1612	return ct_res;
				1613	}
				1614
				1615	static struct ceph_spg_mapping *alloc_spg_mapping(void)
				1616	{
				1617	struct ceph_spg_mapping *spg;
				1618
				1619	spg = kmalloc(sizeof(*spg), GFP_NOIO);
				1620	if (!spg)
				1621	return NULL;
				1622
				1623	RB_CLEAR_NODE(&spg->node);
				1624	spg->backoffs = RB_ROOT;
				1625	return spg;
				1626	}
				1627
				1628	static void free_spg_mapping(struct ceph_spg_mapping *spg)
				1629	{
				1630	WARN_ON(!RB_EMPTY_NODE(&spg->node));
				1631	WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));
				1632
				1633	kfree(spg);
				1634	}
				1635
				1636	/*
				1637	* rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
				1638	* ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is
				1639	* defined only within a specific spgid; it does not pass anything to
				1640	* children on split, or to another primary.
				1641	*/
				1642	DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare,
				1643	RB_BYPTR, const struct ceph_spg *, node)
				1644
				1645	static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
				1646	{
				1647	return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits;
				1648	}
				1649
				1650	static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
				1651	void *pkey, size_t pkey_len)
				1652	{
				1653	if (hoid->key_len) {
				1654	*pkey = hoid->key;
				1655	*pkey_len = hoid->key_len;
				1656	} else {
				1657	*pkey = hoid->oid;
				1658	*pkey_len = hoid->oid_len;
				1659	}
				1660	}
				1661
				1662	static int compare_names(const void *name1, size_t name1_len,
				1663	const void *name2, size_t name2_len)
				1664	{
				1665	int ret;
				1666
				1667	ret = memcmp(name1, name2, min(name1_len, name2_len));
				1668	if (!ret) {
				1669	if (name1_len < name2_len)
				1670	ret = -1;
				1671	else if (name1_len > name2_len)
				1672	ret = 1;
				1673	}
				1674	return ret;
				1675	}
				1676
				1677	static int hoid_compare(const struct ceph_hobject_id *lhs,
				1678	const struct ceph_hobject_id *rhs)
				1679	{
				1680	void effective_key1, effective_key2;
				1681	size_t effective_key1_len, effective_key2_len;
				1682	int ret;
				1683
				1684	if (lhs->is_max < rhs->is_max)
				1685	return -1;
				1686	if (lhs->is_max > rhs->is_max)
				1687	return 1;
				1688
				1689	if (lhs->pool < rhs->pool)
				1690	return -1;
				1691	if (lhs->pool > rhs->pool)
				1692	return 1;
				1693
				1694	if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
				1695	return -1;
				1696	if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
				1697	return 1;
				1698
				1699	ret = compare_names(lhs->nspace, lhs->nspace_len,
				1700	rhs->nspace, rhs->nspace_len);
				1701	if (ret)
				1702	return ret;
				1703
				1704	hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
				1705	hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
				1706	ret = compare_names(effective_key1, effective_key1_len,
				1707	effective_key2, effective_key2_len);
				1708	if (ret)
				1709	return ret;
				1710
				1711	ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
				1712	if (ret)
				1713	return ret;
				1714
				1715	if (lhs->snapid < rhs->snapid)
				1716	return -1;
				1717	if (lhs->snapid > rhs->snapid)
				1718	return 1;
				1719
				1720	return 0;
				1721	}
				1722
				1723	/*
				1724	* For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
				1725	* compat stuff here.
				1726	*
				1727	* Assumes @hoid is zero-initialized.
				1728	*/
				1729	static int decode_hoid(void *p, void end, struct ceph_hobject_id *hoid)
				1730	{
				1731	u8 struct_v;
				1732	u32 struct_len;
				1733	int ret;
				1734
				1735	ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v,
				1736	&struct_len);
				1737	if (ret)
				1738	return ret;
				1739
				1740	if (struct_v < 4) {
				1741	pr_err("got struct_v %d < 4 of hobject_t\n", struct_v);
				1742	goto e_inval;
				1743	}
				1744
				1745	hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len,
				1746	GFP_NOIO);
				1747	if (IS_ERR(hoid->key)) {
				1748	ret = PTR_ERR(hoid->key);
				1749	hoid->key = NULL;
				1750	return ret;
				1751	}
				1752
				1753	hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len,
				1754	GFP_NOIO);
				1755	if (IS_ERR(hoid->oid)) {
				1756	ret = PTR_ERR(hoid->oid);
				1757	hoid->oid = NULL;
				1758	return ret;
				1759	}
				1760
				1761	ceph_decode_64_safe(p, end, hoid->snapid, e_inval);
				1762	ceph_decode_32_safe(p, end, hoid->hash, e_inval);
				1763	ceph_decode_8_safe(p, end, hoid->is_max, e_inval);
				1764
				1765	hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len,
				1766	GFP_NOIO);
				1767	if (IS_ERR(hoid->nspace)) {
				1768	ret = PTR_ERR(hoid->nspace);
				1769	hoid->nspace = NULL;
				1770	return ret;
				1771	}
				1772
				1773	ceph_decode_64_safe(p, end, hoid->pool, e_inval);
				1774
				1775	ceph_hoid_build_hash_cache(hoid);
				1776	return 0;
				1777
				1778	e_inval:
				1779	return -EINVAL;
				1780	}
				1781
				1782	static int hoid_encoding_size(const struct ceph_hobject_id *hoid)
				1783	{
				1784	return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */
				1785	4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len;
				1786	}
				1787
				1788	static void encode_hoid(void *p, void end, const struct ceph_hobject_id *hoid)
				1789	{
				1790	ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid));
				1791	ceph_encode_string(p, end, hoid->key, hoid->key_len);
				1792	ceph_encode_string(p, end, hoid->oid, hoid->oid_len);
				1793	ceph_encode_64(p, hoid->snapid);
				1794	ceph_encode_32(p, hoid->hash);
				1795	ceph_encode_8(p, hoid->is_max);
				1796	ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
				1797	ceph_encode_64(p, hoid->pool);
				1798	}
				1799
				1800	static void free_hoid(struct ceph_hobject_id *hoid)
				1801	{
				1802	if (hoid) {
				1803	kfree(hoid->key);
				1804	kfree(hoid->oid);
				1805	kfree(hoid->nspace);
				1806	kfree(hoid);
				1807	}
				1808	}
				1809
				1810	static struct ceph_osd_backoff *alloc_backoff(void)
				1811	{
				1812	struct ceph_osd_backoff *backoff;
				1813
				1814	backoff = kzalloc(sizeof(*backoff), GFP_NOIO);
				1815	if (!backoff)
				1816	return NULL;
				1817
				1818	RB_CLEAR_NODE(&backoff->spg_node);
				1819	RB_CLEAR_NODE(&backoff->id_node);
				1820	return backoff;
				1821	}
				1822
				1823	static void free_backoff(struct ceph_osd_backoff *backoff)
				1824	{
				1825	WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
				1826	WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));
				1827
				1828	free_hoid(backoff->begin);
				1829	free_hoid(backoff->end);
				1830	kfree(backoff);
				1831	}
				1832
				1833	/*
				1834	* Within a specific spgid, backoffs are managed by ->begin hoid.
				1835	*/
				1836	DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare,
				1837	RB_BYVAL, spg_node);
				1838
				1839	static struct ceph_osd_backoff lookup_containing_backoff(struct rb_root root,
				1840	const struct ceph_hobject_id *hoid)
				1841	{
				1842	struct rb_node *n = root->rb_node;
				1843
				1844	while (n) {
				1845	struct ceph_osd_backoff *cur =
				1846	rb_entry(n, struct ceph_osd_backoff, spg_node);
				1847	int cmp;
				1848
				1849	cmp = hoid_compare(hoid, cur->begin);
				1850	if (cmp < 0) {
				1851	n = n->rb_left;
				1852	} else if (cmp > 0) {
				1853	if (hoid_compare(hoid, cur->end) < 0)
				1854	return cur;
				1855
				1856	n = n->rb_right;
				1857	} else {
				1858	return cur;
				1859	}
				1860	}
				1861
				1862	return NULL;
				1863	}
				1864
				1865	/*
				1866	* Each backoff has a unique id within its OSD session.
				1867	*/
				1868	DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node)
				1869
				1870	static void clear_backoffs(struct ceph_osd *osd)
				1871	{
				1872	while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) {
				1873	struct ceph_spg_mapping *spg =
				1874	rb_entry(rb_first(&osd->o_backoff_mappings),
				1875	struct ceph_spg_mapping, node);
				1876
				1877	while (!RB_EMPTY_ROOT(&spg->backoffs)) {
				1878	struct ceph_osd_backoff *backoff =
				1879	rb_entry(rb_first(&spg->backoffs),
				1880	struct ceph_osd_backoff, spg_node);
				1881
				1882	erase_backoff(&spg->backoffs, backoff);
				1883	erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
				1884	free_backoff(backoff);
				1885	}
				1886	erase_spg_mapping(&osd->o_backoff_mappings, spg);
				1887	free_spg_mapping(spg);
				1888	}
				1889	}
				1890
				1891	/*
				1892	* Set up a temporary, non-owning view into @t.
				1893	*/
				1894	static void hoid_fill_from_target(struct ceph_hobject_id *hoid,
				1895	const struct ceph_osd_request_target *t)
				1896	{
				1897	hoid->key = NULL;
				1898	hoid->key_len = 0;
				1899	hoid->oid = t->target_oid.name;
				1900	hoid->oid_len = t->target_oid.name_len;
				1901	hoid->snapid = CEPH_NOSNAP;
				1902	hoid->hash = t->pgid.seed;
				1903	hoid->is_max = false;
				1904	if (t->target_oloc.pool_ns) {
				1905	hoid->nspace = t->target_oloc.pool_ns->str;
				1906	hoid->nspace_len = t->target_oloc.pool_ns->len;
				1907	} else {
				1908	hoid->nspace = NULL;
				1909	hoid->nspace_len = 0;
				1910	}
				1911	hoid->pool = t->target_oloc.pool;
				1912	ceph_hoid_build_hash_cache(hoid);
				1913	}
				1914
				1915	static bool should_plug_request(struct ceph_osd_request *req)
				1916	{
				1917	struct ceph_osd *osd = req->r_osd;
				1918	struct ceph_spg_mapping *spg;
				1919	struct ceph_osd_backoff *backoff;
				1920	struct ceph_hobject_id hoid;
				1921
				1922	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid);
				1923	if (!spg)
				1924	return false;
				1925
				1926	hoid_fill_from_target(&hoid, &req->r_t);
				1927	backoff = lookup_containing_backoff(&spg->backoffs, &hoid);
				1928	if (!backoff)
				1929	return false;
				1930
				1931	dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n",
				1932	__func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool,
				1933	backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id);
				1934	return true;
				1935	}
				1936
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1937	/*
				1938	* Keep get_num_data_items() in sync with this function.
				1939	*/
				1940	static void setup_request_data(struct ceph_osd_request *req)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1941	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1942	struct ceph_msg *request_msg = req->r_request;
				1943	struct ceph_msg *reply_msg = req->r_reply;
				1944	struct ceph_osd_req_op *op;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1945
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1946	if (req->r_request->num_data_items \|\| req->r_reply->num_data_items)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1947	return;
				1948
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1949	WARN_ON(request_msg->data_length \|\| reply_msg->data_length);
				1950	for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1951	switch (op->op) {
				1952	/* request */
				1953	case CEPH_OSD_OP_WRITE:
				1954	case CEPH_OSD_OP_WRITEFULL:
				1955	WARN_ON(op->indata_len != op->extent.length);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1956	ceph_osdc_msg_data_add(request_msg,
				1957	&op->extent.osd_data);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1958	break;
				1959	case CEPH_OSD_OP_SETXATTR:
				1960	case CEPH_OSD_OP_CMPXATTR:
				1961	WARN_ON(op->indata_len != op->xattr.name_len +
				1962	op->xattr.value_len);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1963	ceph_osdc_msg_data_add(request_msg,
				1964	&op->xattr.osd_data);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1965	break;
				1966	case CEPH_OSD_OP_NOTIFY_ACK:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1967	ceph_osdc_msg_data_add(request_msg,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1968	&op->notify_ack.request_data);
				1969	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1970	case CEPH_OSD_OP_COPY_FROM:
				1971	ceph_osdc_msg_data_add(request_msg,
				1972	&op->copy_from.osd_data);
				1973	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1974
				1975	/* reply */
				1976	case CEPH_OSD_OP_STAT:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1977	ceph_osdc_msg_data_add(reply_msg,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1978	&op->raw_data_in);
				1979	break;
				1980	case CEPH_OSD_OP_READ:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1981	ceph_osdc_msg_data_add(reply_msg,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1982	&op->extent.osd_data);
				1983	break;
				1984	case CEPH_OSD_OP_LIST_WATCHERS:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1985	ceph_osdc_msg_data_add(reply_msg,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1986	&op->list_watchers.response_data);
				1987	break;
				1988
				1989	/* both */
				1990	case CEPH_OSD_OP_CALL:
				1991	WARN_ON(op->indata_len != op->cls.class_len +
				1992	op->cls.method_len +
				1993	op->cls.indata_len);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1994	ceph_osdc_msg_data_add(request_msg,
				1995	&op->cls.request_info);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1996	/* optional, can be NONE */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1997	ceph_osdc_msg_data_add(request_msg,
				1998	&op->cls.request_data);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1999	/* optional, can be NONE */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2000	ceph_osdc_msg_data_add(reply_msg,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2001	&op->cls.response_data);
				2002	break;
				2003	case CEPH_OSD_OP_NOTIFY:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2004	ceph_osdc_msg_data_add(request_msg,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2005	&op->notify.request_data);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2006	ceph_osdc_msg_data_add(reply_msg,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2007	&op->notify.response_data);
				2008	break;
				2009	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2010	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2011	}
				2012
				2013	static void encode_pgid(void *p, const struct ceph_pg pgid)
				2014	{
				2015	ceph_encode_8(p, 1);
				2016	ceph_encode_64(p, pgid->pool);
				2017	ceph_encode_32(p, pgid->seed);
				2018	ceph_encode_32(p, -1); /* preferred */
				2019	}
				2020
				2021	static void encode_spgid(void *p, const struct ceph_spg spgid)
				2022	{
				2023	ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1);
				2024	encode_pgid(p, &spgid->pgid);
				2025	ceph_encode_8(p, spgid->shard);
				2026	}
				2027
				2028	static void encode_oloc(void *p, void end,
				2029	const struct ceph_object_locator *oloc)
				2030	{
				2031	ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc));
				2032	ceph_encode_64(p, oloc->pool);
				2033	ceph_encode_32(p, -1); /* preferred */
				2034	ceph_encode_32(p, 0); /* key len */
				2035	if (oloc->pool_ns)
				2036	ceph_encode_string(p, end, oloc->pool_ns->str,
				2037	oloc->pool_ns->len);
				2038	else
				2039	ceph_encode_32(p, 0);
				2040	}
				2041
				2042	static void encode_request_partial(struct ceph_osd_request *req,
				2043	struct ceph_msg *msg)
				2044	{
				2045	void *p = msg->front.iov_base;
				2046	void *const end = p + msg->front_alloc_len;
				2047	u32 data_len = 0;
				2048	int i;
				2049
				2050	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
				2051	/* snapshots aren't writeable */
				2052	WARN_ON(req->r_snapid != CEPH_NOSNAP);
				2053	} else {
				2054	WARN_ON(req->r_mtime.tv_sec \|\| req->r_mtime.tv_nsec \|\|
				2055	req->r_data_offset \|\| req->r_snapc);
				2056	}
				2057
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2058	setup_request_data(req);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2059
				2060	encode_spgid(&p, &req->r_t.spgid); /* actual spg */
				2061	ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
				2062	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
				2063	ceph_encode_32(&p, req->r_flags);
				2064
				2065	/* reqid */
				2066	ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid));
				2067	memset(p, 0, sizeof(struct ceph_osd_reqid));
				2068	p += sizeof(struct ceph_osd_reqid);
				2069
				2070	/* trace */
				2071	memset(p, 0, sizeof(struct ceph_blkin_trace_info));
				2072	p += sizeof(struct ceph_blkin_trace_info);
				2073
				2074	ceph_encode_32(&p, 0); /* client_inc, always 0 */
				2075	ceph_encode_timespec64(p, &req->r_mtime);
				2076	p += sizeof(struct ceph_timespec);
				2077
				2078	encode_oloc(&p, end, &req->r_t.target_oloc);
				2079	ceph_encode_string(&p, end, req->r_t.target_oid.name,
				2080	req->r_t.target_oid.name_len);
				2081
				2082	/* ops, can imply data */
				2083	ceph_encode_16(&p, req->r_num_ops);
				2084	for (i = 0; i < req->r_num_ops; i++) {
				2085	data_len += osd_req_encode_op(p, &req->r_ops[i]);
				2086	p += sizeof(struct ceph_osd_op);
				2087	}
				2088
				2089	ceph_encode_64(&p, req->r_snapid); /* snapid */
				2090	if (req->r_snapc) {
				2091	ceph_encode_64(&p, req->r_snapc->seq);
				2092	ceph_encode_32(&p, req->r_snapc->num_snaps);
				2093	for (i = 0; i < req->r_snapc->num_snaps; i++)
				2094	ceph_encode_64(&p, req->r_snapc->snaps[i]);
				2095	} else {
				2096	ceph_encode_64(&p, 0); /* snap_seq */
				2097	ceph_encode_32(&p, 0); /* snaps len */
				2098	}
				2099
				2100	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
				2101	BUG_ON(p > end - 8); /* space for features */
				2102
				2103	msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
				2104	/* front_len is finalized in encode_request_finish() */
				2105	msg->front.iov_len = p - msg->front.iov_base;
				2106	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				2107	msg->hdr.data_len = cpu_to_le32(data_len);
				2108	/*
				2109	* The header "data_off" is a hint to the receiver allowing it
				2110	* to align received data into its buffers such that there's no
				2111	* need to re-copy it before writing it to disk (direct I/O).
				2112	*/
				2113	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
				2114
				2115	dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg,
				2116	req->r_t.target_oid.name, req->r_t.target_oid.name_len);
				2117	}
				2118
				2119	static void encode_request_finish(struct ceph_msg *msg)
				2120	{
				2121	void *p = msg->front.iov_base;
				2122	void *const partial_end = p + msg->front.iov_len;
				2123	void *const end = p + msg->front_alloc_len;
				2124
				2125	if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
				2126	/* luminous OSD -- encode features and be done */
				2127	p = partial_end;
				2128	ceph_encode_64(&p, msg->con->peer_features);
				2129	} else {
				2130	struct {
				2131	char spgid[CEPH_ENCODING_START_BLK_LEN +
				2132	CEPH_PGID_ENCODING_LEN + 1];
				2133	__le32 hash;
				2134	__le32 epoch;
				2135	__le32 flags;
				2136	char reqid[CEPH_ENCODING_START_BLK_LEN +
				2137	sizeof(struct ceph_osd_reqid)];
				2138	char trace[sizeof(struct ceph_blkin_trace_info)];
				2139	__le32 client_inc;
				2140	struct ceph_timespec mtime;
				2141	} __packed head;
				2142	struct ceph_pg pgid;
				2143	void oloc, oid, *tail;
				2144	int oloc_len, oid_len, tail_len;
				2145	int len;
				2146
				2147	/*
				2148	* Pre-luminous OSD -- reencode v8 into v4 using @head
				2149	* as a temporary buffer. Encode the raw PG; the rest
				2150	* is just a matter of moving oloc, oid and tail blobs
				2151	* around.
				2152	*/
				2153	memcpy(&head, p, sizeof(head));
				2154	p += sizeof(head);
				2155
				2156	oloc = p;
				2157	p += CEPH_ENCODING_START_BLK_LEN;
				2158	pgid.pool = ceph_decode_64(&p);
				2159	p += 4 + 4; /* preferred, key len */
				2160	len = ceph_decode_32(&p);
				2161	p += len; /* nspace */
				2162	oloc_len = p - oloc;
				2163
				2164	oid = p;
				2165	len = ceph_decode_32(&p);
				2166	p += len;
				2167	oid_len = p - oid;
				2168
				2169	tail = p;
				2170	tail_len = partial_end - p;
				2171
				2172	p = msg->front.iov_base;
				2173	ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
				2174	ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch));
				2175	ceph_encode_copy(&p, &head.flags, sizeof(head.flags));
				2176	ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime));
				2177
				2178	/* reassert_version */
				2179	memset(p, 0, sizeof(struct ceph_eversion));
				2180	p += sizeof(struct ceph_eversion);
				2181
				2182	BUG_ON(p >= oloc);
				2183	memmove(p, oloc, oloc_len);
				2184	p += oloc_len;
				2185
				2186	pgid.seed = le32_to_cpu(head.hash);
				2187	encode_pgid(&p, &pgid); /* raw pg */
				2188
				2189	BUG_ON(p >= oid);
				2190	memmove(p, oid, oid_len);
				2191	p += oid_len;
				2192
				2193	/* tail -- ops, snapid, snapc, retry_attempt */
				2194	BUG_ON(p >= tail);
				2195	memmove(p, tail, tail_len);
				2196	p += tail_len;
				2197
				2198	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
				2199	}
				2200
				2201	BUG_ON(p > end);
				2202	msg->front.iov_len = p - msg->front.iov_base;
				2203	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				2204
				2205	dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg,
				2206	le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len),
				2207	le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len),
				2208	le16_to_cpu(msg->hdr.version));
				2209	}
				2210
				2211	/*
				2212	* @req has to be assigned a tid and registered.
				2213	*/
				2214	static void send_request(struct ceph_osd_request *req)
				2215	{
				2216	struct ceph_osd *osd = req->r_osd;
				2217
				2218	verify_osd_locked(osd);
				2219	WARN_ON(osd->o_osd != req->r_t.osd);
				2220
				2221	/* backoff? */
				2222	if (should_plug_request(req))
				2223	return;
				2224
				2225	/*
				2226	* We may have a previously queued request message hanging
				2227	* around. Cancel it to avoid corrupting the msgr.
				2228	*/
				2229	if (req->r_sent)
				2230	ceph_msg_revoke(req->r_request);
				2231
				2232	req->r_flags \|= CEPH_OSD_FLAG_KNOWN_REDIR;
				2233	if (req->r_attempts)
				2234	req->r_flags \|= CEPH_OSD_FLAG_RETRY;
				2235	else
				2236	WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
				2237
				2238	encode_request_partial(req, req->r_request);
				2239
				2240	dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n",
				2241	__func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
				2242	req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed,
				2243	req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags,
				2244	req->r_attempts);
				2245
				2246	req->r_t.paused = false;
				2247	req->r_stamp = jiffies;
				2248	req->r_attempts++;
				2249
				2250	req->r_sent = osd->o_incarnation;
				2251	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
				2252	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
				2253	}
				2254
				2255	static void maybe_request_map(struct ceph_osd_client *osdc)
				2256	{
				2257	bool continuous = false;
				2258
				2259	verify_osdc_locked(osdc);
				2260	WARN_ON(!osdc->osdmap->epoch);
				2261
				2262	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				2263	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) \|\|
				2264	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
				2265	dout("%s osdc %p continuous\n", __func__, osdc);
				2266	continuous = true;
				2267	} else {
				2268	dout("%s osdc %p onetime\n", __func__, osdc);
				2269	}
				2270
				2271	if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
				2272	osdc->osdmap->epoch + 1, continuous))
				2273	ceph_monc_renew_subs(&osdc->client->monc);
				2274	}
				2275
				2276	static void complete_request(struct ceph_osd_request *req, int err);
				2277	static void send_map_check(struct ceph_osd_request *req);
				2278
				2279	static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
				2280	{
				2281	struct ceph_osd_client *osdc = req->r_osdc;
				2282	struct ceph_osd *osd;
				2283	enum calc_target_result ct_res;
				2284	int err = 0;
				2285	bool need_send = false;
				2286	bool promoted = false;
				2287
				2288	WARN_ON(req->r_tid);
				2289	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
				2290
				2291	again:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2292	ct_res = calc_target(osdc, &req->r_t, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2293	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
				2294	goto promote;
				2295
				2296	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
				2297	if (IS_ERR(osd)) {
				2298	WARN_ON(PTR_ERR(osd) != -EAGAIN \|\| wrlocked);
				2299	goto promote;
				2300	}
				2301
				2302	if (osdc->abort_err) {
				2303	dout("req %p abort_err %d\n", req, osdc->abort_err);
				2304	err = osdc->abort_err;
				2305	} else if (osdc->osdmap->epoch < osdc->epoch_barrier) {
				2306	dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
				2307	osdc->epoch_barrier);
				2308	req->r_t.paused = true;
				2309	maybe_request_map(osdc);
				2310	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
				2311	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
				2312	dout("req %p pausewr\n", req);
				2313	req->r_t.paused = true;
				2314	maybe_request_map(osdc);
				2315	} else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
				2316	ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
				2317	dout("req %p pauserd\n", req);
				2318	req->r_t.paused = true;
				2319	maybe_request_map(osdc);
				2320	} else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
				2321	!(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY \|
				2322	CEPH_OSD_FLAG_FULL_FORCE)) &&
				2323	(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				2324	pool_full(osdc, req->r_t.base_oloc.pool))) {
				2325	dout("req %p full/pool_full\n", req);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2326	if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2327	err = -ENOSPC;
				2328	} else {
				2329	pr_warn_ratelimited("FULL or reached pool quota\n");
				2330	req->r_t.paused = true;
				2331	maybe_request_map(osdc);
				2332	}
				2333	} else if (!osd_homeless(osd)) {
				2334	need_send = true;
				2335	} else {
				2336	maybe_request_map(osdc);
				2337	}
				2338
				2339	mutex_lock(&osd->lock);
				2340	/*
				2341	* Assign the tid atomically with send_request() to protect
				2342	* multiple writes to the same object from racing with each
				2343	* other, resulting in out of order ops on the OSDs.
				2344	*/
				2345	req->r_tid = atomic64_inc_return(&osdc->last_tid);
				2346	link_request(osd, req);
				2347	if (need_send)
				2348	send_request(req);
				2349	else if (err)
				2350	complete_request(req, err);
				2351	mutex_unlock(&osd->lock);
				2352
				2353	if (!err && ct_res == CALC_TARGET_POOL_DNE)
				2354	send_map_check(req);
				2355
				2356	if (promoted)
				2357	downgrade_write(&osdc->lock);
				2358	return;
				2359
				2360	promote:
				2361	up_read(&osdc->lock);
				2362	down_write(&osdc->lock);
				2363	wrlocked = true;
				2364	promoted = true;
				2365	goto again;
				2366	}
				2367
				2368	static void account_request(struct ceph_osd_request *req)
				2369	{
				2370	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK \| CEPH_OSD_FLAG_ONDISK));
				2371	WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ \| CEPH_OSD_FLAG_WRITE)));
				2372
				2373	req->r_flags \|= CEPH_OSD_FLAG_ONDISK;
				2374	atomic_inc(&req->r_osdc->num_requests);
				2375
				2376	req->r_start_stamp = jiffies;
				2377	}
				2378
				2379	static void submit_request(struct ceph_osd_request *req, bool wrlocked)
				2380	{
				2381	ceph_osdc_get_request(req);
				2382	account_request(req);
				2383	__submit_request(req, wrlocked);
				2384	}
				2385
				2386	static void finish_request(struct ceph_osd_request *req)
				2387	{
				2388	struct ceph_osd_client *osdc = req->r_osdc;
				2389
				2390	WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
				2391	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				2392
				2393	if (req->r_osd)
				2394	unlink_request(req->r_osd, req);
				2395	atomic_dec(&osdc->num_requests);
				2396
				2397	/*
				2398	* If an OSD has failed or returned and a request has been sent
				2399	* twice, it's possible to get a reply and end up here while the
				2400	* request message is queued for delivery. We will ignore the
				2401	* reply, so not a big deal, but better to try and catch it.
				2402	*/
				2403	ceph_msg_revoke(req->r_request);
				2404	ceph_msg_revoke_incoming(req->r_reply);
				2405	}
				2406
				2407	static void __complete_request(struct ceph_osd_request *req)
				2408	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2409	dout("%s req %p tid %llu cb %ps result %d\n", __func__, req,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2410	req->r_tid, req->r_callback, req->r_result);
				2411
				2412	if (req->r_callback)
				2413	req->r_callback(req);
				2414	complete_all(&req->r_completion);
				2415	ceph_osdc_put_request(req);
				2416	}
				2417
				2418	static void complete_request_workfn(struct work_struct *work)
				2419	{
				2420	struct ceph_osd_request *req =
				2421	container_of(work, struct ceph_osd_request, r_complete_work);
				2422
				2423	__complete_request(req);
				2424	}
				2425
				2426	/*
				2427	* This is open-coded in handle_reply().
				2428	*/
				2429	static void complete_request(struct ceph_osd_request *req, int err)
				2430	{
				2431	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
				2432
				2433	req->r_result = err;
				2434	finish_request(req);
				2435
				2436	INIT_WORK(&req->r_complete_work, complete_request_workfn);
				2437	queue_work(req->r_osdc->completion_wq, &req->r_complete_work);
				2438	}
				2439
				2440	static void cancel_map_check(struct ceph_osd_request *req)
				2441	{
				2442	struct ceph_osd_client *osdc = req->r_osdc;
				2443	struct ceph_osd_request *lookup_req;
				2444
				2445	verify_osdc_wrlocked(osdc);
				2446
				2447	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
				2448	if (!lookup_req)
				2449	return;
				2450
				2451	WARN_ON(lookup_req != req);
				2452	erase_request_mc(&osdc->map_checks, req);
				2453	ceph_osdc_put_request(req);
				2454	}
				2455
				2456	static void cancel_request(struct ceph_osd_request *req)
				2457	{
				2458	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				2459
				2460	cancel_map_check(req);
				2461	finish_request(req);
				2462	complete_all(&req->r_completion);
				2463	ceph_osdc_put_request(req);
				2464	}
				2465
				2466	static void abort_request(struct ceph_osd_request *req, int err)
				2467	{
				2468	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
				2469
				2470	cancel_map_check(req);
				2471	complete_request(req, err);
				2472	}
				2473
				2474	static int abort_fn(struct ceph_osd_request req, void arg)
				2475	{
				2476	int err = (int )arg;
				2477
				2478	abort_request(req, err);
				2479	return 0; /* continue iteration */
				2480	}
				2481
				2482	/*
				2483	* Abort all in-flight requests with @err and arrange for all future
				2484	* requests to be failed immediately.
				2485	*/
				2486	void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
				2487	{
				2488	dout("%s osdc %p err %d\n", __func__, osdc, err);
				2489	down_write(&osdc->lock);
				2490	for_each_request(osdc, abort_fn, &err);
				2491	osdc->abort_err = err;
				2492	up_write(&osdc->lock);
				2493	}
				2494	EXPORT_SYMBOL(ceph_osdc_abort_requests);
				2495
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2496	void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
				2497	{
				2498	down_write(&osdc->lock);
				2499	osdc->abort_err = 0;
				2500	up_write(&osdc->lock);
				2501	}
				2502	EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
				2503
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2504	static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
				2505	{
				2506	if (likely(eb > osdc->epoch_barrier)) {
				2507	dout("updating epoch_barrier from %u to %u\n",
				2508	osdc->epoch_barrier, eb);
				2509	osdc->epoch_barrier = eb;
				2510	/* Request map if we're not to the barrier yet */
				2511	if (eb > osdc->osdmap->epoch)
				2512	maybe_request_map(osdc);
				2513	}
				2514	}
				2515
				2516	void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
				2517	{
				2518	down_read(&osdc->lock);
				2519	if (unlikely(eb > osdc->epoch_barrier)) {
				2520	up_read(&osdc->lock);
				2521	down_write(&osdc->lock);
				2522	update_epoch_barrier(osdc, eb);
				2523	up_write(&osdc->lock);
				2524	} else {
				2525	up_read(&osdc->lock);
				2526	}
				2527	}
				2528	EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier);
				2529
				2530	/*
				2531	* We can end up releasing caps as a result of abort_request().
				2532	* In that case, we probably want to ensure that the cap release message
				2533	* has an updated epoch barrier in it, so set the epoch barrier prior to
				2534	* aborting the first request.
				2535	*/
				2536	static int abort_on_full_fn(struct ceph_osd_request req, void arg)
				2537	{
				2538	struct ceph_osd_client *osdc = req->r_osdc;
				2539	bool *victims = arg;
				2540
				2541	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
				2542	(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				2543	pool_full(osdc, req->r_t.base_oloc.pool))) {
				2544	if (!*victims) {
				2545	update_epoch_barrier(osdc, osdc->osdmap->epoch);
				2546	*victims = true;
				2547	}
				2548	abort_request(req, -ENOSPC);
				2549	}
				2550
				2551	return 0; /* continue iteration */
				2552	}
				2553
				2554	/*
				2555	* Drop all pending requests that are stalled waiting on a full condition to
				2556	* clear, and complete them with ENOSPC as the return code. Set the
				2557	* osdc->epoch_barrier to the latest map epoch that we've seen if any were
				2558	* cancelled.
				2559	*/
				2560	static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
				2561	{
				2562	bool victims = false;
				2563
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2564	if (ceph_test_opt(osdc->client, ABORT_ON_FULL) &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2565	(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\| have_pool_full(osdc)))
				2566	for_each_request(osdc, abort_on_full_fn, &victims);
				2567	}
				2568
				2569	static void check_pool_dne(struct ceph_osd_request *req)
				2570	{
				2571	struct ceph_osd_client *osdc = req->r_osdc;
				2572	struct ceph_osdmap *map = osdc->osdmap;
				2573
				2574	verify_osdc_wrlocked(osdc);
				2575	WARN_ON(!map->epoch);
				2576
				2577	if (req->r_attempts) {
				2578	/*
				2579	* We sent a request earlier, which means that
				2580	* previously the pool existed, and now it does not
				2581	* (i.e., it was deleted).
				2582	*/
				2583	req->r_map_dne_bound = map->epoch;
				2584	dout("%s req %p tid %llu pool disappeared\n", __func__, req,
				2585	req->r_tid);
				2586	} else {
				2587	dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
				2588	req, req->r_tid, req->r_map_dne_bound, map->epoch);
				2589	}
				2590
				2591	if (req->r_map_dne_bound) {
				2592	if (map->epoch >= req->r_map_dne_bound) {
				2593	/* we had a new enough map */
				2594	pr_info_ratelimited("tid %llu pool does not exist\n",
				2595	req->r_tid);
				2596	complete_request(req, -ENOENT);
				2597	}
				2598	} else {
				2599	send_map_check(req);
				2600	}
				2601	}
				2602
				2603	static void map_check_cb(struct ceph_mon_generic_request *greq)
				2604	{
				2605	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
				2606	struct ceph_osd_request *req;
				2607	u64 tid = greq->private_data;
				2608
				2609	WARN_ON(greq->result \|\| !greq->u.newest);
				2610
				2611	down_write(&osdc->lock);
				2612	req = lookup_request_mc(&osdc->map_checks, tid);
				2613	if (!req) {
				2614	dout("%s tid %llu dne\n", __func__, tid);
				2615	goto out_unlock;
				2616	}
				2617
				2618	dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
				2619	req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
				2620	if (!req->r_map_dne_bound)
				2621	req->r_map_dne_bound = greq->u.newest;
				2622	erase_request_mc(&osdc->map_checks, req);
				2623	check_pool_dne(req);
				2624
				2625	ceph_osdc_put_request(req);
				2626	out_unlock:
				2627	up_write(&osdc->lock);
				2628	}
				2629
				2630	static void send_map_check(struct ceph_osd_request *req)
				2631	{
				2632	struct ceph_osd_client *osdc = req->r_osdc;
				2633	struct ceph_osd_request *lookup_req;
				2634	int ret;
				2635
				2636	verify_osdc_wrlocked(osdc);
				2637
				2638	lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
				2639	if (lookup_req) {
				2640	WARN_ON(lookup_req != req);
				2641	return;
				2642	}
				2643
				2644	ceph_osdc_get_request(req);
				2645	insert_request_mc(&osdc->map_checks, req);
				2646	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
				2647	map_check_cb, req->r_tid);
				2648	WARN_ON(ret);
				2649	}
				2650
				2651	/*
				2652	* lingering requests, watch/notify v2 infrastructure
				2653	*/
				2654	static void linger_release(struct kref *kref)
				2655	{
				2656	struct ceph_osd_linger_request *lreq =
				2657	container_of(kref, struct ceph_osd_linger_request, kref);
				2658
				2659	dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
				2660	lreq->reg_req, lreq->ping_req);
				2661	WARN_ON(!RB_EMPTY_NODE(&lreq->node));
				2662	WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
				2663	WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
				2664	WARN_ON(!list_empty(&lreq->scan_item));
				2665	WARN_ON(!list_empty(&lreq->pending_lworks));
				2666	WARN_ON(lreq->osd);
				2667
				2668	if (lreq->reg_req)
				2669	ceph_osdc_put_request(lreq->reg_req);
				2670	if (lreq->ping_req)
				2671	ceph_osdc_put_request(lreq->ping_req);
				2672	target_destroy(&lreq->t);
				2673	kfree(lreq);
				2674	}
				2675
				2676	static void linger_put(struct ceph_osd_linger_request *lreq)
				2677	{
				2678	if (lreq)
				2679	kref_put(&lreq->kref, linger_release);
				2680	}
				2681
				2682	static struct ceph_osd_linger_request *
				2683	linger_get(struct ceph_osd_linger_request *lreq)
				2684	{
				2685	kref_get(&lreq->kref);
				2686	return lreq;
				2687	}
				2688
				2689	static struct ceph_osd_linger_request *
				2690	linger_alloc(struct ceph_osd_client *osdc)
				2691	{
				2692	struct ceph_osd_linger_request *lreq;
				2693
				2694	lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
				2695	if (!lreq)
				2696	return NULL;
				2697
				2698	kref_init(&lreq->kref);
				2699	mutex_init(&lreq->lock);
				2700	RB_CLEAR_NODE(&lreq->node);
				2701	RB_CLEAR_NODE(&lreq->osdc_node);
				2702	RB_CLEAR_NODE(&lreq->mc_node);
				2703	INIT_LIST_HEAD(&lreq->scan_item);
				2704	INIT_LIST_HEAD(&lreq->pending_lworks);
				2705	init_completion(&lreq->reg_commit_wait);
				2706	init_completion(&lreq->notify_finish_wait);
				2707
				2708	lreq->osdc = osdc;
				2709	target_init(&lreq->t);
				2710
				2711	dout("%s lreq %p\n", __func__, lreq);
				2712	return lreq;
				2713	}
				2714
				2715	DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
				2716	DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
				2717	DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
				2718
				2719	/*
				2720	* Create linger request <-> OSD session relation.
				2721	*
				2722	* @lreq has to be registered, @osd may be homeless.
				2723	*/
				2724	static void link_linger(struct ceph_osd *osd,
				2725	struct ceph_osd_linger_request *lreq)
				2726	{
				2727	verify_osd_locked(osd);
				2728	WARN_ON(!lreq->linger_id \|\| lreq->osd);
				2729	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
				2730	osd->o_osd, lreq, lreq->linger_id);
				2731
				2732	if (!osd_homeless(osd))
				2733	__remove_osd_from_lru(osd);
				2734	else
				2735	atomic_inc(&osd->o_osdc->num_homeless);
				2736
				2737	get_osd(osd);
				2738	insert_linger(&osd->o_linger_requests, lreq);
				2739	lreq->osd = osd;
				2740	}
				2741
				2742	static void unlink_linger(struct ceph_osd *osd,
				2743	struct ceph_osd_linger_request *lreq)
				2744	{
				2745	verify_osd_locked(osd);
				2746	WARN_ON(lreq->osd != osd);
				2747	dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
				2748	osd->o_osd, lreq, lreq->linger_id);
				2749
				2750	lreq->osd = NULL;
				2751	erase_linger(&osd->o_linger_requests, lreq);
				2752	put_osd(osd);
				2753
				2754	if (!osd_homeless(osd))
				2755	maybe_move_osd_to_lru(osd);
				2756	else
				2757	atomic_dec(&osd->o_osdc->num_homeless);
				2758	}
				2759
				2760	static bool __linger_registered(struct ceph_osd_linger_request *lreq)
				2761	{
				2762	verify_osdc_locked(lreq->osdc);
				2763
				2764	return !RB_EMPTY_NODE(&lreq->osdc_node);
				2765	}
				2766
				2767	static bool linger_registered(struct ceph_osd_linger_request *lreq)
				2768	{
				2769	struct ceph_osd_client *osdc = lreq->osdc;
				2770	bool registered;
				2771
				2772	down_read(&osdc->lock);
				2773	registered = __linger_registered(lreq);
				2774	up_read(&osdc->lock);
				2775
				2776	return registered;
				2777	}
				2778
				2779	static void linger_register(struct ceph_osd_linger_request *lreq)
				2780	{
				2781	struct ceph_osd_client *osdc = lreq->osdc;
				2782
				2783	verify_osdc_wrlocked(osdc);
				2784	WARN_ON(lreq->linger_id);
				2785
				2786	linger_get(lreq);
				2787	lreq->linger_id = ++osdc->last_linger_id;
				2788	insert_linger_osdc(&osdc->linger_requests, lreq);
				2789	}
				2790
				2791	static void linger_unregister(struct ceph_osd_linger_request *lreq)
				2792	{
				2793	struct ceph_osd_client *osdc = lreq->osdc;
				2794
				2795	verify_osdc_wrlocked(osdc);
				2796
				2797	erase_linger_osdc(&osdc->linger_requests, lreq);
				2798	linger_put(lreq);
				2799	}
				2800
				2801	static void cancel_linger_request(struct ceph_osd_request *req)
				2802	{
				2803	struct ceph_osd_linger_request *lreq = req->r_priv;
				2804
				2805	WARN_ON(!req->r_linger);
				2806	cancel_request(req);
				2807	linger_put(lreq);
				2808	}
				2809
				2810	struct linger_work {
				2811	struct work_struct work;
				2812	struct ceph_osd_linger_request *lreq;
				2813	struct list_head pending_item;
				2814	unsigned long queued_stamp;
				2815
				2816	union {
				2817	struct {
				2818	u64 notify_id;
				2819	u64 notifier_id;
				2820	void payload; / points into @msg front */
				2821	size_t payload_len;
				2822
				2823	struct ceph_msg msg; / for ceph_msg_put() */
				2824	} notify;
				2825	struct {
				2826	int err;
				2827	} error;
				2828	};
				2829	};
				2830
				2831	static struct linger_work lwork_alloc(struct ceph_osd_linger_request lreq,
				2832	work_func_t workfn)
				2833	{
				2834	struct linger_work *lwork;
				2835
				2836	lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
				2837	if (!lwork)
				2838	return NULL;
				2839
				2840	INIT_WORK(&lwork->work, workfn);
				2841	INIT_LIST_HEAD(&lwork->pending_item);
				2842	lwork->lreq = linger_get(lreq);
				2843
				2844	return lwork;
				2845	}
				2846
				2847	static void lwork_free(struct linger_work *lwork)
				2848	{
				2849	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2850
				2851	mutex_lock(&lreq->lock);
				2852	list_del(&lwork->pending_item);
				2853	mutex_unlock(&lreq->lock);
				2854
				2855	linger_put(lreq);
				2856	kfree(lwork);
				2857	}
				2858
				2859	static void lwork_queue(struct linger_work *lwork)
				2860	{
				2861	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2862	struct ceph_osd_client *osdc = lreq->osdc;
				2863
				2864	verify_lreq_locked(lreq);
				2865	WARN_ON(!list_empty(&lwork->pending_item));
				2866
				2867	lwork->queued_stamp = jiffies;
				2868	list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
				2869	queue_work(osdc->notify_wq, &lwork->work);
				2870	}
				2871
				2872	static void do_watch_notify(struct work_struct *w)
				2873	{
				2874	struct linger_work *lwork = container_of(w, struct linger_work, work);
				2875	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2876
				2877	if (!linger_registered(lreq)) {
				2878	dout("%s lreq %p not registered\n", __func__, lreq);
				2879	goto out;
				2880	}
				2881
				2882	WARN_ON(!lreq->is_watch);
				2883	dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
				2884	__func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
				2885	lwork->notify.payload_len);
				2886	lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
				2887	lwork->notify.notifier_id, lwork->notify.payload,
				2888	lwork->notify.payload_len);
				2889
				2890	out:
				2891	ceph_msg_put(lwork->notify.msg);
				2892	lwork_free(lwork);
				2893	}
				2894
				2895	static void do_watch_error(struct work_struct *w)
				2896	{
				2897	struct linger_work *lwork = container_of(w, struct linger_work, work);
				2898	struct ceph_osd_linger_request *lreq = lwork->lreq;
				2899
				2900	if (!linger_registered(lreq)) {
				2901	dout("%s lreq %p not registered\n", __func__, lreq);
				2902	goto out;
				2903	}
				2904
				2905	dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
				2906	lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
				2907
				2908	out:
				2909	lwork_free(lwork);
				2910	}
				2911
				2912	static void queue_watch_error(struct ceph_osd_linger_request *lreq)
				2913	{
				2914	struct linger_work *lwork;
				2915
				2916	lwork = lwork_alloc(lreq, do_watch_error);
				2917	if (!lwork) {
				2918	pr_err("failed to allocate error-lwork\n");
				2919	return;
				2920	}
				2921
				2922	lwork->error.err = lreq->last_error;
				2923	lwork_queue(lwork);
				2924	}
				2925
				2926	static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
				2927	int result)
				2928	{
				2929	if (!completion_done(&lreq->reg_commit_wait)) {
				2930	lreq->reg_commit_error = (result <= 0 ? result : 0);
				2931	complete_all(&lreq->reg_commit_wait);
				2932	}
				2933	}
				2934
				2935	static void linger_commit_cb(struct ceph_osd_request *req)
				2936	{
				2937	struct ceph_osd_linger_request *lreq = req->r_priv;
				2938
				2939	mutex_lock(&lreq->lock);
				2940	dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
				2941	lreq->linger_id, req->r_result);
				2942	linger_reg_commit_complete(lreq, req->r_result);
				2943	lreq->committed = true;
				2944
				2945	if (!lreq->is_watch) {
				2946	struct ceph_osd_data *osd_data =
				2947	osd_req_op_data(req, 0, notify, response_data);
				2948	void *p = page_address(osd_data->pages[0]);
				2949
				2950	WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY \|\|
				2951	osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
				2952
				2953	/* make note of the notify_id */
				2954	if (req->r_ops[0].outdata_len >= sizeof(u64)) {
				2955	lreq->notify_id = ceph_decode_64(&p);
				2956	dout("lreq %p notify_id %llu\n", lreq,
				2957	lreq->notify_id);
				2958	} else {
				2959	dout("lreq %p no notify_id\n", lreq);
				2960	}
				2961	}
				2962
				2963	mutex_unlock(&lreq->lock);
				2964	linger_put(lreq);
				2965	}
				2966
				2967	static int normalize_watch_error(int err)
				2968	{
				2969	/*
				2970	* Translate ENOENT -> ENOTCONN so that a delete->disconnection
				2971	* notification and a failure to reconnect because we raced with
				2972	* the delete appear the same to the user.
				2973	*/
				2974	if (err == -ENOENT)
				2975	err = -ENOTCONN;
				2976
				2977	return err;
				2978	}
				2979
				2980	static void linger_reconnect_cb(struct ceph_osd_request *req)
				2981	{
				2982	struct ceph_osd_linger_request *lreq = req->r_priv;
				2983
				2984	mutex_lock(&lreq->lock);
				2985	dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
				2986	lreq, lreq->linger_id, req->r_result, lreq->last_error);
				2987	if (req->r_result < 0) {
				2988	if (!lreq->last_error) {
				2989	lreq->last_error = normalize_watch_error(req->r_result);
				2990	queue_watch_error(lreq);
				2991	}
				2992	}
				2993
				2994	mutex_unlock(&lreq->lock);
				2995	linger_put(lreq);
				2996	}
				2997
				2998	static void send_linger(struct ceph_osd_linger_request *lreq)
				2999	{
				3000	struct ceph_osd_request *req = lreq->reg_req;
				3001	struct ceph_osd_req_op *op = &req->r_ops[0];
				3002
				3003	verify_osdc_wrlocked(req->r_osdc);
				3004	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
				3005
				3006	if (req->r_osd)
				3007	cancel_linger_request(req);
				3008
				3009	request_reinit(req);
				3010	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
				3011	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
				3012	req->r_flags = lreq->t.flags;
				3013	req->r_mtime = lreq->mtime;
				3014
				3015	mutex_lock(&lreq->lock);
				3016	if (lreq->is_watch && lreq->committed) {
				3017	WARN_ON(op->op != CEPH_OSD_OP_WATCH \|\|
				3018	op->watch.cookie != lreq->linger_id);
				3019	op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
				3020	op->watch.gen = ++lreq->register_gen;
				3021	dout("lreq %p reconnect register_gen %u\n", lreq,
				3022	op->watch.gen);
				3023	req->r_callback = linger_reconnect_cb;
				3024	} else {
				3025	if (!lreq->is_watch)
				3026	lreq->notify_id = 0;
				3027	else
				3028	WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
				3029	dout("lreq %p register\n", lreq);
				3030	req->r_callback = linger_commit_cb;
				3031	}
				3032	mutex_unlock(&lreq->lock);
				3033
				3034	req->r_priv = linger_get(lreq);
				3035	req->r_linger = true;
				3036
				3037	submit_request(req, true);
				3038	}
				3039
				3040	static void linger_ping_cb(struct ceph_osd_request *req)
				3041	{
				3042	struct ceph_osd_linger_request *lreq = req->r_priv;
				3043
				3044	mutex_lock(&lreq->lock);
				3045	dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
				3046	__func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
				3047	lreq->last_error);
				3048	if (lreq->register_gen == req->r_ops[0].watch.gen) {
				3049	if (!req->r_result) {
				3050	lreq->watch_valid_thru = lreq->ping_sent;
				3051	} else if (!lreq->last_error) {
				3052	lreq->last_error = normalize_watch_error(req->r_result);
				3053	queue_watch_error(lreq);
				3054	}
				3055	} else {
				3056	dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
				3057	lreq->register_gen, req->r_ops[0].watch.gen);
				3058	}
				3059
				3060	mutex_unlock(&lreq->lock);
				3061	linger_put(lreq);
				3062	}
				3063
				3064	static void send_linger_ping(struct ceph_osd_linger_request *lreq)
				3065	{
				3066	struct ceph_osd_client *osdc = lreq->osdc;
				3067	struct ceph_osd_request *req = lreq->ping_req;
				3068	struct ceph_osd_req_op *op = &req->r_ops[0];
				3069
				3070	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
				3071	dout("%s PAUSERD\n", __func__);
				3072	return;
				3073	}
				3074
				3075	lreq->ping_sent = jiffies;
				3076	dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
				3077	__func__, lreq, lreq->linger_id, lreq->ping_sent,
				3078	lreq->register_gen);
				3079
				3080	if (req->r_osd)
				3081	cancel_linger_request(req);
				3082
				3083	request_reinit(req);
				3084	target_copy(&req->r_t, &lreq->t);
				3085
				3086	WARN_ON(op->op != CEPH_OSD_OP_WATCH \|\|
				3087	op->watch.cookie != lreq->linger_id \|\|
				3088	op->watch.op != CEPH_OSD_WATCH_OP_PING);
				3089	op->watch.gen = lreq->register_gen;
				3090	req->r_callback = linger_ping_cb;
				3091	req->r_priv = linger_get(lreq);
				3092	req->r_linger = true;
				3093
				3094	ceph_osdc_get_request(req);
				3095	account_request(req);
				3096	req->r_tid = atomic64_inc_return(&osdc->last_tid);
				3097	link_request(lreq->osd, req);
				3098	send_request(req);
				3099	}
				3100
				3101	static void linger_submit(struct ceph_osd_linger_request *lreq)
				3102	{
				3103	struct ceph_osd_client *osdc = lreq->osdc;
				3104	struct ceph_osd *osd;
				3105
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3106	down_write(&osdc->lock);
				3107	linger_register(lreq);
				3108	if (lreq->is_watch) {
				3109	lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id;
				3110	lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id;
				3111	} else {
				3112	lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
				3113	}
				3114
				3115	calc_target(osdc, &lreq->t, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3116	osd = lookup_create_osd(osdc, lreq->t.osd, true);
				3117	link_linger(osd, lreq);
				3118
				3119	send_linger(lreq);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3120	up_write(&osdc->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3121	}
				3122
				3123	static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
				3124	{
				3125	struct ceph_osd_client *osdc = lreq->osdc;
				3126	struct ceph_osd_linger_request *lookup_lreq;
				3127
				3128	verify_osdc_wrlocked(osdc);
				3129
				3130	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
				3131	lreq->linger_id);
				3132	if (!lookup_lreq)
				3133	return;
				3134
				3135	WARN_ON(lookup_lreq != lreq);
				3136	erase_linger_mc(&osdc->linger_map_checks, lreq);
				3137	linger_put(lreq);
				3138	}
				3139
				3140	/*
				3141	* @lreq has to be both registered and linked.
				3142	*/
				3143	static void __linger_cancel(struct ceph_osd_linger_request *lreq)
				3144	{
				3145	if (lreq->is_watch && lreq->ping_req->r_osd)
				3146	cancel_linger_request(lreq->ping_req);
				3147	if (lreq->reg_req->r_osd)
				3148	cancel_linger_request(lreq->reg_req);
				3149	cancel_linger_map_check(lreq);
				3150	unlink_linger(lreq->osd, lreq);
				3151	linger_unregister(lreq);
				3152	}
				3153
				3154	static void linger_cancel(struct ceph_osd_linger_request *lreq)
				3155	{
				3156	struct ceph_osd_client *osdc = lreq->osdc;
				3157
				3158	down_write(&osdc->lock);
				3159	if (__linger_registered(lreq))
				3160	__linger_cancel(lreq);
				3161	up_write(&osdc->lock);
				3162	}
				3163
				3164	static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
				3165
				3166	static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
				3167	{
				3168	struct ceph_osd_client *osdc = lreq->osdc;
				3169	struct ceph_osdmap *map = osdc->osdmap;
				3170
				3171	verify_osdc_wrlocked(osdc);
				3172	WARN_ON(!map->epoch);
				3173
				3174	if (lreq->register_gen) {
				3175	lreq->map_dne_bound = map->epoch;
				3176	dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
				3177	lreq, lreq->linger_id);
				3178	} else {
				3179	dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
				3180	__func__, lreq, lreq->linger_id, lreq->map_dne_bound,
				3181	map->epoch);
				3182	}
				3183
				3184	if (lreq->map_dne_bound) {
				3185	if (map->epoch >= lreq->map_dne_bound) {
				3186	/* we had a new enough map */
				3187	pr_info("linger_id %llu pool does not exist\n",
				3188	lreq->linger_id);
				3189	linger_reg_commit_complete(lreq, -ENOENT);
				3190	__linger_cancel(lreq);
				3191	}
				3192	} else {
				3193	send_linger_map_check(lreq);
				3194	}
				3195	}
				3196
				3197	static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
				3198	{
				3199	struct ceph_osd_client *osdc = &greq->monc->client->osdc;
				3200	struct ceph_osd_linger_request *lreq;
				3201	u64 linger_id = greq->private_data;
				3202
				3203	WARN_ON(greq->result \|\| !greq->u.newest);
				3204
				3205	down_write(&osdc->lock);
				3206	lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
				3207	if (!lreq) {
				3208	dout("%s linger_id %llu dne\n", __func__, linger_id);
				3209	goto out_unlock;
				3210	}
				3211
				3212	dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
				3213	__func__, lreq, lreq->linger_id, lreq->map_dne_bound,
				3214	greq->u.newest);
				3215	if (!lreq->map_dne_bound)
				3216	lreq->map_dne_bound = greq->u.newest;
				3217	erase_linger_mc(&osdc->linger_map_checks, lreq);
				3218	check_linger_pool_dne(lreq);
				3219
				3220	linger_put(lreq);
				3221	out_unlock:
				3222	up_write(&osdc->lock);
				3223	}
				3224
				3225	static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
				3226	{
				3227	struct ceph_osd_client *osdc = lreq->osdc;
				3228	struct ceph_osd_linger_request *lookup_lreq;
				3229	int ret;
				3230
				3231	verify_osdc_wrlocked(osdc);
				3232
				3233	lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
				3234	lreq->linger_id);
				3235	if (lookup_lreq) {
				3236	WARN_ON(lookup_lreq != lreq);
				3237	return;
				3238	}
				3239
				3240	linger_get(lreq);
				3241	insert_linger_mc(&osdc->linger_map_checks, lreq);
				3242	ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
				3243	linger_map_check_cb, lreq->linger_id);
				3244	WARN_ON(ret);
				3245	}
				3246
				3247	static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
				3248	{
				3249	int ret;
				3250
				3251	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
				3252	ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
				3253	return ret ?: lreq->reg_commit_error;
				3254	}
				3255
				3256	static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
				3257	{
				3258	int ret;
				3259
				3260	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
				3261	ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
				3262	return ret ?: lreq->notify_finish_error;
				3263	}
				3264
				3265	/*
				3266	* Timeout callback, called every N seconds. When 1 or more OSD
				3267	* requests has been active for more than N seconds, we send a keepalive
				3268	* (tag + timestamp) to its OSD to ensure any communications channel
				3269	* reset is detected.
				3270	*/
				3271	static void handle_timeout(struct work_struct *work)
				3272	{
				3273	struct ceph_osd_client *osdc =
				3274	container_of(work, struct ceph_osd_client, timeout_work.work);
				3275	struct ceph_options *opts = osdc->client->options;
				3276	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
				3277	unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
				3278	LIST_HEAD(slow_osds);
				3279	struct rb_node n, p;
				3280
				3281	dout("%s osdc %p\n", __func__, osdc);
				3282	down_write(&osdc->lock);
				3283
				3284	/*
				3285	* ping osds that are a bit slow. this ensures that if there
				3286	* is a break in the TCP connection we will notice, and reopen
				3287	* a connection with that osd (from the fault callback).
				3288	*/
				3289	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				3290	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				3291	bool found = false;
				3292
				3293	for (p = rb_first(&osd->o_requests); p; ) {
				3294	struct ceph_osd_request *req =
				3295	rb_entry(p, struct ceph_osd_request, r_node);
				3296
				3297	p = rb_next(p); /* abort_request() */
				3298
				3299	if (time_before(req->r_stamp, cutoff)) {
				3300	dout(" req %p tid %llu on osd%d is laggy\n",
				3301	req, req->r_tid, osd->o_osd);
				3302	found = true;
				3303	}
				3304	if (opts->osd_request_timeout &&
				3305	time_before(req->r_start_stamp, expiry_cutoff)) {
				3306	pr_err_ratelimited("tid %llu on osd%d timeout\n",
				3307	req->r_tid, osd->o_osd);
				3308	abort_request(req, -ETIMEDOUT);
				3309	}
				3310	}
				3311	for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
				3312	struct ceph_osd_linger_request *lreq =
				3313	rb_entry(p, struct ceph_osd_linger_request, node);
				3314
				3315	dout(" lreq %p linger_id %llu is served by osd%d\n",
				3316	lreq, lreq->linger_id, osd->o_osd);
				3317	found = true;
				3318
				3319	mutex_lock(&lreq->lock);
				3320	if (lreq->is_watch && lreq->committed && !lreq->last_error)
				3321	send_linger_ping(lreq);
				3322	mutex_unlock(&lreq->lock);
				3323	}
				3324
				3325	if (found)
				3326	list_move_tail(&osd->o_keepalive_item, &slow_osds);
				3327	}
				3328
				3329	if (opts->osd_request_timeout) {
				3330	for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
				3331	struct ceph_osd_request *req =
				3332	rb_entry(p, struct ceph_osd_request, r_node);
				3333
				3334	p = rb_next(p); /* abort_request() */
				3335
				3336	if (time_before(req->r_start_stamp, expiry_cutoff)) {
				3337	pr_err_ratelimited("tid %llu on osd%d timeout\n",
				3338	req->r_tid, osdc->homeless_osd.o_osd);
				3339	abort_request(req, -ETIMEDOUT);
				3340	}
				3341	}
				3342	}
				3343
				3344	if (atomic_read(&osdc->num_homeless) \|\| !list_empty(&slow_osds))
				3345	maybe_request_map(osdc);
				3346
				3347	while (!list_empty(&slow_osds)) {
				3348	struct ceph_osd *osd = list_first_entry(&slow_osds,
				3349	struct ceph_osd,
				3350	o_keepalive_item);
				3351	list_del_init(&osd->o_keepalive_item);
				3352	ceph_con_keepalive(&osd->o_con);
				3353	}
				3354
				3355	up_write(&osdc->lock);
				3356	schedule_delayed_work(&osdc->timeout_work,
				3357	osdc->client->options->osd_keepalive_timeout);
				3358	}
				3359
				3360	static void handle_osds_timeout(struct work_struct *work)
				3361	{
				3362	struct ceph_osd_client *osdc =
				3363	container_of(work, struct ceph_osd_client,
				3364	osds_timeout_work.work);
				3365	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
				3366	struct ceph_osd osd, nosd;
				3367
				3368	dout("%s osdc %p\n", __func__, osdc);
				3369	down_write(&osdc->lock);
				3370	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
				3371	if (time_before(jiffies, osd->lru_ttl))
				3372	break;
				3373
				3374	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
				3375	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
				3376	close_osd(osd);
				3377	}
				3378
				3379	up_write(&osdc->lock);
				3380	schedule_delayed_work(&osdc->osds_timeout_work,
				3381	round_jiffies_relative(delay));
				3382	}
				3383
				3384	static int ceph_oloc_decode(void *p, void end,
				3385	struct ceph_object_locator *oloc)
				3386	{
				3387	u8 struct_v, struct_cv;
				3388	u32 len;
				3389	void *struct_end;
				3390	int ret = 0;
				3391
				3392	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
				3393	struct_v = ceph_decode_8(p);
				3394	struct_cv = ceph_decode_8(p);
				3395	if (struct_v < 3) {
				3396	pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
				3397	struct_v, struct_cv);
				3398	goto e_inval;
				3399	}
				3400	if (struct_cv > 6) {
				3401	pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
				3402	struct_v, struct_cv);
				3403	goto e_inval;
				3404	}
				3405	len = ceph_decode_32(p);
				3406	ceph_decode_need(p, end, len, e_inval);
				3407	struct_end = *p + len;
				3408
				3409	oloc->pool = ceph_decode_64(p);
				3410	p += 4; / skip preferred */
				3411
				3412	len = ceph_decode_32(p);
				3413	if (len > 0) {
				3414	pr_warn("ceph_object_locator::key is set\n");
				3415	goto e_inval;
				3416	}
				3417
				3418	if (struct_v >= 5) {
				3419	bool changed = false;
				3420
				3421	len = ceph_decode_32(p);
				3422	if (len > 0) {
				3423	ceph_decode_need(p, end, len, e_inval);
				3424	if (!oloc->pool_ns \|\|
				3425	ceph_compare_string(oloc->pool_ns, *p, len))
				3426	changed = true;
				3427	*p += len;
				3428	} else {
				3429	if (oloc->pool_ns)
				3430	changed = true;
				3431	}
				3432	if (changed) {
				3433	/* redirect changes namespace */
				3434	pr_warn("ceph_object_locator::nspace is changed\n");
				3435	goto e_inval;
				3436	}
				3437	}
				3438
				3439	if (struct_v >= 6) {
				3440	s64 hash = ceph_decode_64(p);
				3441	if (hash != -1) {
				3442	pr_warn("ceph_object_locator::hash is set\n");
				3443	goto e_inval;
				3444	}
				3445	}
				3446
				3447	/* skip the rest */
				3448	*p = struct_end;
				3449	out:
				3450	return ret;
				3451
				3452	e_inval:
				3453	ret = -EINVAL;
				3454	goto out;
				3455	}
				3456
				3457	static int ceph_redirect_decode(void *p, void end,
				3458	struct ceph_request_redirect *redir)
				3459	{
				3460	u8 struct_v, struct_cv;
				3461	u32 len;
				3462	void *struct_end;
				3463	int ret;
				3464
				3465	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
				3466	struct_v = ceph_decode_8(p);
				3467	struct_cv = ceph_decode_8(p);
				3468	if (struct_cv > 1) {
				3469	pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
				3470	struct_v, struct_cv);
				3471	goto e_inval;
				3472	}
				3473	len = ceph_decode_32(p);
				3474	ceph_decode_need(p, end, len, e_inval);
				3475	struct_end = *p + len;
				3476
				3477	ret = ceph_oloc_decode(p, end, &redir->oloc);
				3478	if (ret)
				3479	goto out;
				3480
				3481	len = ceph_decode_32(p);
				3482	if (len > 0) {
				3483	pr_warn("ceph_request_redirect::object_name is set\n");
				3484	goto e_inval;
				3485	}
				3486
				3487	len = ceph_decode_32(p);
				3488	p += len; / skip osd_instructions */
				3489
				3490	/* skip the rest */
				3491	*p = struct_end;
				3492	out:
				3493	return ret;
				3494
				3495	e_inval:
				3496	ret = -EINVAL;
				3497	goto out;
				3498	}
				3499
				3500	struct MOSDOpReply {
				3501	struct ceph_pg pgid;
				3502	u64 flags;
				3503	int result;
				3504	u32 epoch;
				3505	int num_ops;
				3506	u32 outdata_len[CEPH_OSD_MAX_OPS];
				3507	s32 rval[CEPH_OSD_MAX_OPS];
				3508	int retry_attempt;
				3509	struct ceph_eversion replay_version;
				3510	u64 user_version;
				3511	struct ceph_request_redirect redirect;
				3512	};
				3513
				3514	static int decode_MOSDOpReply(const struct ceph_msg msg, struct MOSDOpReply m)
				3515	{
				3516	void *p = msg->front.iov_base;
				3517	void *const end = p + msg->front.iov_len;
				3518	u16 version = le16_to_cpu(msg->hdr.version);
				3519	struct ceph_eversion bad_replay_version;
				3520	u8 decode_redir;
				3521	u32 len;
				3522	int ret;
				3523	int i;
				3524
				3525	ceph_decode_32_safe(&p, end, len, e_inval);
				3526	ceph_decode_need(&p, end, len, e_inval);
				3527	p += len; /* skip oid */
				3528
				3529	ret = ceph_decode_pgid(&p, end, &m->pgid);
				3530	if (ret)
				3531	return ret;
				3532
				3533	ceph_decode_64_safe(&p, end, m->flags, e_inval);
				3534	ceph_decode_32_safe(&p, end, m->result, e_inval);
				3535	ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
				3536	memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
				3537	p += sizeof(bad_replay_version);
				3538	ceph_decode_32_safe(&p, end, m->epoch, e_inval);
				3539
				3540	ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
				3541	if (m->num_ops > ARRAY_SIZE(m->outdata_len))
				3542	goto e_inval;
				3543
				3544	ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
				3545	e_inval);
				3546	for (i = 0; i < m->num_ops; i++) {
				3547	struct ceph_osd_op *op = p;
				3548
				3549	m->outdata_len[i] = le32_to_cpu(op->payload_len);
				3550	p += sizeof(*op);
				3551	}
				3552
				3553	ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
				3554	for (i = 0; i < m->num_ops; i++)
				3555	ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
				3556
				3557	if (version >= 5) {
				3558	ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
				3559	memcpy(&m->replay_version, p, sizeof(m->replay_version));
				3560	p += sizeof(m->replay_version);
				3561	ceph_decode_64_safe(&p, end, m->user_version, e_inval);
				3562	} else {
				3563	m->replay_version = bad_replay_version; /* struct */
				3564	m->user_version = le64_to_cpu(m->replay_version.version);
				3565	}
				3566
				3567	if (version >= 6) {
				3568	if (version >= 7)
				3569	ceph_decode_8_safe(&p, end, decode_redir, e_inval);
				3570	else
				3571	decode_redir = 1;
				3572	} else {
				3573	decode_redir = 0;
				3574	}
				3575
				3576	if (decode_redir) {
				3577	ret = ceph_redirect_decode(&p, end, &m->redirect);
				3578	if (ret)
				3579	return ret;
				3580	} else {
				3581	ceph_oloc_init(&m->redirect.oloc);
				3582	}
				3583
				3584	return 0;
				3585
				3586	e_inval:
				3587	return -EINVAL;
				3588	}
				3589
				3590	/*
				3591	* Handle MOSDOpReply. Set ->r_result and call the callback if it is
				3592	* specified.
				3593	*/
				3594	static void handle_reply(struct ceph_osd osd, struct ceph_msg msg)
				3595	{
				3596	struct ceph_osd_client *osdc = osd->o_osdc;
				3597	struct ceph_osd_request *req;
				3598	struct MOSDOpReply m;
				3599	u64 tid = le64_to_cpu(msg->hdr.tid);
				3600	u32 data_len = 0;
				3601	int ret;
				3602	int i;
				3603
				3604	dout("%s msg %p tid %llu\n", __func__, msg, tid);
				3605
				3606	down_read(&osdc->lock);
				3607	if (!osd_registered(osd)) {
				3608	dout("%s osd%d unknown\n", __func__, osd->o_osd);
				3609	goto out_unlock_osdc;
				3610	}
				3611	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
				3612
				3613	mutex_lock(&osd->lock);
				3614	req = lookup_request(&osd->o_requests, tid);
				3615	if (!req) {
				3616	dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
				3617	goto out_unlock_session;
				3618	}
				3619
				3620	m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
				3621	ret = decode_MOSDOpReply(msg, &m);
				3622	m.redirect.oloc.pool_ns = NULL;
				3623	if (ret) {
				3624	pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
				3625	req->r_tid, ret);
				3626	ceph_msg_dump(msg);
				3627	goto fail_request;
				3628	}
				3629	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
				3630	__func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
				3631	m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
				3632	le64_to_cpu(m.replay_version.version), m.user_version);
				3633
				3634	if (m.retry_attempt >= 0) {
				3635	if (m.retry_attempt != req->r_attempts - 1) {
				3636	dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
				3637	req, req->r_tid, m.retry_attempt,
				3638	req->r_attempts - 1);
				3639	goto out_unlock_session;
				3640	}
				3641	} else {
				3642	WARN_ON(1); /* MOSDOpReply v4 is assumed */
				3643	}
				3644
				3645	if (!ceph_oloc_empty(&m.redirect.oloc)) {
				3646	dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
				3647	m.redirect.oloc.pool);
				3648	unlink_request(osd, req);
				3649	mutex_unlock(&osd->lock);
				3650
				3651	/*
				3652	* Not ceph_oloc_copy() - changing pool_ns is not
				3653	* supported.
				3654	*/
				3655	req->r_t.target_oloc.pool = m.redirect.oloc.pool;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3656	req->r_flags \|= CEPH_OSD_FLAG_REDIRECTED \|
				3657	CEPH_OSD_FLAG_IGNORE_OVERLAY \|
				3658	CEPH_OSD_FLAG_IGNORE_CACHE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3659	req->r_tid = 0;
				3660	__submit_request(req, false);
				3661	goto out_unlock_osdc;
				3662	}
				3663
				3664	if (m.num_ops != req->r_num_ops) {
				3665	pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
				3666	req->r_num_ops, req->r_tid);
				3667	goto fail_request;
				3668	}
				3669	for (i = 0; i < req->r_num_ops; i++) {
				3670	dout(" req %p tid %llu op %d rval %d len %u\n", req,
				3671	req->r_tid, i, m.rval[i], m.outdata_len[i]);
				3672	req->r_ops[i].rval = m.rval[i];
				3673	req->r_ops[i].outdata_len = m.outdata_len[i];
				3674	data_len += m.outdata_len[i];
				3675	}
				3676	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
				3677	pr_err("sum of lens %u != %u for tid %llu\n", data_len,
				3678	le32_to_cpu(msg->hdr.data_len), req->r_tid);
				3679	goto fail_request;
				3680	}
				3681	dout("%s req %p tid %llu result %d data_len %u\n", __func__,
				3682	req, req->r_tid, m.result, data_len);
				3683
				3684	/*
				3685	* Since we only ever request ONDISK, we should only ever get
				3686	* one (type of) reply back.
				3687	*/
				3688	WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
				3689	req->r_result = m.result ?: data_len;
				3690	finish_request(req);
				3691	mutex_unlock(&osd->lock);
				3692	up_read(&osdc->lock);
				3693
				3694	__complete_request(req);
				3695	return;
				3696
				3697	fail_request:
				3698	complete_request(req, -EIO);
				3699	out_unlock_session:
				3700	mutex_unlock(&osd->lock);
				3701	out_unlock_osdc:
				3702	up_read(&osdc->lock);
				3703	}
				3704
				3705	static void set_pool_was_full(struct ceph_osd_client *osdc)
				3706	{
				3707	struct rb_node *n;
				3708
				3709	for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
				3710	struct ceph_pg_pool_info *pi =
				3711	rb_entry(n, struct ceph_pg_pool_info, node);
				3712
				3713	pi->was_full = __pool_full(pi);
				3714	}
				3715	}
				3716
				3717	static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
				3718	{
				3719	struct ceph_pg_pool_info *pi;
				3720
				3721	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
				3722	if (!pi)
				3723	return false;
				3724
				3725	return pi->was_full && !__pool_full(pi);
				3726	}
				3727
				3728	static enum calc_target_result
				3729	recalc_linger_target(struct ceph_osd_linger_request *lreq)
				3730	{
				3731	struct ceph_osd_client *osdc = lreq->osdc;
				3732	enum calc_target_result ct_res;
				3733
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3734	ct_res = calc_target(osdc, &lreq->t, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3735	if (ct_res == CALC_TARGET_NEED_RESEND) {
				3736	struct ceph_osd *osd;
				3737
				3738	osd = lookup_create_osd(osdc, lreq->t.osd, true);
				3739	if (osd != lreq->osd) {
				3740	unlink_linger(lreq->osd, lreq);
				3741	link_linger(osd, lreq);
				3742	}
				3743	}
				3744
				3745	return ct_res;
				3746	}
				3747
				3748	/*
				3749	* Requeue requests whose mapping to an OSD has changed.
				3750	*/
				3751	static void scan_requests(struct ceph_osd *osd,
				3752	bool force_resend,
				3753	bool cleared_full,
				3754	bool check_pool_cleared_full,
				3755	struct rb_root *need_resend,
				3756	struct list_head *need_resend_linger)
				3757	{
				3758	struct ceph_osd_client *osdc = osd->o_osdc;
				3759	struct rb_node *n;
				3760	bool force_resend_writes;
				3761
				3762	for (n = rb_first(&osd->o_linger_requests); n; ) {
				3763	struct ceph_osd_linger_request *lreq =
				3764	rb_entry(n, struct ceph_osd_linger_request, node);
				3765	enum calc_target_result ct_res;
				3766
				3767	n = rb_next(n); /* recalc_linger_target() */
				3768
				3769	dout("%s lreq %p linger_id %llu\n", __func__, lreq,
				3770	lreq->linger_id);
				3771	ct_res = recalc_linger_target(lreq);
				3772	switch (ct_res) {
				3773	case CALC_TARGET_NO_ACTION:
				3774	force_resend_writes = cleared_full \|\|
				3775	(check_pool_cleared_full &&
				3776	pool_cleared_full(osdc, lreq->t.base_oloc.pool));
				3777	if (!force_resend && !force_resend_writes)
				3778	break;
				3779
				3780	/* fall through */
				3781	case CALC_TARGET_NEED_RESEND:
				3782	cancel_linger_map_check(lreq);
				3783	/*
				3784	* scan_requests() for the previous epoch(s)
				3785	* may have already added it to the list, since
				3786	* it's not unlinked here.
				3787	*/
				3788	if (list_empty(&lreq->scan_item))
				3789	list_add_tail(&lreq->scan_item, need_resend_linger);
				3790	break;
				3791	case CALC_TARGET_POOL_DNE:
				3792	list_del_init(&lreq->scan_item);
				3793	check_linger_pool_dne(lreq);
				3794	break;
				3795	}
				3796	}
				3797
				3798	for (n = rb_first(&osd->o_requests); n; ) {
				3799	struct ceph_osd_request *req =
				3800	rb_entry(n, struct ceph_osd_request, r_node);
				3801	enum calc_target_result ct_res;
				3802
				3803	n = rb_next(n); /* unlink_request(), check_pool_dne() */
				3804
				3805	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3806	ct_res = calc_target(osdc, &req->r_t, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3807	switch (ct_res) {
				3808	case CALC_TARGET_NO_ACTION:
				3809	force_resend_writes = cleared_full \|\|
				3810	(check_pool_cleared_full &&
				3811	pool_cleared_full(osdc, req->r_t.base_oloc.pool));
				3812	if (!force_resend &&
				3813	(!(req->r_flags & CEPH_OSD_FLAG_WRITE) \|\|
				3814	!force_resend_writes))
				3815	break;
				3816
				3817	/* fall through */
				3818	case CALC_TARGET_NEED_RESEND:
				3819	cancel_map_check(req);
				3820	unlink_request(osd, req);
				3821	insert_request(need_resend, req);
				3822	break;
				3823	case CALC_TARGET_POOL_DNE:
				3824	check_pool_dne(req);
				3825	break;
				3826	}
				3827	}
				3828	}
				3829
				3830	static int handle_one_map(struct ceph_osd_client *osdc,
				3831	void p, void end, bool incremental,
				3832	struct rb_root *need_resend,
				3833	struct list_head *need_resend_linger)
				3834	{
				3835	struct ceph_osdmap *newmap;
				3836	struct rb_node *n;
				3837	bool skipped_map = false;
				3838	bool was_full;
				3839
				3840	was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
				3841	set_pool_was_full(osdc);
				3842
				3843	if (incremental)
				3844	newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
				3845	else
				3846	newmap = ceph_osdmap_decode(&p, end);
				3847	if (IS_ERR(newmap))
				3848	return PTR_ERR(newmap);
				3849
				3850	if (newmap != osdc->osdmap) {
				3851	/*
				3852	* Preserve ->was_full before destroying the old map.
				3853	* For pools that weren't in the old map, ->was_full
				3854	* should be false.
				3855	*/
				3856	for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
				3857	struct ceph_pg_pool_info *pi =
				3858	rb_entry(n, struct ceph_pg_pool_info, node);
				3859	struct ceph_pg_pool_info *old_pi;
				3860
				3861	old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
				3862	if (old_pi)
				3863	pi->was_full = old_pi->was_full;
				3864	else
				3865	WARN_ON(pi->was_full);
				3866	}
				3867
				3868	if (osdc->osdmap->epoch &&
				3869	osdc->osdmap->epoch + 1 < newmap->epoch) {
				3870	WARN_ON(incremental);
				3871	skipped_map = true;
				3872	}
				3873
				3874	ceph_osdmap_destroy(osdc->osdmap);
				3875	osdc->osdmap = newmap;
				3876	}
				3877
				3878	was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
				3879	scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
				3880	need_resend, need_resend_linger);
				3881
				3882	for (n = rb_first(&osdc->osds); n; ) {
				3883	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				3884
				3885	n = rb_next(n); /* close_osd() */
				3886
				3887	scan_requests(osd, skipped_map, was_full, true, need_resend,
				3888	need_resend_linger);
				3889	if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) \|\|
				3890	memcmp(&osd->o_con.peer_addr,
				3891	ceph_osd_addr(osdc->osdmap, osd->o_osd),
				3892	sizeof(struct ceph_entity_addr)))
				3893	close_osd(osd);
				3894	}
				3895
				3896	return 0;
				3897	}
				3898
				3899	static void kick_requests(struct ceph_osd_client *osdc,
				3900	struct rb_root *need_resend,
				3901	struct list_head *need_resend_linger)
				3902	{
				3903	struct ceph_osd_linger_request lreq, nlreq;
				3904	enum calc_target_result ct_res;
				3905	struct rb_node *n;
				3906
				3907	/* make sure need_resend targets reflect latest map */
				3908	for (n = rb_first(need_resend); n; ) {
				3909	struct ceph_osd_request *req =
				3910	rb_entry(n, struct ceph_osd_request, r_node);
				3911
				3912	n = rb_next(n);
				3913
				3914	if (req->r_t.epoch < osdc->osdmap->epoch) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3915	ct_res = calc_target(osdc, &req->r_t, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3916	if (ct_res == CALC_TARGET_POOL_DNE) {
				3917	erase_request(need_resend, req);
				3918	check_pool_dne(req);
				3919	}
				3920	}
				3921	}
				3922
				3923	for (n = rb_first(need_resend); n; ) {
				3924	struct ceph_osd_request *req =
				3925	rb_entry(n, struct ceph_osd_request, r_node);
				3926	struct ceph_osd *osd;
				3927
				3928	n = rb_next(n);
				3929	erase_request(need_resend, req); /* before link_request() */
				3930
				3931	osd = lookup_create_osd(osdc, req->r_t.osd, true);
				3932	link_request(osd, req);
				3933	if (!req->r_linger) {
				3934	if (!osd_homeless(osd) && !req->r_t.paused)
				3935	send_request(req);
				3936	} else {
				3937	cancel_linger_request(req);
				3938	}
				3939	}
				3940
				3941	list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
				3942	if (!osd_homeless(lreq->osd))
				3943	send_linger(lreq);
				3944
				3945	list_del_init(&lreq->scan_item);
				3946	}
				3947	}
				3948
				3949	/*
				3950	* Process updated osd map.
				3951	*
				3952	* The message contains any number of incremental and full maps, normally
				3953	* indicating some sort of topology change in the cluster. Kick requests
				3954	* off to different OSDs as needed.
				3955	*/
				3956	void ceph_osdc_handle_map(struct ceph_osd_client osdc, struct ceph_msg msg)
				3957	{
				3958	void *p = msg->front.iov_base;
				3959	void *const end = p + msg->front.iov_len;
				3960	u32 nr_maps, maplen;
				3961	u32 epoch;
				3962	struct ceph_fsid fsid;
				3963	struct rb_root need_resend = RB_ROOT;
				3964	LIST_HEAD(need_resend_linger);
				3965	bool handled_incremental = false;
				3966	bool was_pauserd, was_pausewr;
				3967	bool pauserd, pausewr;
				3968	int err;
				3969
				3970	dout("%s have %u\n", __func__, osdc->osdmap->epoch);
				3971	down_write(&osdc->lock);
				3972
				3973	/* verify fsid */
				3974	ceph_decode_need(&p, end, sizeof(fsid), bad);
				3975	ceph_decode_copy(&p, &fsid, sizeof(fsid));
				3976	if (ceph_check_fsid(osdc->client, &fsid) < 0)
				3977	goto bad;
				3978
				3979	was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
				3980	was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) \|\|
				3981	ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				3982	have_pool_full(osdc);
				3983
				3984	/* incremental maps */
				3985	ceph_decode_32_safe(&p, end, nr_maps, bad);
				3986	dout(" %d inc maps\n", nr_maps);
				3987	while (nr_maps > 0) {
				3988	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				3989	epoch = ceph_decode_32(&p);
				3990	maplen = ceph_decode_32(&p);
				3991	ceph_decode_need(&p, end, maplen, bad);
				3992	if (osdc->osdmap->epoch &&
				3993	osdc->osdmap->epoch + 1 == epoch) {
				3994	dout("applying incremental map %u len %d\n",
				3995	epoch, maplen);
				3996	err = handle_one_map(osdc, p, p + maplen, true,
				3997	&need_resend, &need_resend_linger);
				3998	if (err)
				3999	goto bad;
				4000	handled_incremental = true;
				4001	} else {
				4002	dout("ignoring incremental map %u len %d\n",
				4003	epoch, maplen);
				4004	}
				4005	p += maplen;
				4006	nr_maps--;
				4007	}
				4008	if (handled_incremental)
				4009	goto done;
				4010
				4011	/* full maps */
				4012	ceph_decode_32_safe(&p, end, nr_maps, bad);
				4013	dout(" %d full maps\n", nr_maps);
				4014	while (nr_maps) {
				4015	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
				4016	epoch = ceph_decode_32(&p);
				4017	maplen = ceph_decode_32(&p);
				4018	ceph_decode_need(&p, end, maplen, bad);
				4019	if (nr_maps > 1) {
				4020	dout("skipping non-latest full map %u len %d\n",
				4021	epoch, maplen);
				4022	} else if (osdc->osdmap->epoch >= epoch) {
				4023	dout("skipping full map %u len %d, "
				4024	"older than our %u\n", epoch, maplen,
				4025	osdc->osdmap->epoch);
				4026	} else {
				4027	dout("taking full map %u len %d\n", epoch, maplen);
				4028	err = handle_one_map(osdc, p, p + maplen, false,
				4029	&need_resend, &need_resend_linger);
				4030	if (err)
				4031	goto bad;
				4032	}
				4033	p += maplen;
				4034	nr_maps--;
				4035	}
				4036
				4037	done:
				4038	/*
				4039	* subscribe to subsequent osdmap updates if full to ensure
				4040	* we find out when we are no longer full and stop returning
				4041	* ENOSPC.
				4042	*/
				4043	pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
				4044	pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) \|\|
				4045	ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) \|\|
				4046	have_pool_full(osdc);
				4047	if (was_pauserd \|\| was_pausewr \|\| pauserd \|\| pausewr \|\|
				4048	osdc->osdmap->epoch < osdc->epoch_barrier)
				4049	maybe_request_map(osdc);
				4050
				4051	kick_requests(osdc, &need_resend, &need_resend_linger);
				4052
				4053	ceph_osdc_abort_on_full(osdc);
				4054	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
				4055	osdc->osdmap->epoch);
				4056	up_write(&osdc->lock);
				4057	wake_up_all(&osdc->client->auth_wq);
				4058	return;
				4059
				4060	bad:
				4061	pr_err("osdc handle_map corrupt msg\n");
				4062	ceph_msg_dump(msg);
				4063	up_write(&osdc->lock);
				4064	}
				4065
				4066	/*
				4067	* Resubmit requests pending on the given osd.
				4068	*/
				4069	static void kick_osd_requests(struct ceph_osd *osd)
				4070	{
				4071	struct rb_node *n;
				4072
				4073	clear_backoffs(osd);
				4074
				4075	for (n = rb_first(&osd->o_requests); n; ) {
				4076	struct ceph_osd_request *req =
				4077	rb_entry(n, struct ceph_osd_request, r_node);
				4078
				4079	n = rb_next(n); /* cancel_linger_request() */
				4080
				4081	if (!req->r_linger) {
				4082	if (!req->r_t.paused)
				4083	send_request(req);
				4084	} else {
				4085	cancel_linger_request(req);
				4086	}
				4087	}
				4088	for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
				4089	struct ceph_osd_linger_request *lreq =
				4090	rb_entry(n, struct ceph_osd_linger_request, node);
				4091
				4092	send_linger(lreq);
				4093	}
				4094	}
				4095
				4096	/*
				4097	* If the osd connection drops, we need to resubmit all requests.
				4098	*/
				4099	static void osd_fault(struct ceph_connection *con)
				4100	{
				4101	struct ceph_osd *osd = con->private;
				4102	struct ceph_osd_client *osdc = osd->o_osdc;
				4103
				4104	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
				4105
				4106	down_write(&osdc->lock);
				4107	if (!osd_registered(osd)) {
				4108	dout("%s osd%d unknown\n", __func__, osd->o_osd);
				4109	goto out_unlock;
				4110	}
				4111
				4112	if (!reopen_osd(osd))
				4113	kick_osd_requests(osd);
				4114	maybe_request_map(osdc);
				4115
				4116	out_unlock:
				4117	up_write(&osdc->lock);
				4118	}
				4119
				4120	struct MOSDBackoff {	/* decoded fields of an MOSDBackoff message */
				4121	struct ceph_spg spgid;
				4122	u32 map_epoch;	/* echoed back in the ack built by handle_backoff_block() */
				4123	u8 op;	/* handle_backoff() acts on CEPH_OSD_BACKOFF_OP_BLOCK / _UNBLOCK */
				4124	u64 id;	/* key used for the osd->o_backoffs_by_id lookup */
				4125	struct ceph_hobject_id *begin;	/* kzalloc'ed by decode_MOSDBackoff() */
				4126	struct ceph_hobject_id *end;	/* kzalloc'ed by decode_MOSDBackoff() */
				4127	};
				4128
				4129	static int decode_MOSDBackoff(const struct ceph_msg msg, struct MOSDBackoff m)
				4130	{
				4131	void *p = msg->front.iov_base;
				4132	void *const end = p + msg->front.iov_len;
				4133	u8 struct_v;
				4134	u32 struct_len;
				4135	int ret;
				4136
				4137	ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
				4138	if (ret)
				4139	return ret;
				4140
				4141	ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
				4142	if (ret)
				4143	return ret;
				4144
				4145	ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
				4146	ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
				4147	ceph_decode_8_safe(&p, end, m->op, e_inval);
				4148	ceph_decode_64_safe(&p, end, m->id, e_inval);
				4149
				4150	m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
				4151	if (!m->begin)
				4152	return -ENOMEM;
				4153
				4154	ret = decode_hoid(&p, end, m->begin);
				4155	if (ret) {
				4156	free_hoid(m->begin);
				4157	return ret;
				4158	}
				4159
				4160	m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
				4161	if (!m->end) {
				4162	free_hoid(m->begin);
				4163	return -ENOMEM;
				4164	}
				4165
				4166	ret = decode_hoid(&p, end, m->end);
				4167	if (ret) {
				4168	free_hoid(m->begin);
				4169	free_hoid(m->end);
				4170	return ret;
				4171	}
				4172
				4173	return 0;
				4174
				4175	e_inval:
				4176	return -EINVAL;
				4177	}
				4178
				4179	static struct ceph_msg *create_backoff_message(
				4180	const struct ceph_osd_backoff *backoff,
				4181	u32 map_epoch)
				4182	{
				4183	struct ceph_msg *msg;
				4184	void p, end;
				4185	int msg_size;
				4186
				4187	msg_size = CEPH_ENCODING_START_BLK_LEN +
				4188	CEPH_PGID_ENCODING_LEN + 1; /* spgid */
				4189	msg_size += 4 + 1 + 8; /* map_epoch, op, id */
				4190	msg_size += CEPH_ENCODING_START_BLK_LEN +
				4191	hoid_encoding_size(backoff->begin);
				4192	msg_size += CEPH_ENCODING_START_BLK_LEN +
				4193	hoid_encoding_size(backoff->end);
				4194
				4195	msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true);
				4196	if (!msg)
				4197	return NULL;
				4198
				4199	p = msg->front.iov_base;
				4200	end = p + msg->front_alloc_len;
				4201
				4202	encode_spgid(&p, &backoff->spgid);
				4203	ceph_encode_32(&p, map_epoch);
				4204	ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
				4205	ceph_encode_64(&p, backoff->id);
				4206	encode_hoid(&p, end, backoff->begin);
				4207	encode_hoid(&p, end, backoff->end);
				4208	BUG_ON(p != end);
				4209
				4210	msg->front.iov_len = p - msg->front.iov_base;
				4211	msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */
				4212	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
				4213
				4214	return msg;
				4215	}
				4216
				4217	static void handle_backoff_block(struct ceph_osd osd, struct MOSDBackoff m)
				4218	{
				4219	struct ceph_spg_mapping *spg;
				4220	struct ceph_osd_backoff *backoff;
				4221	struct ceph_msg *msg;
				4222
				4223	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
				4224	m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
				4225
				4226	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
				4227	if (!spg) {
				4228	spg = alloc_spg_mapping();
				4229	if (!spg) {
				4230	pr_err("%s failed to allocate spg\n", __func__);
				4231	return;
				4232	}
				4233	spg->spgid = m->spgid; /* struct */
				4234	insert_spg_mapping(&osd->o_backoff_mappings, spg);
				4235	}
				4236
				4237	backoff = alloc_backoff();
				4238	if (!backoff) {
				4239	pr_err("%s failed to allocate backoff\n", __func__);
				4240	return;
				4241	}
				4242	backoff->spgid = m->spgid; /* struct */
				4243	backoff->id = m->id;
				4244	backoff->begin = m->begin;
				4245	m->begin = NULL; /* backoff now owns this */
				4246	backoff->end = m->end;
				4247	m->end = NULL; /* ditto */
				4248
				4249	insert_backoff(&spg->backoffs, backoff);
				4250	insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);
				4251
				4252	/*
				4253	* Ack with original backoff's epoch so that the OSD can
				4254	* discard this if there was a PG split.
				4255	*/
				4256	msg = create_backoff_message(backoff, m->map_epoch);
				4257	if (!msg) {
				4258	pr_err("%s failed to allocate msg\n", __func__);
				4259	return;
				4260	}
				4261	ceph_con_send(&osd->o_con, msg);
				4262	}
				4263
				4264	static bool target_contained_by(const struct ceph_osd_request_target *t,
				4265	const struct ceph_hobject_id *begin,
				4266	const struct ceph_hobject_id *end)
				4267	{
				4268	struct ceph_hobject_id hoid;
				4269	int cmp;
				4270
				4271	hoid_fill_from_target(&hoid, t);
				4272	cmp = hoid_compare(&hoid, begin);
				4273	return !cmp \|\| (cmp > 0 && hoid_compare(&hoid, end) < 0);
				4274	}
				4275
				4276	static void handle_backoff_unblock(struct ceph_osd *osd,
				4277	const struct MOSDBackoff *m)
				4278	{
				4279	struct ceph_spg_mapping *spg;
				4280	struct ceph_osd_backoff *backoff;
				4281	struct rb_node *n;
				4282
				4283	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
				4284	m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
				4285
				4286	backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id);
				4287	if (!backoff) {
				4288	pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n",
				4289	__func__, osd->o_osd, m->spgid.pgid.pool,
				4290	m->spgid.pgid.seed, m->spgid.shard, m->id);
				4291	return;
				4292	}
				4293
				4294	if (hoid_compare(backoff->begin, m->begin) &&
				4295	hoid_compare(backoff->end, m->end)) {
				4296	pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n",
				4297	__func__, osd->o_osd, m->spgid.pgid.pool,
				4298	m->spgid.pgid.seed, m->spgid.shard, m->id);
				4299	/* unblock it anyway... */
				4300	}
				4301
				4302	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid);
				4303	BUG_ON(!spg);
				4304
				4305	erase_backoff(&spg->backoffs, backoff);
				4306	erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
				4307	free_backoff(backoff);
				4308
				4309	if (RB_EMPTY_ROOT(&spg->backoffs)) {
				4310	erase_spg_mapping(&osd->o_backoff_mappings, spg);
				4311	free_spg_mapping(spg);
				4312	}
				4313
				4314	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
				4315	struct ceph_osd_request *req =
				4316	rb_entry(n, struct ceph_osd_request, r_node);
				4317
				4318	if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) {
				4319	/*
				4320	* Match against @m, not @backoff -- the PG may
				4321	* have split on the OSD.
				4322	*/
				4323	if (target_contained_by(&req->r_t, m->begin, m->end)) {
				4324	/*
				4325	* If no other installed backoff applies,
				4326	* resend.
				4327	*/
				4328	send_request(req);
				4329	}
				4330	}
				4331	}
				4332	}
				4333
				4334	static void handle_backoff(struct ceph_osd osd, struct ceph_msg msg)
				4335	{
				4336	struct ceph_osd_client *osdc = osd->o_osdc;
				4337	struct MOSDBackoff m;
				4338	int ret;
				4339
				4340	down_read(&osdc->lock);
				4341	if (!osd_registered(osd)) {
				4342	dout("%s osd%d unknown\n", __func__, osd->o_osd);
				4343	up_read(&osdc->lock);
				4344	return;
				4345	}
				4346	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
				4347
				4348	mutex_lock(&osd->lock);
				4349	ret = decode_MOSDBackoff(msg, &m);
				4350	if (ret) {
				4351	pr_err("failed to decode MOSDBackoff: %d\n", ret);
				4352	ceph_msg_dump(msg);
				4353	goto out_unlock;
				4354	}
				4355
				4356	switch (m.op) {
				4357	case CEPH_OSD_BACKOFF_OP_BLOCK:
				4358	handle_backoff_block(osd, &m);
				4359	break;
				4360	case CEPH_OSD_BACKOFF_OP_UNBLOCK:
				4361	handle_backoff_unblock(osd, &m);
				4362	break;
				4363	default:
				4364	pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op);
				4365	}
				4366
				4367	free_hoid(m.begin);
				4368	free_hoid(m.end);
				4369
				4370	out_unlock:
				4371	mutex_unlock(&osd->lock);
				4372	up_read(&osdc->lock);
				4373	}
				4374
				4375	/*
				4376	* Process osd watch notifications
				4377	*/
				4378	static void handle_watch_notify(struct ceph_osd_client *osdc,
				4379	struct ceph_msg *msg)
				4380	{
				4381	void *p = msg->front.iov_base;
				4382	void *const end = p + msg->front.iov_len;
				4383	struct ceph_osd_linger_request *lreq;
				4384	struct linger_work *lwork;
				4385	u8 proto_ver, opcode;
				4386	u64 cookie, notify_id;
				4387	u64 notifier_id = 0;
				4388	s32 return_code = 0;
				4389	void *payload = NULL;
				4390	u32 payload_len = 0;
				4391
				4392	ceph_decode_8_safe(&p, end, proto_ver, bad);
				4393	ceph_decode_8_safe(&p, end, opcode, bad);
				4394	ceph_decode_64_safe(&p, end, cookie, bad);
				4395	p += 8; /* skip ver */
				4396	ceph_decode_64_safe(&p, end, notify_id, bad);
				4397
				4398	if (proto_ver >= 1) {
				4399	ceph_decode_32_safe(&p, end, payload_len, bad);
				4400	ceph_decode_need(&p, end, payload_len, bad);
				4401	payload = p;
				4402	p += payload_len;
				4403	}
				4404
				4405	if (le16_to_cpu(msg->hdr.version) >= 2)
				4406	ceph_decode_32_safe(&p, end, return_code, bad);
				4407
				4408	if (le16_to_cpu(msg->hdr.version) >= 3)
				4409	ceph_decode_64_safe(&p, end, notifier_id, bad);
				4410
				4411	down_read(&osdc->lock);
				4412	lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
				4413	if (!lreq) {
				4414	dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
				4415	cookie);
				4416	goto out_unlock_osdc;
				4417	}
				4418
				4419	mutex_lock(&lreq->lock);
				4420	dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
				4421	opcode, cookie, lreq, lreq->is_watch);
				4422	if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
				4423	if (!lreq->last_error) {
				4424	lreq->last_error = -ENOTCONN;
				4425	queue_watch_error(lreq);
				4426	}
				4427	} else if (!lreq->is_watch) {
				4428	/* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
				4429	if (lreq->notify_id && lreq->notify_id != notify_id) {
				4430	dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
				4431	lreq->notify_id, notify_id);
				4432	} else if (!completion_done(&lreq->notify_finish_wait)) {
				4433	struct ceph_msg_data *data =
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4434	msg->num_data_items ? &msg->data[0] : NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4435
				4436	if (data) {
				4437	if (lreq->preply_pages) {
				4438	WARN_ON(data->type !=
				4439	CEPH_MSG_DATA_PAGES);
				4440	*lreq->preply_pages = data->pages;
				4441	*lreq->preply_len = data->length;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4442	data->own_pages = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4443	}
				4444	}
				4445	lreq->notify_finish_error = return_code;
				4446	complete_all(&lreq->notify_finish_wait);
				4447	}
				4448	} else {
				4449	/* CEPH_WATCH_EVENT_NOTIFY */
				4450	lwork = lwork_alloc(lreq, do_watch_notify);
				4451	if (!lwork) {
				4452	pr_err("failed to allocate notify-lwork\n");
				4453	goto out_unlock_lreq;
				4454	}
				4455
				4456	lwork->notify.notify_id = notify_id;
				4457	lwork->notify.notifier_id = notifier_id;
				4458	lwork->notify.payload = payload;
				4459	lwork->notify.payload_len = payload_len;
				4460	lwork->notify.msg = ceph_msg_get(msg);
				4461	lwork_queue(lwork);
				4462	}
				4463
				4464	out_unlock_lreq:
				4465	mutex_unlock(&lreq->lock);
				4466	out_unlock_osdc:
				4467	up_read(&osdc->lock);
				4468	return;
				4469
				4470	bad:
				4471	pr_err("osdc handle_watch_notify corrupt msg\n");
				4472	}
				4473
				4474	/*
				4475	* Register request, send initial attempt.
				4476	*/
				4477	int ceph_osdc_start_request(struct ceph_osd_client *osdc,
				4478	struct ceph_osd_request *req,
				4479	bool nofail)
				4480	{
				4481	down_read(&osdc->lock);
				4482	submit_request(req, false);
				4483	up_read(&osdc->lock);
				4484
				4485	return 0;
				4486	}
				4487	EXPORT_SYMBOL(ceph_osdc_start_request);
				4488
				4489	/*
				4490	* Unregister a registered request. The request is not completed:
				4491	* ->r_result isn't set and __complete_request() isn't called.
				4492	*/
				4493	void ceph_osdc_cancel_request(struct ceph_osd_request *req)
				4494	{
				4495	struct ceph_osd_client *osdc = req->r_osdc;
				4496
				4497	down_write(&osdc->lock);
				4498	if (req->r_osd)
				4499	cancel_request(req);
				4500	up_write(&osdc->lock);
				4501	}
				4502	EXPORT_SYMBOL(ceph_osdc_cancel_request);
				4503
				4504	/*
				4505	* @timeout: in jiffies, 0 means "wait forever"
				4506	*/
				4507	static int wait_request_timeout(struct ceph_osd_request *req,
				4508	unsigned long timeout)
				4509	{
				4510	long left;
				4511
				4512	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
				4513	left = wait_for_completion_killable_timeout(&req->r_completion,
				4514	ceph_timeout_jiffies(timeout));
				4515	if (left <= 0) {
				4516	left = left ?: -ETIMEDOUT;
				4517	ceph_osdc_cancel_request(req);
				4518	} else {
				4519	left = req->r_result; /* completed */
				4520	}
				4521
				4522	return left;
				4523	}
				4524
				4525	/*
				4526	* wait for a request to complete
				4527	*/
				4528	int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
				4529	struct ceph_osd_request *req)
				4530	{
				4531	return wait_request_timeout(req, 0);
				4532	}
				4533	EXPORT_SYMBOL(ceph_osdc_wait_request);
				4534
				4535	/*
				4536	* sync - wait for all in-flight requests to flush. avoid starvation.
				4537	*/
				4538	void ceph_osdc_sync(struct ceph_osd_client *osdc)
				4539	{
				4540	struct rb_node n, p;
				4541	u64 last_tid = atomic64_read(&osdc->last_tid);
				4542
				4543	again:
				4544	down_read(&osdc->lock);
				4545	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
				4546	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				4547
				4548	mutex_lock(&osd->lock);
				4549	for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
				4550	struct ceph_osd_request *req =
				4551	rb_entry(p, struct ceph_osd_request, r_node);
				4552
				4553	if (req->r_tid > last_tid)
				4554	break;
				4555
				4556	if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
				4557	continue;
				4558
				4559	ceph_osdc_get_request(req);
				4560	mutex_unlock(&osd->lock);
				4561	up_read(&osdc->lock);
				4562	dout("%s waiting on req %p tid %llu last_tid %llu\n",
				4563	__func__, req, req->r_tid, last_tid);
				4564	wait_for_completion(&req->r_completion);
				4565	ceph_osdc_put_request(req);
				4566	goto again;
				4567	}
				4568
				4569	mutex_unlock(&osd->lock);
				4570	}
				4571
				4572	up_read(&osdc->lock);
				4573	dout("%s done last_tid %llu\n", __func__, last_tid);
				4574	}
				4575	EXPORT_SYMBOL(ceph_osdc_sync);
				4576
				4577	static struct ceph_osd_request *
				4578	alloc_linger_request(struct ceph_osd_linger_request *lreq)
				4579	{
				4580	struct ceph_osd_request *req;
				4581
				4582	req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
				4583	if (!req)
				4584	return NULL;
				4585
				4586	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
				4587	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4588	return req;
				4589	}
				4590
				4591	static struct ceph_osd_request *
				4592	alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode)
				4593	{
				4594	struct ceph_osd_request *req;
				4595
				4596	req = alloc_linger_request(lreq);
				4597	if (!req)
				4598	return NULL;
				4599
				4600	/*
				4601	* Pass 0 for cookie because we don't know it yet, it will be
				4602	* filled in by linger_submit().
				4603	*/
				4604	osd_req_op_watch_init(req, 0, 0, watch_opcode);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4605
				4606	if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
				4607	ceph_osdc_put_request(req);
				4608	return NULL;
				4609	}
				4610
				4611	return req;
				4612	}
				4613
				4614	/*
				4615	* Returns a handle, caller owns a ref.
				4616	*/
				4617	struct ceph_osd_linger_request *
				4618	ceph_osdc_watch(struct ceph_osd_client *osdc,
				4619	struct ceph_object_id *oid,
				4620	struct ceph_object_locator *oloc,
				4621	rados_watchcb2_t wcb,
				4622	rados_watcherrcb_t errcb,
				4623	void *data)
				4624	{
				4625	struct ceph_osd_linger_request *lreq;
				4626	int ret;
				4627
				4628	lreq = linger_alloc(osdc);
				4629	if (!lreq)
				4630	return ERR_PTR(-ENOMEM);
				4631
				4632	lreq->is_watch = true;
				4633	lreq->wcb = wcb;
				4634	lreq->errcb = errcb;
				4635	lreq->data = data;
				4636	lreq->watch_valid_thru = jiffies;
				4637
				4638	ceph_oid_copy(&lreq->t.base_oid, oid);
				4639	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
				4640	lreq->t.flags = CEPH_OSD_FLAG_WRITE;
				4641	ktime_get_real_ts64(&lreq->mtime);
				4642
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4643	lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4644	if (!lreq->reg_req) {
				4645	ret = -ENOMEM;
				4646	goto err_put_lreq;
				4647	}
				4648
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4649	lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4650	if (!lreq->ping_req) {
				4651	ret = -ENOMEM;
				4652	goto err_put_lreq;
				4653	}
				4654
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4655	linger_submit(lreq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4656	ret = linger_reg_commit_wait(lreq);
				4657	if (ret) {
				4658	linger_cancel(lreq);
				4659	goto err_put_lreq;
				4660	}
				4661
				4662	return lreq;
				4663
				4664	err_put_lreq:
				4665	linger_put(lreq);
				4666	return ERR_PTR(ret);
				4667	}
				4668	EXPORT_SYMBOL(ceph_osdc_watch);
				4669
				4670	/*
				4671	* Releases a ref.
				4672	*
				4673	* Times out after mount_timeout to preserve rbd unmap behaviour
				4674	* introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
				4675	* with mount_timeout").
				4676	*/
				4677	int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
				4678	struct ceph_osd_linger_request *lreq)
				4679	{
				4680	struct ceph_options *opts = osdc->client->options;
				4681	struct ceph_osd_request *req;
				4682	int ret;
				4683
				4684	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4685	if (!req)
				4686	return -ENOMEM;
				4687
				4688	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
				4689	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
				4690	req->r_flags = CEPH_OSD_FLAG_WRITE;
				4691	ktime_get_real_ts64(&req->r_mtime);
				4692	osd_req_op_watch_init(req, 0, lreq->linger_id,
				4693	CEPH_OSD_WATCH_OP_UNWATCH);
				4694
				4695	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				4696	if (ret)
				4697	goto out_put_req;
				4698
				4699	ceph_osdc_start_request(osdc, req, false);
				4700	linger_cancel(lreq);
				4701	linger_put(lreq);
				4702	ret = wait_request_timeout(req, opts->mount_timeout);
				4703
				4704	out_put_req:
				4705	ceph_osdc_put_request(req);
				4706	return ret;
				4707	}
				4708	EXPORT_SYMBOL(ceph_osdc_unwatch);
				4709
				4710	static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
				4711	u64 notify_id, u64 cookie, void *payload,
				4712	u32 payload_len)
				4713	{
				4714	struct ceph_osd_req_op *op;
				4715	struct ceph_pagelist *pl;
				4716	int ret;
				4717
				4718	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
				4719
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4720	pl = ceph_pagelist_alloc(GFP_NOIO);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4721	if (!pl)
				4722	return -ENOMEM;
				4723
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4724	ret = ceph_pagelist_encode_64(pl, notify_id);
				4725	ret \|= ceph_pagelist_encode_64(pl, cookie);
				4726	if (payload) {
				4727	ret \|= ceph_pagelist_encode_32(pl, payload_len);
				4728	ret \|= ceph_pagelist_append(pl, payload, payload_len);
				4729	} else {
				4730	ret \|= ceph_pagelist_encode_32(pl, 0);
				4731	}
				4732	if (ret) {
				4733	ceph_pagelist_release(pl);
				4734	return -ENOMEM;
				4735	}
				4736
				4737	ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
				4738	op->indata_len = pl->length;
				4739	return 0;
				4740	}
				4741
				4742	int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
				4743	struct ceph_object_id *oid,
				4744	struct ceph_object_locator *oloc,
				4745	u64 notify_id,
				4746	u64 cookie,
				4747	void *payload,
				4748	u32 payload_len)
				4749	{
				4750	struct ceph_osd_request *req;
				4751	int ret;
				4752
				4753	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				4754	if (!req)
				4755	return -ENOMEM;
				4756
				4757	ceph_oid_copy(&req->r_base_oid, oid);
				4758	ceph_oloc_copy(&req->r_base_oloc, oloc);
				4759	req->r_flags = CEPH_OSD_FLAG_READ;
				4760
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4761	ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
				4762	payload_len);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4763	if (ret)
				4764	goto out_put_req;
				4765
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4766	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4767	if (ret)
				4768	goto out_put_req;
				4769
				4770	ceph_osdc_start_request(osdc, req, false);
				4771	ret = ceph_osdc_wait_request(osdc, req);
				4772
				4773	out_put_req:
				4774	ceph_osdc_put_request(req);
				4775	return ret;
				4776	}
				4777	EXPORT_SYMBOL(ceph_osdc_notify_ack);
				4778
				4779	static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
				4780	u64 cookie, u32 prot_ver, u32 timeout,
				4781	void *payload, u32 payload_len)
				4782	{
				4783	struct ceph_osd_req_op *op;
				4784	struct ceph_pagelist *pl;
				4785	int ret;
				4786
				4787	op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
				4788	op->notify.cookie = cookie;
				4789
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4790	pl = ceph_pagelist_alloc(GFP_NOIO);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4791	if (!pl)
				4792	return -ENOMEM;
				4793
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4794	ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
				4795	ret \|= ceph_pagelist_encode_32(pl, timeout);
				4796	ret \|= ceph_pagelist_encode_32(pl, payload_len);
				4797	ret \|= ceph_pagelist_append(pl, payload, payload_len);
				4798	if (ret) {
				4799	ceph_pagelist_release(pl);
				4800	return -ENOMEM;
				4801	}
				4802
				4803	ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
				4804	op->indata_len = pl->length;
				4805	return 0;
				4806	}
				4807
				4808	/*
				4809	* @timeout: in seconds
				4810	*
				4811	* @preply_{pages,len} are initialized both on success and error.
				4812	* The caller is responsible for:
				4813	*
				4814	* ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
				4815	*/
				4816	int ceph_osdc_notify(struct ceph_osd_client *osdc,
				4817	struct ceph_object_id *oid,
				4818	struct ceph_object_locator *oloc,
				4819	void *payload,
				4820	u32 payload_len,
				4821	u32 timeout,
				4822	struct page ***preply_pages,
				4823	size_t *preply_len)
				4824	{
				4825	struct ceph_osd_linger_request *lreq;
				4826	struct page **pages;
				4827	int ret;
				4828
				4829	WARN_ON(!timeout);
				4830	if (preply_pages) {
				4831	*preply_pages = NULL;
				4832	*preply_len = 0;
				4833	}
				4834
				4835	lreq = linger_alloc(osdc);
				4836	if (!lreq)
				4837	return -ENOMEM;
				4838
				4839	lreq->preply_pages = preply_pages;
				4840	lreq->preply_len = preply_len;
				4841
				4842	ceph_oid_copy(&lreq->t.base_oid, oid);
				4843	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
				4844	lreq->t.flags = CEPH_OSD_FLAG_READ;
				4845
				4846	lreq->reg_req = alloc_linger_request(lreq);
				4847	if (!lreq->reg_req) {
				4848	ret = -ENOMEM;
				4849	goto out_put_lreq;
				4850	}
				4851
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4852	/*
				4853	* Pass 0 for cookie because we don't know it yet, it will be
				4854	* filled in by linger_submit().
				4855	*/
				4856	ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout,
				4857	payload, payload_len);
				4858	if (ret)
				4859	goto out_put_lreq;
				4860
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4861	/* for notify_id */
				4862	pages = ceph_alloc_page_vector(1, GFP_NOIO);
				4863	if (IS_ERR(pages)) {
				4864	ret = PTR_ERR(pages);
				4865	goto out_put_lreq;
				4866	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4867	ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
				4868	response_data),
				4869	pages, PAGE_SIZE, 0, false, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4870
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4871	ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO);
				4872	if (ret)
				4873	goto out_put_lreq;
				4874
				4875	linger_submit(lreq);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4876	ret = linger_reg_commit_wait(lreq);
				4877	if (!ret)
				4878	ret = linger_notify_finish_wait(lreq);
				4879	else
				4880	dout("lreq %p failed to initiate notify %d\n", lreq, ret);
				4881
				4882	linger_cancel(lreq);
				4883	out_put_lreq:
				4884	linger_put(lreq);
				4885	return ret;
				4886	}
				4887	EXPORT_SYMBOL(ceph_osdc_notify);
				4888
				4889	/*
				4890	* Return the number of milliseconds since the watch was last
				4891	* confirmed, or an error. If there is an error, the watch is no
				4892	* longer valid, and should be destroyed with ceph_osdc_unwatch().
				4893	*/
				4894	int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
				4895	struct ceph_osd_linger_request *lreq)
				4896	{
				4897	unsigned long stamp, age;
				4898	int ret;
				4899
				4900	down_read(&osdc->lock);
				4901	mutex_lock(&lreq->lock);
				4902	stamp = lreq->watch_valid_thru;
				4903	if (!list_empty(&lreq->pending_lworks)) {
				4904	struct linger_work *lwork =
				4905	list_first_entry(&lreq->pending_lworks,
				4906	struct linger_work,
				4907	pending_item);
				4908
				4909	if (time_before(lwork->queued_stamp, stamp))
				4910	stamp = lwork->queued_stamp;
				4911	}
				4912	age = jiffies - stamp;
				4913	dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
				4914	lreq, lreq->linger_id, age, lreq->last_error);
				4915	/* we are truncating to msecs, so return a safe upper bound */
				4916	ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
				4917
				4918	mutex_unlock(&lreq->lock);
				4919	up_read(&osdc->lock);
				4920	return ret;
				4921	}
				4922
				4923	static int decode_watcher(void *p, void end, struct ceph_watch_item *item)
				4924	{
				4925	u8 struct_v;
				4926	u32 struct_len;
				4927	int ret;
				4928
				4929	ret = ceph_start_decoding(p, end, 2, "watch_item_t",
				4930	&struct_v, &struct_len);
				4931	if (ret)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4932	goto bad;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4933
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4934	ret = -EINVAL;
				4935	ceph_decode_copy_safe(p, end, &item->name, sizeof(item->name), bad);
				4936	ceph_decode_64_safe(p, end, item->cookie, bad);
				4937	ceph_decode_skip_32(p, end, bad); /* skip timeout seconds */
				4938
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4939	if (struct_v >= 2) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4940	ret = ceph_decode_entity_addr(p, end, &item->addr);
				4941	if (ret)
				4942	goto bad;
				4943	} else {
				4944	ret = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4945	}
				4946
				4947	dout("%s %s%llu cookie %llu addr %s\n", __func__,
				4948	ENTITY_NAME(item->name), item->cookie,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4949	ceph_pr_addr(&item->addr));
				4950	bad:
				4951	return ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4952	}
				4953
				4954	static int decode_watchers(void *p, void end,
				4955	struct ceph_watch_item **watchers,
				4956	u32 *num_watchers)
				4957	{
				4958	u8 struct_v;
				4959	u32 struct_len;
				4960	int i;
				4961	int ret;
				4962
				4963	ret = ceph_start_decoding(p, end, 1, "obj_list_watch_response_t",
				4964	&struct_v, &struct_len);
				4965	if (ret)
				4966	return ret;
				4967
				4968	*num_watchers = ceph_decode_32(p);
				4969	watchers = kcalloc(num_watchers, sizeof(**watchers), GFP_NOIO);
				4970	if (!*watchers)
				4971	return -ENOMEM;
				4972
				4973	for (i = 0; i < *num_watchers; i++) {
				4974	ret = decode_watcher(p, end, *watchers + i);
				4975	if (ret) {
				4976	kfree(*watchers);
				4977	return ret;
				4978	}
				4979	}
				4980
				4981	return 0;
				4982	}
				4983
				4984	/*
				4985	* On success, the caller is responsible for:
				4986	*
				4987	* kfree(watchers);
				4988	*/
				4989	int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
				4990	struct ceph_object_id *oid,
				4991	struct ceph_object_locator *oloc,
				4992	struct ceph_watch_item **watchers,
				4993	u32 *num_watchers)
				4994	{
				4995	struct ceph_osd_request *req;
				4996	struct page **pages;
				4997	int ret;
				4998
				4999	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				5000	if (!req)
				5001	return -ENOMEM;
				5002
				5003	ceph_oid_copy(&req->r_base_oid, oid);
				5004	ceph_oloc_copy(&req->r_base_oloc, oloc);
				5005	req->r_flags = CEPH_OSD_FLAG_READ;
				5006
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5007	pages = ceph_alloc_page_vector(1, GFP_NOIO);
				5008	if (IS_ERR(pages)) {
				5009	ret = PTR_ERR(pages);
				5010	goto out_put_req;
				5011	}
				5012
				5013	osd_req_op_init(req, 0, CEPH_OSD_OP_LIST_WATCHERS, 0);
				5014	ceph_osd_data_pages_init(osd_req_op_data(req, 0, list_watchers,
				5015	response_data),
				5016	pages, PAGE_SIZE, 0, false, true);
				5017
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5018	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				5019	if (ret)
				5020	goto out_put_req;
				5021
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5022	ceph_osdc_start_request(osdc, req, false);
				5023	ret = ceph_osdc_wait_request(osdc, req);
				5024	if (ret >= 0) {
				5025	void *p = page_address(pages[0]);
				5026	void *const end = p + req->r_ops[0].outdata_len;
				5027
				5028	ret = decode_watchers(&p, end, watchers, num_watchers);
				5029	}
				5030
				5031	out_put_req:
				5032	ceph_osdc_put_request(req);
				5033	return ret;
				5034	}
				5035	EXPORT_SYMBOL(ceph_osdc_list_watchers);
				5036
				5037	/*
				5038	* Call all pending notify callbacks - for use after a watch is
				5039	* unregistered, to make sure no more callbacks for it will be invoked
				5040	*/
				5041	void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
				5042	{
				5043	dout("%s osdc %p\n", __func__, osdc);
				5044	flush_workqueue(osdc->notify_wq);
				5045	}
				5046	EXPORT_SYMBOL(ceph_osdc_flush_notifies);
				5047
				5048	void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
				5049	{
				5050	down_read(&osdc->lock);
				5051	maybe_request_map(osdc);
				5052	up_read(&osdc->lock);
				5053	}
				5054	EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
				5055
				5056	/*
				5057	* Execute an OSD class method on an object.
				5058	*
				5059	* @flags: CEPH_OSD_FLAG_*
				5060	* @resp_len: in/out param for reply length
				5061	*/
				5062	int ceph_osdc_call(struct ceph_osd_client *osdc,
				5063	struct ceph_object_id *oid,
				5064	struct ceph_object_locator *oloc,
				5065	const char class, const char method,
				5066	unsigned int flags,
				5067	struct page *req_page, size_t req_len,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5068	struct page *resp_pages, size_t resp_len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5069	{
				5070	struct ceph_osd_request *req;
				5071	int ret;
				5072
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5073	if (req_len > PAGE_SIZE)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5074	return -E2BIG;
				5075
				5076	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
				5077	if (!req)
				5078	return -ENOMEM;
				5079
				5080	ceph_oid_copy(&req->r_base_oid, oid);
				5081	ceph_oloc_copy(&req->r_base_oloc, oloc);
				5082	req->r_flags = flags;
				5083
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5084	ret = osd_req_op_cls_init(req, 0, class, method);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5085	if (ret)
				5086	goto out_put_req;
				5087
				5088	if (req_page)
				5089	osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
				5090	0, false, false);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5091	if (resp_pages)
				5092	osd_req_op_cls_response_data_pages(req, 0, resp_pages,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5093	*resp_len, 0, false, false);
				5094
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5095	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
				5096	if (ret)
				5097	goto out_put_req;
				5098
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5099	ceph_osdc_start_request(osdc, req, false);
				5100	ret = ceph_osdc_wait_request(osdc, req);
				5101	if (ret >= 0) {
				5102	ret = req->r_ops[0].rval;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5103	if (resp_pages)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5104	*resp_len = req->r_ops[0].outdata_len;
				5105	}
				5106
				5107	out_put_req:
				5108	ceph_osdc_put_request(req);
				5109	return ret;
				5110	}
				5111	EXPORT_SYMBOL(ceph_osdc_call);
				5112
				5113	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5114	* reset all osd connections
				5115	*/
				5116	void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
				5117	{
				5118	struct rb_node *n;
				5119
				5120	down_write(&osdc->lock);
				5121	for (n = rb_first(&osdc->osds); n; ) {
				5122	struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
				5123
				5124	n = rb_next(n);
				5125	if (!reopen_osd(osd))
				5126	kick_osd_requests(osd);
				5127	}
				5128	up_write(&osdc->lock);
				5129	}
				5130
				5131	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5132	* init, shutdown
				5133	*/
				5134	int ceph_osdc_init(struct ceph_osd_client osdc, struct ceph_client client)
				5135	{
				5136	int err;
				5137
				5138	dout("init\n");
				5139	osdc->client = client;
				5140	init_rwsem(&osdc->lock);
				5141	osdc->osds = RB_ROOT;
				5142	INIT_LIST_HEAD(&osdc->osd_lru);
				5143	spin_lock_init(&osdc->osd_lru_lock);
				5144	osd_init(&osdc->homeless_osd);
				5145	osdc->homeless_osd.o_osdc = osdc;
				5146	osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
				5147	osdc->last_linger_id = CEPH_LINGER_ID_START;
				5148	osdc->linger_requests = RB_ROOT;
				5149	osdc->map_checks = RB_ROOT;
				5150	osdc->linger_map_checks = RB_ROOT;
				5151	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
				5152	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
				5153
				5154	err = -ENOMEM;
				5155	osdc->osdmap = ceph_osdmap_alloc();
				5156	if (!osdc->osdmap)
				5157	goto out;
				5158
				5159	osdc->req_mempool = mempool_create_slab_pool(10,
				5160	ceph_osd_request_cache);
				5161	if (!osdc->req_mempool)
				5162	goto out_map;
				5163
				5164	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5165	PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op");
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5166	if (err < 0)
				5167	goto out_mempool;
				5168	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5169	PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10,
				5170	"osd_op_reply");
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5171	if (err < 0)
				5172	goto out_msgpool;
				5173
				5174	err = -ENOMEM;
				5175	osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
				5176	if (!osdc->notify_wq)
				5177	goto out_msgpool_reply;
				5178
				5179	osdc->completion_wq = create_singlethread_workqueue("ceph-completion");
				5180	if (!osdc->completion_wq)
				5181	goto out_notify_wq;
				5182
				5183	schedule_delayed_work(&osdc->timeout_work,
				5184	osdc->client->options->osd_keepalive_timeout);
				5185	schedule_delayed_work(&osdc->osds_timeout_work,
				5186	round_jiffies_relative(osdc->client->options->osd_idle_ttl));
				5187
				5188	return 0;
				5189
				5190	out_notify_wq:
				5191	destroy_workqueue(osdc->notify_wq);
				5192	out_msgpool_reply:
				5193	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
				5194	out_msgpool:
				5195	ceph_msgpool_destroy(&osdc->msgpool_op);
				5196	out_mempool:
				5197	mempool_destroy(osdc->req_mempool);
				5198	out_map:
				5199	ceph_osdmap_destroy(osdc->osdmap);
				5200	out:
				5201	return err;
				5202	}
				5203
				5204	void ceph_osdc_stop(struct ceph_osd_client *osdc)
				5205	{
				5206	destroy_workqueue(osdc->completion_wq);
				5207	destroy_workqueue(osdc->notify_wq);
				5208	cancel_delayed_work_sync(&osdc->timeout_work);
				5209	cancel_delayed_work_sync(&osdc->osds_timeout_work);
				5210
				5211	down_write(&osdc->lock);
				5212	while (!RB_EMPTY_ROOT(&osdc->osds)) {
				5213	struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
				5214	struct ceph_osd, o_node);
				5215	close_osd(osd);
				5216	}
				5217	up_write(&osdc->lock);
				5218	WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1);
				5219	osd_cleanup(&osdc->homeless_osd);
				5220
				5221	WARN_ON(!list_empty(&osdc->osd_lru));
				5222	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
				5223	WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
				5224	WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
				5225	WARN_ON(atomic_read(&osdc->num_requests));
				5226	WARN_ON(atomic_read(&osdc->num_homeless));
				5227
				5228	ceph_osdmap_destroy(osdc->osdmap);
				5229	mempool_destroy(osdc->req_mempool);
				5230	ceph_msgpool_destroy(&osdc->msgpool_op);
				5231	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
				5232	}
				5233
				5234	/*
				5235	* Read some contiguous pages. If we cross a stripe boundary, shorten
				5236	* *plen. Return number of bytes read, or error.
				5237	*/
				5238	int ceph_osdc_readpages(struct ceph_osd_client *osdc,
				5239	struct ceph_vino vino, struct ceph_file_layout *layout,
				5240	u64 off, u64 *plen,
				5241	u32 truncate_seq, u64 truncate_size,
				5242	struct page **pages, int num_pages, int page_align)
				5243	{
				5244	struct ceph_osd_request *req;
				5245	int rc = 0;
				5246
				5247	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
				5248	vino.snap, off, *plen);
				5249	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
				5250	CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				5251	NULL, truncate_seq, truncate_size,
				5252	false);
				5253	if (IS_ERR(req))
				5254	return PTR_ERR(req);
				5255
				5256	/* it may be a short read due to an object boundary */
				5257	osd_req_op_extent_osd_data_pages(req, 0,
				5258	pages, *plen, page_align, false, false);
				5259
				5260	dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
				5261	off, plen, plen, page_align);
				5262
				5263	rc = ceph_osdc_start_request(osdc, req, false);
				5264	if (!rc)
				5265	rc = ceph_osdc_wait_request(osdc, req);
				5266
				5267	ceph_osdc_put_request(req);
				5268	dout("readpages result %d\n", rc);
				5269	return rc;
				5270	}
				5271	EXPORT_SYMBOL(ceph_osdc_readpages);
				5272
				5273	/*
				5274	* do a synchronous write on N pages
				5275	*/
				5276	int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
				5277	struct ceph_file_layout *layout,
				5278	struct ceph_snap_context *snapc,
				5279	u64 off, u64 len,
				5280	u32 truncate_seq, u64 truncate_size,
				5281	struct timespec64 *mtime,
				5282	struct page **pages, int num_pages)
				5283	{
				5284	struct ceph_osd_request *req;
				5285	int rc = 0;
				5286	int page_align = off & ~PAGE_MASK;
				5287
				5288	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				5289	CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
				5290	snapc, truncate_seq, truncate_size,
				5291	true);
				5292	if (IS_ERR(req))
				5293	return PTR_ERR(req);
				5294
				5295	/* it may be a short write due to an object boundary */
				5296	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
				5297	false, false);
				5298	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
				5299
				5300	req->r_mtime = *mtime;
				5301	rc = ceph_osdc_start_request(osdc, req, true);
				5302	if (!rc)
				5303	rc = ceph_osdc_wait_request(osdc, req);
				5304
				5305	ceph_osdc_put_request(req);
				5306	if (rc == 0)
				5307	rc = len;
				5308	dout("writepages result %d\n", rc);
				5309	return rc;
				5310	}
				5311	EXPORT_SYMBOL(ceph_osdc_writepages);
				5312
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5313	static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
				5314	u64 src_snapid, u64 src_version,
				5315	struct ceph_object_id *src_oid,
				5316	struct ceph_object_locator *src_oloc,
				5317	u32 src_fadvise_flags,
				5318	u32 dst_fadvise_flags,
				5319	u8 copy_from_flags)
				5320	{
				5321	struct ceph_osd_req_op *op;
				5322	struct page **pages;
				5323	void p, end;
				5324
				5325	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
				5326	if (IS_ERR(pages))
				5327	return PTR_ERR(pages);
				5328
				5329	op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags);
				5330	op->copy_from.snapid = src_snapid;
				5331	op->copy_from.src_version = src_version;
				5332	op->copy_from.flags = copy_from_flags;
				5333	op->copy_from.src_fadvise_flags = src_fadvise_flags;
				5334
				5335	p = page_address(pages[0]);
				5336	end = p + PAGE_SIZE;
				5337	ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
				5338	encode_oloc(&p, end, src_oloc);
				5339	op->indata_len = PAGE_SIZE - (end - p);
				5340
				5341	ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
				5342	op->indata_len, 0, false, true);
				5343	return 0;
				5344	}
				5345
				5346	int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
				5347	u64 src_snapid, u64 src_version,
				5348	struct ceph_object_id *src_oid,
				5349	struct ceph_object_locator *src_oloc,
				5350	u32 src_fadvise_flags,
				5351	struct ceph_object_id *dst_oid,
				5352	struct ceph_object_locator *dst_oloc,
				5353	u32 dst_fadvise_flags,
				5354	u8 copy_from_flags)
				5355	{
				5356	struct ceph_osd_request *req;
				5357	int ret;
				5358
				5359	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
				5360	if (!req)
				5361	return -ENOMEM;
				5362
				5363	req->r_flags = CEPH_OSD_FLAG_WRITE;
				5364
				5365	ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
				5366	ceph_oid_copy(&req->r_t.base_oid, dst_oid);
				5367
				5368	ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid,
				5369	src_oloc, src_fadvise_flags,
				5370	dst_fadvise_flags, copy_from_flags);
				5371	if (ret)
				5372	goto out;
				5373
				5374	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
				5375	if (ret)
				5376	goto out;
				5377
				5378	ceph_osdc_start_request(osdc, req, false);
				5379	ret = ceph_osdc_wait_request(osdc, req);
				5380
				5381	out:
				5382	ceph_osdc_put_request(req);
				5383	return ret;
				5384	}
				5385	EXPORT_SYMBOL(ceph_osdc_copy_from);
				5386
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5387	int __init ceph_osdc_setup(void)
				5388	{
				5389	size_t size = sizeof(struct ceph_osd_request) +
				5390	CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
				5391
				5392	BUG_ON(ceph_osd_request_cache);
				5393	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
				5394	0, 0, NULL);
				5395
				5396	return ceph_osd_request_cache ? 0 : -ENOMEM;
				5397	}
				5398
				5399	void ceph_osdc_cleanup(void)
				5400	{
				5401	BUG_ON(!ceph_osd_request_cache);
				5402	kmem_cache_destroy(ceph_osd_request_cache);
				5403	ceph_osd_request_cache = NULL;
				5404	}
				5405
				5406	/*
				5407	* handle incoming message
				5408	*/
				5409	static void dispatch(struct ceph_connection con, struct ceph_msg msg)
				5410	{
				5411	struct ceph_osd *osd = con->private;
				5412	struct ceph_osd_client *osdc = osd->o_osdc;
				5413	int type = le16_to_cpu(msg->hdr.type);
				5414
				5415	switch (type) {
				5416	case CEPH_MSG_OSD_MAP:
				5417	ceph_osdc_handle_map(osdc, msg);
				5418	break;
				5419	case CEPH_MSG_OSD_OPREPLY:
				5420	handle_reply(osd, msg);
				5421	break;
				5422	case CEPH_MSG_OSD_BACKOFF:
				5423	handle_backoff(osd, msg);
				5424	break;
				5425	case CEPH_MSG_WATCH_NOTIFY:
				5426	handle_watch_notify(osdc, msg);
				5427	break;
				5428
				5429	default:
				5430	pr_err("received unknown message type %d %s\n", type,
				5431	ceph_msg_type_name(type));
				5432	}
				5433
				5434	ceph_msg_put(msg);
				5435	}
				5436
				5437	/*
				5438	* Lookup and return message for incoming reply. Don't try to do
				5439	* anything about a larger than preallocated data portion of the
				5440	* message at the moment - for now, just skip the message.
				5441	*/
				5442	static struct ceph_msg get_reply(struct ceph_connection con,
				5443	struct ceph_msg_header *hdr,
				5444	int *skip)
				5445	{
				5446	struct ceph_osd *osd = con->private;
				5447	struct ceph_osd_client *osdc = osd->o_osdc;
				5448	struct ceph_msg *m = NULL;
				5449	struct ceph_osd_request *req;
				5450	int front_len = le32_to_cpu(hdr->front_len);
				5451	int data_len = le32_to_cpu(hdr->data_len);
				5452	u64 tid = le64_to_cpu(hdr->tid);
				5453
				5454	down_read(&osdc->lock);
				5455	if (!osd_registered(osd)) {
				5456	dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
				5457	*skip = 1;
				5458	goto out_unlock_osdc;
				5459	}
				5460	WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
				5461
				5462	mutex_lock(&osd->lock);
				5463	req = lookup_request(&osd->o_requests, tid);
				5464	if (!req) {
				5465	dout("%s osd%d tid %llu unknown, skipping\n", __func__,
				5466	osd->o_osd, tid);
				5467	*skip = 1;
				5468	goto out_unlock_session;
				5469	}
				5470
				5471	ceph_msg_revoke_incoming(req->r_reply);
				5472
				5473	if (front_len > req->r_reply->front_alloc_len) {
				5474	pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
				5475	__func__, osd->o_osd, req->r_tid, front_len,
				5476	req->r_reply->front_alloc_len);
				5477	m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
				5478	false);
				5479	if (!m)
				5480	goto out_unlock_session;
				5481	ceph_msg_put(req->r_reply);
				5482	req->r_reply = m;
				5483	}
				5484
				5485	if (data_len > req->r_reply->data_length) {
				5486	pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
				5487	__func__, osd->o_osd, req->r_tid, data_len,
				5488	req->r_reply->data_length);
				5489	m = NULL;
				5490	*skip = 1;
				5491	goto out_unlock_session;
				5492	}
				5493
				5494	m = ceph_msg_get(req->r_reply);
				5495	dout("get_reply tid %lld %p\n", tid, m);
				5496
				5497	out_unlock_session:
				5498	mutex_unlock(&osd->lock);
				5499	out_unlock_osdc:
				5500	up_read(&osdc->lock);
				5501	return m;
				5502	}
				5503
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5504	static struct ceph_msg alloc_msg_with_page_vector(struct ceph_msg_header hdr)
				5505	{
				5506	struct ceph_msg *m;
				5507	int type = le16_to_cpu(hdr->type);
				5508	u32 front_len = le32_to_cpu(hdr->front_len);
				5509	u32 data_len = le32_to_cpu(hdr->data_len);
				5510
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5511	m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5512	if (!m)
				5513	return NULL;
				5514
				5515	if (data_len) {
				5516	struct page **pages;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5517
				5518	pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
				5519	GFP_NOIO);
				5520	if (IS_ERR(pages)) {
				5521	ceph_msg_put(m);
				5522	return NULL;
				5523	}
				5524
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5525	ceph_msg_data_add_pages(m, pages, data_len, 0, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5526	}
				5527
				5528	return m;
				5529	}
				5530
				5531	static struct ceph_msg alloc_msg(struct ceph_connection con,
				5532	struct ceph_msg_header *hdr,
				5533	int *skip)
				5534	{
				5535	struct ceph_osd *osd = con->private;
				5536	int type = le16_to_cpu(hdr->type);
				5537
				5538	*skip = 0;
				5539	switch (type) {
				5540	case CEPH_MSG_OSD_MAP:
				5541	case CEPH_MSG_OSD_BACKOFF:
				5542	case CEPH_MSG_WATCH_NOTIFY:
				5543	return alloc_msg_with_page_vector(hdr);
				5544	case CEPH_MSG_OSD_OPREPLY:
				5545	return get_reply(con, hdr, skip);
				5546	default:
				5547	pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
				5548	osd->o_osd, type);
				5549	*skip = 1;
				5550	return NULL;
				5551	}
				5552	}
				5553
				5554	/*
				5555	* Wrappers to refcount containing ceph_osd struct
				5556	*/
				5557	static struct ceph_connection get_osd_con(struct ceph_connection con)
				5558	{
				5559	struct ceph_osd *osd = con->private;
				5560	if (get_osd(osd))
				5561	return con;
				5562	return NULL;
				5563	}
				5564
				5565	static void put_osd_con(struct ceph_connection *con)
				5566	{
				5567	struct ceph_osd *osd = con->private;
				5568	put_osd(osd);
				5569	}
				5570
				5571	/*
				5572	* authentication
				5573	*/
				5574	/*
				5575	* Note: returned pointer is the address of a structure that's
				5576	* managed separately. Caller must not attempt to free it.
				5577	*/
				5578	static struct ceph_auth_handshake get_authorizer(struct ceph_connection con,
				5579	int *proto, int force_new)
				5580	{
				5581	struct ceph_osd *o = con->private;
				5582	struct ceph_osd_client *osdc = o->o_osdc;
				5583	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5584	struct ceph_auth_handshake *auth = &o->o_auth;
				5585
				5586	if (force_new && auth->authorizer) {
				5587	ceph_auth_destroy_authorizer(auth->authorizer);
				5588	auth->authorizer = NULL;
				5589	}
				5590	if (!auth->authorizer) {
				5591	int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
				5592	auth);
				5593	if (ret)
				5594	return ERR_PTR(ret);
				5595	} else {
				5596	int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
				5597	auth);
				5598	if (ret)
				5599	return ERR_PTR(ret);
				5600	}
				5601	*proto = ac->protocol;
				5602
				5603	return auth;
				5604	}
				5605
				5606	static int add_authorizer_challenge(struct ceph_connection *con,
				5607	void *challenge_buf, int challenge_buf_len)
				5608	{
				5609	struct ceph_osd *o = con->private;
				5610	struct ceph_osd_client *osdc = o->o_osdc;
				5611	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5612
				5613	return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer,
				5614	challenge_buf, challenge_buf_len);
				5615	}
				5616
				5617	static int verify_authorizer_reply(struct ceph_connection *con)
				5618	{
				5619	struct ceph_osd *o = con->private;
				5620	struct ceph_osd_client *osdc = o->o_osdc;
				5621	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5622
				5623	return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
				5624	}
				5625
				5626	static int invalidate_authorizer(struct ceph_connection *con)
				5627	{
				5628	struct ceph_osd *o = con->private;
				5629	struct ceph_osd_client *osdc = o->o_osdc;
				5630	struct ceph_auth_client *ac = osdc->client->monc.auth;
				5631
				5632	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
				5633	return ceph_monc_validate_auth(&osdc->client->monc);
				5634	}
				5635
				5636	static void osd_reencode_message(struct ceph_msg *msg)
				5637	{
				5638	int type = le16_to_cpu(msg->hdr.type);
				5639
				5640	if (type == CEPH_MSG_OSD_OP)
				5641	encode_request_finish(msg);
				5642	}
				5643
				5644	static int osd_sign_message(struct ceph_msg *msg)
				5645	{
				5646	struct ceph_osd *o = msg->con->private;
				5647	struct ceph_auth_handshake *auth = &o->o_auth;
				5648
				5649	return ceph_auth_sign_message(auth, msg);
				5650	}
				5651
				5652	static int osd_check_message_signature(struct ceph_msg *msg)
				5653	{
				5654	struct ceph_osd *o = msg->con->private;
				5655	struct ceph_auth_handshake *auth = &o->o_auth;
				5656
				5657	return ceph_auth_check_message_signature(auth, msg);
				5658	}
				5659
				5660	static const struct ceph_connection_operations osd_con_ops = {
				5661	.get = get_osd_con,
				5662	.put = put_osd_con,
				5663	.dispatch = dispatch,
				5664	.get_authorizer = get_authorizer,
				5665	.add_authorizer_challenge = add_authorizer_challenge,
				5666	.verify_authorizer_reply = verify_authorizer_reply,
				5667	.invalidate_authorizer = invalidate_authorizer,
				5668	.alloc_msg = alloc_msg,
				5669	.reencode_message = osd_reencode_message,
				5670	.sign_message = osd_sign_message,
				5671	.check_message_signature = osd_check_message_signature,
				5672	.fault = osd_fault,
				5673	};