Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * Copyright (c) 2015, 2017 Oracle. All rights reserved. |
| 4 | * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. |
| 5 | */ |
| 6 | |
| 7 | /* Lightweight memory registration using Fast Memory Regions (FMR). |
| 8 | * Referred to sometimes as MTHCAFMR mode. |
| 9 | * |
| 10 | * FMR uses synchronous memory registration and deregistration. |
| 11 | * FMR registration is known to be fast, but FMR deregistration |
| 12 | * can take tens of usecs to complete. |
| 13 | */ |
| 14 | |
| 15 | /* Normal operation |
| 16 | * |
| 17 | * A Memory Region is prepared for RDMA READ or WRITE using the |
| 18 | * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is |
| 19 | * finished, the Memory Region is unmapped using the ib_unmap_fmr |
| 20 | * verb (fmr_op_unmap_sync). |
| 21 | */ |
| 22 | |
| 23 | #include <linux/sunrpc/svc_rdma.h> |
| 24 | |
| 25 | #include "xprt_rdma.h" |
| 26 | #include <trace/events/rpcrdma.h> |
| 27 | |
| 28 | #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) |
| 29 | # define RPCDBG_FACILITY RPCDBG_TRANS |
| 30 | #endif |
| 31 | |
| 32 | /* Maximum scatter/gather per FMR */ |
| 33 | #define RPCRDMA_MAX_FMR_SGES (64) |
| 34 | |
| 35 | /* Access mode of externally registered pages */ |
| 36 | enum { |
| 37 | RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | |
| 38 | IB_ACCESS_REMOTE_READ, |
| 39 | }; |
| 40 | |
| 41 | bool |
| 42 | fmr_is_supported(struct rpcrdma_ia *ia) |
| 43 | { |
| 44 | if (!ia->ri_device->alloc_fmr) { |
| 45 | pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", |
| 46 | ia->ri_device->name); |
| 47 | return false; |
| 48 | } |
| 49 | return true; |
| 50 | } |
| 51 | |
| 52 | static int |
| 53 | fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) |
| 54 | { |
| 55 | static struct ib_fmr_attr fmr_attr = { |
| 56 | .max_pages = RPCRDMA_MAX_FMR_SGES, |
| 57 | .max_maps = 1, |
| 58 | .page_shift = PAGE_SHIFT |
| 59 | }; |
| 60 | |
| 61 | mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, |
| 62 | sizeof(u64), GFP_KERNEL); |
| 63 | if (!mr->fmr.fm_physaddrs) |
| 64 | goto out_free; |
| 65 | |
| 66 | mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, |
| 67 | sizeof(*mr->mr_sg), GFP_KERNEL); |
| 68 | if (!mr->mr_sg) |
| 69 | goto out_free; |
| 70 | |
| 71 | sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); |
| 72 | |
| 73 | mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, |
| 74 | &fmr_attr); |
| 75 | if (IS_ERR(mr->fmr.fm_mr)) |
| 76 | goto out_fmr_err; |
| 77 | |
| 78 | INIT_LIST_HEAD(&mr->mr_list); |
| 79 | return 0; |
| 80 | |
| 81 | out_fmr_err: |
| 82 | dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, |
| 83 | PTR_ERR(mr->fmr.fm_mr)); |
| 84 | |
| 85 | out_free: |
| 86 | kfree(mr->mr_sg); |
| 87 | kfree(mr->fmr.fm_physaddrs); |
| 88 | return -ENOMEM; |
| 89 | } |
| 90 | |
| 91 | static int |
| 92 | __fmr_unmap(struct rpcrdma_mr *mr) |
| 93 | { |
| 94 | LIST_HEAD(l); |
| 95 | int rc; |
| 96 | |
| 97 | list_add(&mr->fmr.fm_mr->list, &l); |
| 98 | rc = ib_unmap_fmr(&l); |
| 99 | list_del(&mr->fmr.fm_mr->list); |
| 100 | return rc; |
| 101 | } |
| 102 | |
| 103 | static void |
| 104 | fmr_op_release_mr(struct rpcrdma_mr *mr) |
| 105 | { |
| 106 | LIST_HEAD(unmap_list); |
| 107 | int rc; |
| 108 | |
| 109 | kfree(mr->fmr.fm_physaddrs); |
| 110 | kfree(mr->mr_sg); |
| 111 | |
| 112 | /* In case this one was left mapped, try to unmap it |
| 113 | * to prevent dealloc_fmr from failing with EBUSY |
| 114 | */ |
| 115 | rc = __fmr_unmap(mr); |
| 116 | if (rc) |
| 117 | pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", |
| 118 | mr, rc); |
| 119 | |
| 120 | rc = ib_dealloc_fmr(mr->fmr.fm_mr); |
| 121 | if (rc) |
| 122 | pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", |
| 123 | mr, rc); |
| 124 | |
| 125 | kfree(mr); |
| 126 | } |
| 127 | |
| 128 | /* Reset of a single FMR. |
| 129 | */ |
| 130 | static void |
| 131 | fmr_op_recover_mr(struct rpcrdma_mr *mr) |
| 132 | { |
| 133 | struct rpcrdma_xprt *r_xprt = mr->mr_xprt; |
| 134 | int rc; |
| 135 | |
| 136 | /* ORDER: invalidate first */ |
| 137 | rc = __fmr_unmap(mr); |
| 138 | if (rc) |
| 139 | goto out_release; |
| 140 | |
| 141 | /* ORDER: then DMA unmap */ |
| 142 | rpcrdma_mr_unmap_and_put(mr); |
| 143 | |
| 144 | r_xprt->rx_stats.mrs_recovered++; |
| 145 | return; |
| 146 | |
| 147 | out_release: |
| 148 | pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); |
| 149 | r_xprt->rx_stats.mrs_orphaned++; |
| 150 | |
| 151 | trace_xprtrdma_dma_unmap(mr); |
| 152 | ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, |
| 153 | mr->mr_sg, mr->mr_nents, mr->mr_dir); |
| 154 | |
| 155 | spin_lock(&r_xprt->rx_buf.rb_mrlock); |
| 156 | list_del(&mr->mr_all); |
| 157 | spin_unlock(&r_xprt->rx_buf.rb_mrlock); |
| 158 | |
| 159 | fmr_op_release_mr(mr); |
| 160 | } |
| 161 | |
| 162 | /* On success, sets: |
| 163 | * ep->rep_attr.cap.max_send_wr |
| 164 | * ep->rep_attr.cap.max_recv_wr |
| 165 | * cdata->max_requests |
| 166 | * ia->ri_max_segs |
| 167 | */ |
| 168 | static int |
| 169 | fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, |
| 170 | struct rpcrdma_create_data_internal *cdata) |
| 171 | { |
| 172 | int max_qp_wr; |
| 173 | |
| 174 | max_qp_wr = ia->ri_device->attrs.max_qp_wr; |
| 175 | max_qp_wr -= RPCRDMA_BACKWARD_WRS; |
| 176 | max_qp_wr -= 1; |
| 177 | if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) |
| 178 | return -ENOMEM; |
| 179 | if (cdata->max_requests > max_qp_wr) |
| 180 | cdata->max_requests = max_qp_wr; |
| 181 | ep->rep_attr.cap.max_send_wr = cdata->max_requests; |
| 182 | ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; |
| 183 | ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ |
| 184 | ep->rep_attr.cap.max_recv_wr = cdata->max_requests; |
| 185 | ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; |
| 186 | ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ |
| 187 | |
| 188 | ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / |
| 189 | RPCRDMA_MAX_FMR_SGES); |
| 190 | return 0; |
| 191 | } |
| 192 | |
| 193 | /* FMR mode conveys up to 64 pages of payload per chunk segment. |
| 194 | */ |
| 195 | static size_t |
| 196 | fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) |
| 197 | { |
| 198 | return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, |
| 199 | RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); |
| 200 | } |
| 201 | |
| 202 | /* Use the ib_map_phys_fmr() verb to register a memory region |
| 203 | * for remote access via RDMA READ or RDMA WRITE. |
| 204 | */ |
| 205 | static struct rpcrdma_mr_seg * |
| 206 | fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, |
| 207 | int nsegs, bool writing, struct rpcrdma_mr **out) |
| 208 | { |
| 209 | struct rpcrdma_mr_seg *seg1 = seg; |
| 210 | int len, pageoff, i, rc; |
| 211 | struct rpcrdma_mr *mr; |
| 212 | u64 *dma_pages; |
| 213 | |
| 214 | mr = rpcrdma_mr_get(r_xprt); |
| 215 | if (!mr) |
| 216 | return ERR_PTR(-EAGAIN); |
| 217 | |
| 218 | pageoff = offset_in_page(seg1->mr_offset); |
| 219 | seg1->mr_offset -= pageoff; /* start of page */ |
| 220 | seg1->mr_len += pageoff; |
| 221 | len = -pageoff; |
| 222 | if (nsegs > RPCRDMA_MAX_FMR_SGES) |
| 223 | nsegs = RPCRDMA_MAX_FMR_SGES; |
| 224 | for (i = 0; i < nsegs;) { |
| 225 | if (seg->mr_page) |
| 226 | sg_set_page(&mr->mr_sg[i], |
| 227 | seg->mr_page, |
| 228 | seg->mr_len, |
| 229 | offset_in_page(seg->mr_offset)); |
| 230 | else |
| 231 | sg_set_buf(&mr->mr_sg[i], seg->mr_offset, |
| 232 | seg->mr_len); |
| 233 | len += seg->mr_len; |
| 234 | ++seg; |
| 235 | ++i; |
| 236 | /* Check for holes */ |
| 237 | if ((i < nsegs && offset_in_page(seg->mr_offset)) || |
| 238 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) |
| 239 | break; |
| 240 | } |
| 241 | mr->mr_dir = rpcrdma_data_dir(writing); |
| 242 | |
| 243 | mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, |
| 244 | mr->mr_sg, i, mr->mr_dir); |
| 245 | if (!mr->mr_nents) |
| 246 | goto out_dmamap_err; |
| 247 | trace_xprtrdma_dma_map(mr); |
| 248 | |
| 249 | for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) |
| 250 | dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); |
| 251 | rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents, |
| 252 | dma_pages[0]); |
| 253 | if (rc) |
| 254 | goto out_maperr; |
| 255 | |
| 256 | mr->mr_handle = mr->fmr.fm_mr->rkey; |
| 257 | mr->mr_length = len; |
| 258 | mr->mr_offset = dma_pages[0] + pageoff; |
| 259 | |
| 260 | *out = mr; |
| 261 | return seg; |
| 262 | |
| 263 | out_dmamap_err: |
| 264 | pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", |
| 265 | mr->mr_sg, i); |
| 266 | rpcrdma_mr_put(mr); |
| 267 | return ERR_PTR(-EIO); |
| 268 | |
| 269 | out_maperr: |
| 270 | pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", |
| 271 | len, (unsigned long long)dma_pages[0], |
| 272 | pageoff, mr->mr_nents, rc); |
| 273 | rpcrdma_mr_unmap_and_put(mr); |
| 274 | return ERR_PTR(-EIO); |
| 275 | } |
| 276 | |
| 277 | /* Post Send WR containing the RPC Call message. |
| 278 | */ |
| 279 | static int |
| 280 | fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) |
| 281 | { |
| 282 | return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, NULL); |
| 283 | } |
| 284 | |
| 285 | /* Invalidate all memory regions that were registered for "req". |
| 286 | * |
| 287 | * Sleeps until it is safe for the host CPU to access the |
| 288 | * previously mapped memory regions. |
| 289 | * |
| 290 | * Caller ensures that @mrs is not empty before the call. This |
| 291 | * function empties the list. |
| 292 | */ |
| 293 | static void |
| 294 | fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) |
| 295 | { |
| 296 | struct rpcrdma_mr *mr; |
| 297 | LIST_HEAD(unmap_list); |
| 298 | int rc; |
| 299 | |
| 300 | /* ORDER: Invalidate all of the req's MRs first |
| 301 | * |
| 302 | * ib_unmap_fmr() is slow, so use a single call instead |
| 303 | * of one call per mapped FMR. |
| 304 | */ |
| 305 | list_for_each_entry(mr, mrs, mr_list) { |
| 306 | dprintk("RPC: %s: unmapping fmr %p\n", |
| 307 | __func__, &mr->fmr); |
| 308 | trace_xprtrdma_localinv(mr); |
| 309 | list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); |
| 310 | } |
| 311 | r_xprt->rx_stats.local_inv_needed++; |
| 312 | rc = ib_unmap_fmr(&unmap_list); |
| 313 | if (rc) |
| 314 | goto out_reset; |
| 315 | |
| 316 | /* ORDER: Now DMA unmap all of the req's MRs, and return |
| 317 | * them to the free MW list. |
| 318 | */ |
| 319 | while (!list_empty(mrs)) { |
| 320 | mr = rpcrdma_mr_pop(mrs); |
| 321 | list_del(&mr->fmr.fm_mr->list); |
| 322 | rpcrdma_mr_unmap_and_put(mr); |
| 323 | } |
| 324 | |
| 325 | return; |
| 326 | |
| 327 | out_reset: |
| 328 | pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); |
| 329 | |
| 330 | while (!list_empty(mrs)) { |
| 331 | mr = rpcrdma_mr_pop(mrs); |
| 332 | list_del(&mr->fmr.fm_mr->list); |
| 333 | fmr_op_recover_mr(mr); |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { |
| 338 | .ro_map = fmr_op_map, |
| 339 | .ro_send = fmr_op_send, |
| 340 | .ro_unmap_sync = fmr_op_unmap_sync, |
| 341 | .ro_recover_mr = fmr_op_recover_mr, |
| 342 | .ro_open = fmr_op_open, |
| 343 | .ro_maxpages = fmr_op_maxpages, |
| 344 | .ro_init_mr = fmr_op_init_mr, |
| 345 | .ro_release_mr = fmr_op_release_mr, |
| 346 | .ro_displayname = "fmr", |
| 347 | .ro_send_w_inv_ok = 0, |
| 348 | }; |