1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50 #include <linux/pci.h> /* for Tavor hack below */
51
52 #include "xprt_rdma.h"
53
54 /*
55 * Globals/Macros
56 */
57
58 #ifdef RPC_DEBUG
59 # define RPCDBG_FACILITY RPCDBG_TRANS
60 #endif
61
62 /*
63 * internal functions
64 */
65
/*
 * Replies are handed off to tasklet context through one global list.
 * The tasklet function simply walks that list and invokes the
 * handler recorded in each queued reply.
 */

/* Global queue of replies waiting for the tasklet to run. */
static LIST_HEAD(rpcrdma_tasklets_g);
/* Held around every manipulation of rpcrdma_tasklets_g in this file. */
static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
74
75 static void
rpcrdma_run_tasklet(unsigned long data)76 rpcrdma_run_tasklet(unsigned long data)
77 {
78 struct rpcrdma_rep *rep;
79 void (*func)(struct rpcrdma_rep *);
80 unsigned long flags;
81
82 data = data;
83 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84 while (!list_empty(&rpcrdma_tasklets_g)) {
85 rep = list_entry(rpcrdma_tasklets_g.next,
86 struct rpcrdma_rep, rr_list);
87 list_del(&rep->rr_list);
88 func = rep->rr_func;
89 rep->rr_func = NULL;
90 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
91
92 if (func)
93 func(rep);
94 else
95 rpcrdma_recv_buffer_put(rep);
96
97 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
98 }
99 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
100 }
101
102 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
103
104 static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep * rep)105 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
106 {
107 unsigned long flags;
108
109 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112 tasklet_schedule(&rpcrdma_tasklet_g);
113 }
114
115 static void
rpcrdma_qp_async_error_upcall(struct ib_event * event,void * context)116 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
117 {
118 struct rpcrdma_ep *ep = context;
119
120 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
121 __func__, event->event, event->device->name, context);
122 if (ep->rep_connected == 1) {
123 ep->rep_connected = -EIO;
124 ep->rep_func(ep);
125 wake_up_all(&ep->rep_connect_wait);
126 }
127 }
128
129 static void
rpcrdma_cq_async_error_upcall(struct ib_event * event,void * context)130 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
131 {
132 struct rpcrdma_ep *ep = context;
133
134 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
135 __func__, event->event, event->device->name, context);
136 if (ep->rep_connected == 1) {
137 ep->rep_connected = -EIO;
138 ep->rep_func(ep);
139 wake_up_all(&ep->rep_connect_wait);
140 }
141 }
142
143 static inline
rpcrdma_event_process(struct ib_wc * wc)144 void rpcrdma_event_process(struct ib_wc *wc)
145 {
146 struct rpcrdma_rep *rep =
147 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
148
149 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
150 __func__, rep, wc->status, wc->opcode, wc->byte_len);
151
152 if (!rep) /* send or bind completion that we don't care about */
153 return;
154
155 if (IB_WC_SUCCESS != wc->status) {
156 dprintk("RPC: %s: %s WC status %X, connection lost\n",
157 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
158 wc->status);
159 rep->rr_len = ~0U;
160 rpcrdma_schedule_tasklet(rep);
161 return;
162 }
163
164 switch (wc->opcode) {
165 case IB_WC_RECV:
166 rep->rr_len = wc->byte_len;
167 ib_dma_sync_single_for_cpu(
168 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
169 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
170 /* Keep (only) the most recent credits, after check validity */
171 if (rep->rr_len >= 16) {
172 struct rpcrdma_msg *p =
173 (struct rpcrdma_msg *) rep->rr_base;
174 unsigned int credits = ntohl(p->rm_credit);
175 if (credits == 0) {
176 dprintk("RPC: %s: server"
177 " dropped credits to 0!\n", __func__);
178 /* don't deadlock */
179 credits = 1;
180 } else if (credits > rep->rr_buffer->rb_max_requests) {
181 dprintk("RPC: %s: server"
182 " over-crediting: %d (%d)\n",
183 __func__, credits,
184 rep->rr_buffer->rb_max_requests);
185 credits = rep->rr_buffer->rb_max_requests;
186 }
187 atomic_set(&rep->rr_buffer->rb_credits, credits);
188 }
189 /* fall through */
190 case IB_WC_BIND_MW:
191 rpcrdma_schedule_tasklet(rep);
192 break;
193 default:
194 dprintk("RPC: %s: unexpected WC event %X\n",
195 __func__, wc->opcode);
196 break;
197 }
198 }
199
200 static inline int
rpcrdma_cq_poll(struct ib_cq * cq)201 rpcrdma_cq_poll(struct ib_cq *cq)
202 {
203 struct ib_wc wc;
204 int rc;
205
206 for (;;) {
207 rc = ib_poll_cq(cq, 1, &wc);
208 if (rc < 0) {
209 dprintk("RPC: %s: ib_poll_cq failed %i\n",
210 __func__, rc);
211 return rc;
212 }
213 if (rc == 0)
214 break;
215
216 rpcrdma_event_process(&wc);
217 }
218
219 return 0;
220 }
221
222 /*
223 * rpcrdma_cq_event_upcall
224 *
225 * This upcall handles recv, send, bind and unbind events.
226 * It is reentrant but processes single events in order to maintain
227 * ordering of receives to keep server credits.
228 *
229 * It is the responsibility of the scheduled tasklet to return
230 * recv buffers to the pool. NOTE: this affects synchronization of
231 * connection shutdown. That is, the structures required for
232 * the completion of the reply handler must remain intact until
233 * all memory has been reclaimed.
234 *
235 * Note that send events are suppressed and do not result in an upcall.
236 */
237 static void
rpcrdma_cq_event_upcall(struct ib_cq * cq,void * context)238 rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
239 {
240 int rc;
241
242 rc = rpcrdma_cq_poll(cq);
243 if (rc)
244 return;
245
246 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
247 if (rc) {
248 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
249 __func__, rc);
250 return;
251 }
252
253 rpcrdma_cq_poll(cq);
254 }
255
#ifdef RPC_DEBUG
/*
 * Human-readable names for connection manager events, indexed by
 * the rdma_cm event number (used for debug messages only).
 */
static const char * const conn[] = {
	[RDMA_CM_EVENT_ADDR_RESOLVED]		= "address resolved",
	[RDMA_CM_EVENT_ADDR_ERROR]		= "address error",
	[RDMA_CM_EVENT_ROUTE_RESOLVED]		= "route resolved",
	[RDMA_CM_EVENT_ROUTE_ERROR]		= "route error",
	[RDMA_CM_EVENT_CONNECT_REQUEST]		= "connect request",
	[RDMA_CM_EVENT_CONNECT_RESPONSE]	= "connect response",
	[RDMA_CM_EVENT_CONNECT_ERROR]		= "connect error",
	[RDMA_CM_EVENT_UNREACHABLE]		= "unreachable",
	[RDMA_CM_EVENT_REJECTED]		= "rejected",
	[RDMA_CM_EVENT_ESTABLISHED]		= "established",
	[RDMA_CM_EVENT_DISCONNECTED]		= "disconnected",
	[RDMA_CM_EVENT_DEVICE_REMOVAL]		= "device removal"
};
#endif
272
/*
 * Connection manager event handler, registered via rdma_create_id().
 *
 * @id:    the CM id; its context is the owning struct rpcrdma_xprt
 * @event: the CM event being delivered
 *
 * Address/route resolution events record their outcome in
 * ia->ri_async_rc and complete ia->ri_done.  Connection state events
 * reset the credit count to 1, store the new state (1 when
 * established, a negative errno otherwise) in ep->rep_connected, run
 * ep->rep_func and wake everyone sleeping on ep->rep_connect_wait.
 *
 * Always returns 0.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;
	int rc;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		/*
		 * The queried attributes are only reported in log messages.
		 * Zero them first so that a failed query logs zeros rather
		 * than uninitialized stack contents.
		 */
		memset(&attr, 0, sizeof(attr));
		rc = ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		if (rc)
			dprintk("RPC: %s: ib_query_qp failed %i\n",
				__func__, rc);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		/* bound the name lookup by the table itself */
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event < ARRAY_SIZE(conn)) ?
						conn[event->event] :
						"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		/* any connection state change restarts with a single credit */
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}
370
371 static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt * xprt,struct rpcrdma_ia * ia,struct sockaddr * addr)372 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
373 struct rpcrdma_ia *ia, struct sockaddr *addr)
374 {
375 struct rdma_cm_id *id;
376 int rc;
377
378 init_completion(&ia->ri_done);
379
380 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
381 if (IS_ERR(id)) {
382 rc = PTR_ERR(id);
383 dprintk("RPC: %s: rdma_create_id() failed %i\n",
384 __func__, rc);
385 return id;
386 }
387
388 ia->ri_async_rc = -ETIMEDOUT;
389 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
390 if (rc) {
391 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
392 __func__, rc);
393 goto out;
394 }
395 wait_for_completion_interruptible_timeout(&ia->ri_done,
396 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
397 rc = ia->ri_async_rc;
398 if (rc)
399 goto out;
400
401 ia->ri_async_rc = -ETIMEDOUT;
402 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
403 if (rc) {
404 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
405 __func__, rc);
406 goto out;
407 }
408 wait_for_completion_interruptible_timeout(&ia->ri_done,
409 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
410 rc = ia->ri_async_rc;
411 if (rc)
412 goto out;
413
414 return id;
415
416 out:
417 rdma_destroy_id(id);
418 return ERR_PTR(rc);
419 }
420
421 /*
422 * Drain any cq, prior to teardown.
423 */
424 static void
rpcrdma_clean_cq(struct ib_cq * cq)425 rpcrdma_clean_cq(struct ib_cq *cq)
426 {
427 struct ib_wc wc;
428 int count = 0;
429
430 while (1 == ib_poll_cq(cq, 1, &wc))
431 ++count;
432
433 if (count)
434 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
435 __func__, count, wc.opcode);
436 }
437
438 /*
439 * Exported functions.
440 */
441
442 /*
443 * Open and initialize an Interface Adapter.
444 * o initializes fields of struct rpcrdma_ia, including
445 * interface and provider attributes and protection zone.
446 */
447 int
rpcrdma_ia_open(struct rpcrdma_xprt * xprt,struct sockaddr * addr,int memreg)448 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
449 {
450 int rc, mem_priv;
451 struct ib_device_attr devattr;
452 struct rpcrdma_ia *ia = &xprt->rx_ia;
453
454 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
455 if (IS_ERR(ia->ri_id)) {
456 rc = PTR_ERR(ia->ri_id);
457 goto out1;
458 }
459
460 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
461 if (IS_ERR(ia->ri_pd)) {
462 rc = PTR_ERR(ia->ri_pd);
463 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
464 __func__, rc);
465 goto out2;
466 }
467
468 /*
469 * Query the device to determine if the requested memory
470 * registration strategy is supported. If it isn't, set the
471 * strategy to a globally supported model.
472 */
473 rc = ib_query_device(ia->ri_id->device, &devattr);
474 if (rc) {
475 dprintk("RPC: %s: ib_query_device failed %d\n",
476 __func__, rc);
477 goto out2;
478 }
479
480 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
481 ia->ri_have_dma_lkey = 1;
482 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
483 }
484
485 switch (memreg) {
486 case RPCRDMA_MEMWINDOWS:
487 case RPCRDMA_MEMWINDOWS_ASYNC:
488 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
489 dprintk("RPC: %s: MEMWINDOWS registration "
490 "specified but not supported by adapter, "
491 "using slower RPCRDMA_REGISTER\n",
492 __func__);
493 memreg = RPCRDMA_REGISTER;
494 }
495 break;
496 case RPCRDMA_MTHCAFMR:
497 if (!ia->ri_id->device->alloc_fmr) {
498 #if RPCRDMA_PERSISTENT_REGISTRATION
499 dprintk("RPC: %s: MTHCAFMR registration "
500 "specified but not supported by adapter, "
501 "using riskier RPCRDMA_ALLPHYSICAL\n",
502 __func__);
503 memreg = RPCRDMA_ALLPHYSICAL;
504 #else
505 dprintk("RPC: %s: MTHCAFMR registration "
506 "specified but not supported by adapter, "
507 "using slower RPCRDMA_REGISTER\n",
508 __func__);
509 memreg = RPCRDMA_REGISTER;
510 #endif
511 }
512 break;
513 case RPCRDMA_FRMR:
514 /* Requires both frmr reg and local dma lkey */
515 if ((devattr.device_cap_flags &
516 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
517 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
518 #if RPCRDMA_PERSISTENT_REGISTRATION
519 dprintk("RPC: %s: FRMR registration "
520 "specified but not supported by adapter, "
521 "using riskier RPCRDMA_ALLPHYSICAL\n",
522 __func__);
523 memreg = RPCRDMA_ALLPHYSICAL;
524 #else
525 dprintk("RPC: %s: FRMR registration "
526 "specified but not supported by adapter, "
527 "using slower RPCRDMA_REGISTER\n",
528 __func__);
529 memreg = RPCRDMA_REGISTER;
530 #endif
531 }
532 break;
533 }
534
535 /*
536 * Optionally obtain an underlying physical identity mapping in
537 * order to do a memory window-based bind. This base registration
538 * is protected from remote access - that is enabled only by binding
539 * for the specific bytes targeted during each RPC operation, and
540 * revoked after the corresponding completion similar to a storage
541 * adapter.
542 */
543 switch (memreg) {
544 case RPCRDMA_BOUNCEBUFFERS:
545 case RPCRDMA_REGISTER:
546 case RPCRDMA_FRMR:
547 break;
548 #if RPCRDMA_PERSISTENT_REGISTRATION
549 case RPCRDMA_ALLPHYSICAL:
550 mem_priv = IB_ACCESS_LOCAL_WRITE |
551 IB_ACCESS_REMOTE_WRITE |
552 IB_ACCESS_REMOTE_READ;
553 goto register_setup;
554 #endif
555 case RPCRDMA_MEMWINDOWS_ASYNC:
556 case RPCRDMA_MEMWINDOWS:
557 mem_priv = IB_ACCESS_LOCAL_WRITE |
558 IB_ACCESS_MW_BIND;
559 goto register_setup;
560 case RPCRDMA_MTHCAFMR:
561 if (ia->ri_have_dma_lkey)
562 break;
563 mem_priv = IB_ACCESS_LOCAL_WRITE;
564 register_setup:
565 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
566 if (IS_ERR(ia->ri_bind_mem)) {
567 printk(KERN_ALERT "%s: ib_get_dma_mr for "
568 "phys register failed with %lX\n\t"
569 "Will continue with degraded performance\n",
570 __func__, PTR_ERR(ia->ri_bind_mem));
571 memreg = RPCRDMA_REGISTER;
572 ia->ri_bind_mem = NULL;
573 }
574 break;
575 default:
576 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
577 __func__, memreg);
578 rc = -EINVAL;
579 goto out2;
580 }
581 dprintk("RPC: %s: memory registration strategy is %d\n",
582 __func__, memreg);
583
584 /* Else will do memory reg/dereg for each chunk */
585 ia->ri_memreg_strategy = memreg;
586
587 return 0;
588 out2:
589 rdma_destroy_id(ia->ri_id);
590 ia->ri_id = NULL;
591 out1:
592 return rc;
593 }
594
595 /*
596 * Clean up/close an IA.
597 * o if event handles and PD have been initialized, free them.
598 * o close the IA
599 */
600 void
rpcrdma_ia_close(struct rpcrdma_ia * ia)601 rpcrdma_ia_close(struct rpcrdma_ia *ia)
602 {
603 int rc;
604
605 dprintk("RPC: %s: entering\n", __func__);
606 if (ia->ri_bind_mem != NULL) {
607 rc = ib_dereg_mr(ia->ri_bind_mem);
608 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
609 __func__, rc);
610 }
611 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
612 if (ia->ri_id->qp)
613 rdma_destroy_qp(ia->ri_id);
614 rdma_destroy_id(ia->ri_id);
615 ia->ri_id = NULL;
616 }
617 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
618 rc = ib_dealloc_pd(ia->ri_pd);
619 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
620 __func__, rc);
621 }
622 }
623
624 /*
625 * Create unconnected endpoint.
626 */
627 int
rpcrdma_ep_create(struct rpcrdma_ep * ep,struct rpcrdma_ia * ia,struct rpcrdma_create_data_internal * cdata)628 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
629 struct rpcrdma_create_data_internal *cdata)
630 {
631 struct ib_device_attr devattr;
632 int rc, err;
633
634 rc = ib_query_device(ia->ri_id->device, &devattr);
635 if (rc) {
636 dprintk("RPC: %s: ib_query_device failed %d\n",
637 __func__, rc);
638 return rc;
639 }
640
641 /* check provider's send/recv wr limits */
642 if (cdata->max_requests > devattr.max_qp_wr)
643 cdata->max_requests = devattr.max_qp_wr;
644
645 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
646 ep->rep_attr.qp_context = ep;
647 /* send_cq and recv_cq initialized below */
648 ep->rep_attr.srq = NULL;
649 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
650 switch (ia->ri_memreg_strategy) {
651 case RPCRDMA_FRMR:
652 /* Add room for frmr register and invalidate WRs */
653 ep->rep_attr.cap.max_send_wr *= 3;
654 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
655 return -EINVAL;
656 break;
657 case RPCRDMA_MEMWINDOWS_ASYNC:
658 case RPCRDMA_MEMWINDOWS:
659 /* Add room for mw_binds+unbinds - overkill! */
660 ep->rep_attr.cap.max_send_wr++;
661 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
662 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
663 return -EINVAL;
664 break;
665 default:
666 break;
667 }
668 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
669 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
670 ep->rep_attr.cap.max_recv_sge = 1;
671 ep->rep_attr.cap.max_inline_data = 0;
672 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
673 ep->rep_attr.qp_type = IB_QPT_RC;
674 ep->rep_attr.port_num = ~0;
675
676 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
677 "iovs: send %d recv %d\n",
678 __func__,
679 ep->rep_attr.cap.max_send_wr,
680 ep->rep_attr.cap.max_recv_wr,
681 ep->rep_attr.cap.max_send_sge,
682 ep->rep_attr.cap.max_recv_sge);
683
684 /* set trigger for requesting send completion */
685 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
686 switch (ia->ri_memreg_strategy) {
687 case RPCRDMA_MEMWINDOWS_ASYNC:
688 case RPCRDMA_MEMWINDOWS:
689 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
690 break;
691 default:
692 break;
693 }
694 if (ep->rep_cqinit <= 2)
695 ep->rep_cqinit = 0;
696 INIT_CQCOUNT(ep);
697 ep->rep_ia = ia;
698 init_waitqueue_head(&ep->rep_connect_wait);
699
700 /*
701 * Create a single cq for receive dto and mw_bind (only ever
702 * care about unbind, really). Send completions are suppressed.
703 * Use single threaded tasklet upcalls to maintain ordering.
704 */
705 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
706 rpcrdma_cq_async_error_upcall, NULL,
707 ep->rep_attr.cap.max_recv_wr +
708 ep->rep_attr.cap.max_send_wr + 1, 0);
709 if (IS_ERR(ep->rep_cq)) {
710 rc = PTR_ERR(ep->rep_cq);
711 dprintk("RPC: %s: ib_create_cq failed: %i\n",
712 __func__, rc);
713 goto out1;
714 }
715
716 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
717 if (rc) {
718 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
719 __func__, rc);
720 goto out2;
721 }
722
723 ep->rep_attr.send_cq = ep->rep_cq;
724 ep->rep_attr.recv_cq = ep->rep_cq;
725
726 /* Initialize cma parameters */
727
728 /* RPC/RDMA does not use private data */
729 ep->rep_remote_cma.private_data = NULL;
730 ep->rep_remote_cma.private_data_len = 0;
731
732 /* Client offers RDMA Read but does not initiate */
733 ep->rep_remote_cma.initiator_depth = 0;
734 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
735 ep->rep_remote_cma.responder_resources = 0;
736 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
737 ep->rep_remote_cma.responder_resources = 32;
738 else
739 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
740
741 ep->rep_remote_cma.retry_count = 7;
742 ep->rep_remote_cma.flow_control = 0;
743 ep->rep_remote_cma.rnr_retry_count = 0;
744
745 return 0;
746
747 out2:
748 err = ib_destroy_cq(ep->rep_cq);
749 if (err)
750 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
751 __func__, err);
752 out1:
753 return rc;
754 }
755
756 /*
757 * rpcrdma_ep_destroy
758 *
759 * Disconnect and destroy endpoint. After this, the only
760 * valid operations on the ep are to free it (if dynamically
761 * allocated) or re-create it.
762 *
763 * The caller's error handling must be sure to not leak the endpoint
764 * if this function fails.
765 */
766 int
rpcrdma_ep_destroy(struct rpcrdma_ep * ep,struct rpcrdma_ia * ia)767 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
768 {
769 int rc;
770
771 dprintk("RPC: %s: entering, connected is %d\n",
772 __func__, ep->rep_connected);
773
774 if (ia->ri_id->qp) {
775 rc = rpcrdma_ep_disconnect(ep, ia);
776 if (rc)
777 dprintk("RPC: %s: rpcrdma_ep_disconnect"
778 " returned %i\n", __func__, rc);
779 rdma_destroy_qp(ia->ri_id);
780 ia->ri_id->qp = NULL;
781 }
782
783 /* padding - could be done in rpcrdma_buffer_destroy... */
784 if (ep->rep_pad_mr) {
785 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
786 ep->rep_pad_mr = NULL;
787 }
788
789 rpcrdma_clean_cq(ep->rep_cq);
790 rc = ib_destroy_cq(ep->rep_cq);
791 if (rc)
792 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
793 __func__, rc);
794
795 return rc;
796 }
797
798 /*
799 * Connect unconnected endpoint.
800 */
801 int
rpcrdma_ep_connect(struct rpcrdma_ep * ep,struct rpcrdma_ia * ia)802 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
803 {
804 struct rdma_cm_id *id;
805 int rc = 0;
806 int retry_count = 0;
807
808 if (ep->rep_connected != 0) {
809 struct rpcrdma_xprt *xprt;
810 retry:
811 rc = rpcrdma_ep_disconnect(ep, ia);
812 if (rc && rc != -ENOTCONN)
813 dprintk("RPC: %s: rpcrdma_ep_disconnect"
814 " status %i\n", __func__, rc);
815 rpcrdma_clean_cq(ep->rep_cq);
816
817 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
818 id = rpcrdma_create_id(xprt, ia,
819 (struct sockaddr *)&xprt->rx_data.addr);
820 if (IS_ERR(id)) {
821 rc = PTR_ERR(id);
822 goto out;
823 }
824 /* TEMP TEMP TEMP - fail if new device:
825 * Deregister/remarshal *all* requests!
826 * Close and recreate adapter, pd, etc!
827 * Re-determine all attributes still sane!
828 * More stuff I haven't thought of!
829 * Rrrgh!
830 */
831 if (ia->ri_id->device != id->device) {
832 printk("RPC: %s: can't reconnect on "
833 "different device!\n", __func__);
834 rdma_destroy_id(id);
835 rc = -ENETDOWN;
836 goto out;
837 }
838 /* END TEMP */
839 rdma_destroy_qp(ia->ri_id);
840 rdma_destroy_id(ia->ri_id);
841 ia->ri_id = id;
842 }
843
844 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
845 if (rc) {
846 dprintk("RPC: %s: rdma_create_qp failed %i\n",
847 __func__, rc);
848 goto out;
849 }
850
851 /* XXX Tavor device performs badly with 2K MTU! */
852 if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
853 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
854 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
855 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
856 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
857 struct ib_qp_attr attr = {
858 .path_mtu = IB_MTU_1024
859 };
860 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
861 }
862 }
863
864 ep->rep_connected = 0;
865
866 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
867 if (rc) {
868 dprintk("RPC: %s: rdma_connect() failed with %i\n",
869 __func__, rc);
870 goto out;
871 }
872
873 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
874
875 /*
876 * Check state. A non-peer reject indicates no listener
877 * (ECONNREFUSED), which may be a transient state. All
878 * others indicate a transport condition which has already
879 * undergone a best-effort.
880 */
881 if (ep->rep_connected == -ECONNREFUSED
882 && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
883 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
884 goto retry;
885 }
886 if (ep->rep_connected <= 0) {
887 /* Sometimes, the only way to reliably connect to remote
888 * CMs is to use same nonzero values for ORD and IRD. */
889 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
890 (ep->rep_remote_cma.responder_resources == 0 ||
891 ep->rep_remote_cma.initiator_depth !=
892 ep->rep_remote_cma.responder_resources)) {
893 if (ep->rep_remote_cma.responder_resources == 0)
894 ep->rep_remote_cma.responder_resources = 1;
895 ep->rep_remote_cma.initiator_depth =
896 ep->rep_remote_cma.responder_resources;
897 goto retry;
898 }
899 rc = ep->rep_connected;
900 } else {
901 dprintk("RPC: %s: connected\n", __func__);
902 }
903
904 out:
905 if (rc)
906 ep->rep_connected = rc;
907 return rc;
908 }
909
910 /*
911 * rpcrdma_ep_disconnect
912 *
913 * This is separate from destroy to facilitate the ability
914 * to reconnect without recreating the endpoint.
915 *
916 * This call is not reentrant, and must not be made in parallel
917 * on the same endpoint.
918 */
919 int
rpcrdma_ep_disconnect(struct rpcrdma_ep * ep,struct rpcrdma_ia * ia)920 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
921 {
922 int rc;
923
924 rpcrdma_clean_cq(ep->rep_cq);
925 rc = rdma_disconnect(ia->ri_id);
926 if (!rc) {
927 /* returns without wait if not connected */
928 wait_event_interruptible(ep->rep_connect_wait,
929 ep->rep_connected != 1);
930 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
931 (ep->rep_connected == 1) ? "still " : "dis");
932 } else {
933 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
934 ep->rep_connected = rc;
935 }
936 return rc;
937 }
938
939 /*
940 * Initialize buffer memory
941 */
942 int
rpcrdma_buffer_create(struct rpcrdma_buffer * buf,struct rpcrdma_ep * ep,struct rpcrdma_ia * ia,struct rpcrdma_create_data_internal * cdata)943 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
944 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
945 {
946 char *p;
947 size_t len;
948 int i, rc;
949 struct rpcrdma_mw *r;
950
951 buf->rb_max_requests = cdata->max_requests;
952 spin_lock_init(&buf->rb_lock);
953 atomic_set(&buf->rb_credits, 1);
954
955 /* Need to allocate:
956 * 1. arrays for send and recv pointers
957 * 2. arrays of struct rpcrdma_req to fill in pointers
958 * 3. array of struct rpcrdma_rep for replies
959 * 4. padding, if any
960 * 5. mw's, fmr's or frmr's, if any
961 * Send/recv buffers in req/rep need to be registered
962 */
963
964 len = buf->rb_max_requests *
965 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
966 len += cdata->padding;
967 switch (ia->ri_memreg_strategy) {
968 case RPCRDMA_FRMR:
969 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
970 sizeof(struct rpcrdma_mw);
971 break;
972 case RPCRDMA_MTHCAFMR:
973 /* TBD we are perhaps overallocating here */
974 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
975 sizeof(struct rpcrdma_mw);
976 break;
977 case RPCRDMA_MEMWINDOWS_ASYNC:
978 case RPCRDMA_MEMWINDOWS:
979 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
980 sizeof(struct rpcrdma_mw);
981 break;
982 default:
983 break;
984 }
985
986 /* allocate 1, 4 and 5 in one shot */
987 p = kzalloc(len, GFP_KERNEL);
988 if (p == NULL) {
989 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
990 __func__, len);
991 rc = -ENOMEM;
992 goto out;
993 }
994 buf->rb_pool = p; /* for freeing it later */
995
996 buf->rb_send_bufs = (struct rpcrdma_req **) p;
997 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
998 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
999 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1000
1001 /*
1002 * Register the zeroed pad buffer, if any.
1003 */
1004 if (cdata->padding) {
1005 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1006 &ep->rep_pad_mr, &ep->rep_pad);
1007 if (rc)
1008 goto out;
1009 }
1010 p += cdata->padding;
1011
1012 /*
1013 * Allocate the fmr's, or mw's for mw_bind chunk registration.
1014 * We "cycle" the mw's in order to minimize rkey reuse,
1015 * and also reduce unbind-to-bind collision.
1016 */
1017 INIT_LIST_HEAD(&buf->rb_mws);
1018 r = (struct rpcrdma_mw *)p;
1019 switch (ia->ri_memreg_strategy) {
1020 case RPCRDMA_FRMR:
1021 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1022 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1023 RPCRDMA_MAX_SEGS);
1024 if (IS_ERR(r->r.frmr.fr_mr)) {
1025 rc = PTR_ERR(r->r.frmr.fr_mr);
1026 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1027 " failed %i\n", __func__, rc);
1028 goto out;
1029 }
1030 r->r.frmr.fr_pgl =
1031 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1032 RPCRDMA_MAX_SEGS);
1033 if (IS_ERR(r->r.frmr.fr_pgl)) {
1034 rc = PTR_ERR(r->r.frmr.fr_pgl);
1035 dprintk("RPC: %s: "
1036 "ib_alloc_fast_reg_page_list "
1037 "failed %i\n", __func__, rc);
1038 goto out;
1039 }
1040 list_add(&r->mw_list, &buf->rb_mws);
1041 ++r;
1042 }
1043 break;
1044 case RPCRDMA_MTHCAFMR:
1045 /* TBD we are perhaps overallocating here */
1046 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1047 static struct ib_fmr_attr fa =
1048 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1049 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1050 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1051 &fa);
1052 if (IS_ERR(r->r.fmr)) {
1053 rc = PTR_ERR(r->r.fmr);
1054 dprintk("RPC: %s: ib_alloc_fmr"
1055 " failed %i\n", __func__, rc);
1056 goto out;
1057 }
1058 list_add(&r->mw_list, &buf->rb_mws);
1059 ++r;
1060 }
1061 break;
1062 case RPCRDMA_MEMWINDOWS_ASYNC:
1063 case RPCRDMA_MEMWINDOWS:
1064 /* Allocate one extra request's worth, for full cycling */
1065 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1066 r->r.mw = ib_alloc_mw(ia->ri_pd);
1067 if (IS_ERR(r->r.mw)) {
1068 rc = PTR_ERR(r->r.mw);
1069 dprintk("RPC: %s: ib_alloc_mw"
1070 " failed %i\n", __func__, rc);
1071 goto out;
1072 }
1073 list_add(&r->mw_list, &buf->rb_mws);
1074 ++r;
1075 }
1076 break;
1077 default:
1078 break;
1079 }
1080
1081 /*
1082 * Allocate/init the request/reply buffers. Doing this
1083 * using kmalloc for now -- one for each buf.
1084 */
1085 for (i = 0; i < buf->rb_max_requests; i++) {
1086 struct rpcrdma_req *req;
1087 struct rpcrdma_rep *rep;
1088
1089 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1090 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1091 /* Typical ~2400b, so rounding up saves work later */
1092 if (len < 4096)
1093 len = 4096;
1094 req = kmalloc(len, GFP_KERNEL);
1095 if (req == NULL) {
1096 dprintk("RPC: %s: request buffer %d alloc"
1097 " failed\n", __func__, i);
1098 rc = -ENOMEM;
1099 goto out;
1100 }
1101 memset(req, 0, sizeof(struct rpcrdma_req));
1102 buf->rb_send_bufs[i] = req;
1103 buf->rb_send_bufs[i]->rl_buffer = buf;
1104
1105 rc = rpcrdma_register_internal(ia, req->rl_base,
1106 len - offsetof(struct rpcrdma_req, rl_base),
1107 &buf->rb_send_bufs[i]->rl_handle,
1108 &buf->rb_send_bufs[i]->rl_iov);
1109 if (rc)
1110 goto out;
1111
1112 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1113
1114 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1115 rep = kmalloc(len, GFP_KERNEL);
1116 if (rep == NULL) {
1117 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1118 __func__, i);
1119 rc = -ENOMEM;
1120 goto out;
1121 }
1122 memset(rep, 0, sizeof(struct rpcrdma_rep));
1123 buf->rb_recv_bufs[i] = rep;
1124 buf->rb_recv_bufs[i]->rr_buffer = buf;
1125 init_waitqueue_head(&rep->rr_unbind);
1126
1127 rc = rpcrdma_register_internal(ia, rep->rr_base,
1128 len - offsetof(struct rpcrdma_rep, rr_base),
1129 &buf->rb_recv_bufs[i]->rr_handle,
1130 &buf->rb_recv_bufs[i]->rr_iov);
1131 if (rc)
1132 goto out;
1133
1134 }
1135 dprintk("RPC: %s: max_requests %d\n",
1136 __func__, buf->rb_max_requests);
1137 /* done */
1138 return 0;
1139 out:
1140 rpcrdma_buffer_destroy(buf);
1141 return rc;
1142 }
1143
1144 /*
1145 * Unregister and destroy buffer memory. Need to deal with
1146 * partial initialization, so it's callable from failed create.
1147 * Must be called before destroying endpoint, as registrations
1148 * reference it.
1149 */
1150 void
rpcrdma_buffer_destroy(struct rpcrdma_buffer * buf)1151 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1152 {
1153 int rc, i;
1154 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1155 struct rpcrdma_mw *r;
1156
1157 /* clean up in reverse order from create
1158 * 1. recv mr memory (mr free, then kfree)
1159 * 1a. bind mw memory
1160 * 2. send mr memory (mr free, then kfree)
1161 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1162 * 4. arrays
1163 */
1164 dprintk("RPC: %s: entering\n", __func__);
1165
1166 for (i = 0; i < buf->rb_max_requests; i++) {
1167 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1168 rpcrdma_deregister_internal(ia,
1169 buf->rb_recv_bufs[i]->rr_handle,
1170 &buf->rb_recv_bufs[i]->rr_iov);
1171 kfree(buf->rb_recv_bufs[i]);
1172 }
1173 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1174 while (!list_empty(&buf->rb_mws)) {
1175 r = list_entry(buf->rb_mws.next,
1176 struct rpcrdma_mw, mw_list);
1177 list_del(&r->mw_list);
1178 switch (ia->ri_memreg_strategy) {
1179 case RPCRDMA_FRMR:
1180 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1181 if (rc)
1182 dprintk("RPC: %s:"
1183 " ib_dereg_mr"
1184 " failed %i\n",
1185 __func__, rc);
1186 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1187 break;
1188 case RPCRDMA_MTHCAFMR:
1189 rc = ib_dealloc_fmr(r->r.fmr);
1190 if (rc)
1191 dprintk("RPC: %s:"
1192 " ib_dealloc_fmr"
1193 " failed %i\n",
1194 __func__, rc);
1195 break;
1196 case RPCRDMA_MEMWINDOWS_ASYNC:
1197 case RPCRDMA_MEMWINDOWS:
1198 rc = ib_dealloc_mw(r->r.mw);
1199 if (rc)
1200 dprintk("RPC: %s:"
1201 " ib_dealloc_mw"
1202 " failed %i\n",
1203 __func__, rc);
1204 break;
1205 default:
1206 break;
1207 }
1208 }
1209 rpcrdma_deregister_internal(ia,
1210 buf->rb_send_bufs[i]->rl_handle,
1211 &buf->rb_send_bufs[i]->rl_iov);
1212 kfree(buf->rb_send_bufs[i]);
1213 }
1214 }
1215
1216 kfree(buf->rb_pool);
1217 }
1218
1219 /*
1220 * Get a set of request/reply buffers.
1221 *
1222 * Reply buffer (if needed) is attached to send buffer upon return.
1223 * Rule:
1224 * rb_send_index and rb_recv_index MUST always be pointing to the
1225 * *next* available buffer (non-NULL). They are incremented after
1226 * removing buffers, and decremented *before* returning them.
1227 */
1228 struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer * buffers)1229 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1230 {
1231 struct rpcrdma_req *req;
1232 unsigned long flags;
1233 int i;
1234 struct rpcrdma_mw *r;
1235
1236 spin_lock_irqsave(&buffers->rb_lock, flags);
1237 if (buffers->rb_send_index == buffers->rb_max_requests) {
1238 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1239 dprintk("RPC: %s: out of request buffers\n", __func__);
1240 return ((struct rpcrdma_req *)NULL);
1241 }
1242
1243 req = buffers->rb_send_bufs[buffers->rb_send_index];
1244 if (buffers->rb_send_index < buffers->rb_recv_index) {
1245 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1246 __func__,
1247 buffers->rb_recv_index - buffers->rb_send_index);
1248 req->rl_reply = NULL;
1249 } else {
1250 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1251 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1252 }
1253 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1254 if (!list_empty(&buffers->rb_mws)) {
1255 i = RPCRDMA_MAX_SEGS - 1;
1256 do {
1257 r = list_entry(buffers->rb_mws.next,
1258 struct rpcrdma_mw, mw_list);
1259 list_del(&r->mw_list);
1260 req->rl_segments[i].mr_chunk.rl_mw = r;
1261 } while (--i >= 0);
1262 }
1263 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1264 return req;
1265 }
1266
1267 /*
1268 * Put request/reply buffers back into pool.
1269 * Pre-decrement counter/array index.
1270 */
1271 void
rpcrdma_buffer_put(struct rpcrdma_req * req)1272 rpcrdma_buffer_put(struct rpcrdma_req *req)
1273 {
1274 struct rpcrdma_buffer *buffers = req->rl_buffer;
1275 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1276 int i;
1277 unsigned long flags;
1278
1279 BUG_ON(req->rl_nchunks != 0);
1280 spin_lock_irqsave(&buffers->rb_lock, flags);
1281 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1282 req->rl_niovs = 0;
1283 if (req->rl_reply) {
1284 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1285 init_waitqueue_head(&req->rl_reply->rr_unbind);
1286 req->rl_reply->rr_func = NULL;
1287 req->rl_reply = NULL;
1288 }
1289 switch (ia->ri_memreg_strategy) {
1290 case RPCRDMA_FRMR:
1291 case RPCRDMA_MTHCAFMR:
1292 case RPCRDMA_MEMWINDOWS_ASYNC:
1293 case RPCRDMA_MEMWINDOWS:
1294 /*
1295 * Cycle mw's back in reverse order, and "spin" them.
1296 * This delays and scrambles reuse as much as possible.
1297 */
1298 i = 1;
1299 do {
1300 struct rpcrdma_mw **mw;
1301 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1302 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1303 *mw = NULL;
1304 } while (++i < RPCRDMA_MAX_SEGS);
1305 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1306 &buffers->rb_mws);
1307 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1308 break;
1309 default:
1310 break;
1311 }
1312 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1313 }
1314
1315 /*
1316 * Recover reply buffers from pool.
1317 * This happens when recovering from error conditions.
1318 * Post-increment counter/array index.
1319 */
1320 void
rpcrdma_recv_buffer_get(struct rpcrdma_req * req)1321 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1322 {
1323 struct rpcrdma_buffer *buffers = req->rl_buffer;
1324 unsigned long flags;
1325
1326 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1327 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1328 spin_lock_irqsave(&buffers->rb_lock, flags);
1329 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1330 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1331 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1332 }
1333 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1334 }
1335
1336 /*
1337 * Put reply buffers back into pool when not attached to
1338 * request. This happens in error conditions, and when
1339 * aborting unbinds. Pre-decrement counter/array index.
1340 */
1341 void
rpcrdma_recv_buffer_put(struct rpcrdma_rep * rep)1342 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1343 {
1344 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1345 unsigned long flags;
1346
1347 rep->rr_func = NULL;
1348 spin_lock_irqsave(&buffers->rb_lock, flags);
1349 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1350 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1351 }
1352
1353 /*
1354 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1355 */
1356
1357 int
rpcrdma_register_internal(struct rpcrdma_ia * ia,void * va,int len,struct ib_mr ** mrp,struct ib_sge * iov)1358 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1359 struct ib_mr **mrp, struct ib_sge *iov)
1360 {
1361 struct ib_phys_buf ipb;
1362 struct ib_mr *mr;
1363 int rc;
1364
1365 /*
1366 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1367 */
1368 iov->addr = ib_dma_map_single(ia->ri_id->device,
1369 va, len, DMA_BIDIRECTIONAL);
1370 iov->length = len;
1371
1372 if (ia->ri_have_dma_lkey) {
1373 *mrp = NULL;
1374 iov->lkey = ia->ri_dma_lkey;
1375 return 0;
1376 } else if (ia->ri_bind_mem != NULL) {
1377 *mrp = NULL;
1378 iov->lkey = ia->ri_bind_mem->lkey;
1379 return 0;
1380 }
1381
1382 ipb.addr = iov->addr;
1383 ipb.size = iov->length;
1384 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1385 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1386
1387 dprintk("RPC: %s: phys convert: 0x%llx "
1388 "registered 0x%llx length %d\n",
1389 __func__, (unsigned long long)ipb.addr,
1390 (unsigned long long)iov->addr, len);
1391
1392 if (IS_ERR(mr)) {
1393 *mrp = NULL;
1394 rc = PTR_ERR(mr);
1395 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1396 } else {
1397 *mrp = mr;
1398 iov->lkey = mr->lkey;
1399 rc = 0;
1400 }
1401
1402 return rc;
1403 }
1404
1405 int
rpcrdma_deregister_internal(struct rpcrdma_ia * ia,struct ib_mr * mr,struct ib_sge * iov)1406 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1407 struct ib_mr *mr, struct ib_sge *iov)
1408 {
1409 int rc;
1410
1411 ib_dma_unmap_single(ia->ri_id->device,
1412 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1413
1414 if (NULL == mr)
1415 return 0;
1416
1417 rc = ib_dereg_mr(mr);
1418 if (rc)
1419 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1420 return rc;
1421 }
1422
1423 /*
1424 * Wrappers for chunk registration, shared by read/write chunk code.
1425 */
1426
1427 static void
rpcrdma_map_one(struct rpcrdma_ia * ia,struct rpcrdma_mr_seg * seg,int writing)1428 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1429 {
1430 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1431 seg->mr_dmalen = seg->mr_len;
1432 if (seg->mr_page)
1433 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1434 seg->mr_page, offset_in_page(seg->mr_offset),
1435 seg->mr_dmalen, seg->mr_dir);
1436 else
1437 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1438 seg->mr_offset,
1439 seg->mr_dmalen, seg->mr_dir);
1440 }
1441
1442 static void
rpcrdma_unmap_one(struct rpcrdma_ia * ia,struct rpcrdma_mr_seg * seg)1443 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1444 {
1445 if (seg->mr_page)
1446 ib_dma_unmap_page(ia->ri_id->device,
1447 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1448 else
1449 ib_dma_unmap_single(ia->ri_id->device,
1450 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1451 }
1452
1453 static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg * seg,int * nsegs,int writing,struct rpcrdma_ia * ia,struct rpcrdma_xprt * r_xprt)1454 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1455 int *nsegs, int writing, struct rpcrdma_ia *ia,
1456 struct rpcrdma_xprt *r_xprt)
1457 {
1458 struct rpcrdma_mr_seg *seg1 = seg;
1459 struct ib_send_wr frmr_wr, *bad_wr;
1460 u8 key;
1461 int len, pageoff;
1462 int i, rc;
1463
1464 pageoff = offset_in_page(seg1->mr_offset);
1465 seg1->mr_offset -= pageoff; /* start of page */
1466 seg1->mr_len += pageoff;
1467 len = -pageoff;
1468 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1469 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1470 for (i = 0; i < *nsegs;) {
1471 rpcrdma_map_one(ia, seg, writing);
1472 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1473 len += seg->mr_len;
1474 ++seg;
1475 ++i;
1476 /* Check for holes */
1477 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1478 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1479 break;
1480 }
1481 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1482 __func__, seg1->mr_chunk.rl_mw, i);
1483
1484 /* Bump the key */
1485 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1486 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1487
1488 /* Prepare FRMR WR */
1489 memset(&frmr_wr, 0, sizeof frmr_wr);
1490 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1491 frmr_wr.send_flags = 0; /* unsignaled */
1492 frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
1493 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1494 frmr_wr.wr.fast_reg.page_list_len = i;
1495 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1496 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1497 frmr_wr.wr.fast_reg.access_flags = (writing ?
1498 IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
1499 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1500 DECR_CQCOUNT(&r_xprt->rx_ep);
1501
1502 rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
1503
1504 if (rc) {
1505 dprintk("RPC: %s: failed ib_post_send for register,"
1506 " status %i\n", __func__, rc);
1507 while (i--)
1508 rpcrdma_unmap_one(ia, --seg);
1509 } else {
1510 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1511 seg1->mr_base = seg1->mr_dma + pageoff;
1512 seg1->mr_nsegs = i;
1513 seg1->mr_len = len;
1514 }
1515 *nsegs = i;
1516 return rc;
1517 }
1518
1519 static int
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg * seg,struct rpcrdma_ia * ia,struct rpcrdma_xprt * r_xprt)1520 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1521 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1522 {
1523 struct rpcrdma_mr_seg *seg1 = seg;
1524 struct ib_send_wr invalidate_wr, *bad_wr;
1525 int rc;
1526
1527 while (seg1->mr_nsegs--)
1528 rpcrdma_unmap_one(ia, seg++);
1529
1530 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1531 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1532 invalidate_wr.send_flags = 0; /* unsignaled */
1533 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1534 DECR_CQCOUNT(&r_xprt->rx_ep);
1535
1536 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1537 if (rc)
1538 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1539 " status %i\n", __func__, rc);
1540 return rc;
1541 }
1542
1543 static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg * seg,int * nsegs,int writing,struct rpcrdma_ia * ia)1544 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1545 int *nsegs, int writing, struct rpcrdma_ia *ia)
1546 {
1547 struct rpcrdma_mr_seg *seg1 = seg;
1548 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1549 int len, pageoff, i, rc;
1550
1551 pageoff = offset_in_page(seg1->mr_offset);
1552 seg1->mr_offset -= pageoff; /* start of page */
1553 seg1->mr_len += pageoff;
1554 len = -pageoff;
1555 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1556 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1557 for (i = 0; i < *nsegs;) {
1558 rpcrdma_map_one(ia, seg, writing);
1559 physaddrs[i] = seg->mr_dma;
1560 len += seg->mr_len;
1561 ++seg;
1562 ++i;
1563 /* Check for holes */
1564 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1565 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1566 break;
1567 }
1568 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1569 physaddrs, i, seg1->mr_dma);
1570 if (rc) {
1571 dprintk("RPC: %s: failed ib_map_phys_fmr "
1572 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1573 len, (unsigned long long)seg1->mr_dma,
1574 pageoff, i, rc);
1575 while (i--)
1576 rpcrdma_unmap_one(ia, --seg);
1577 } else {
1578 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1579 seg1->mr_base = seg1->mr_dma + pageoff;
1580 seg1->mr_nsegs = i;
1581 seg1->mr_len = len;
1582 }
1583 *nsegs = i;
1584 return rc;
1585 }
1586
1587 static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg * seg,struct rpcrdma_ia * ia)1588 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1589 struct rpcrdma_ia *ia)
1590 {
1591 struct rpcrdma_mr_seg *seg1 = seg;
1592 LIST_HEAD(l);
1593 int rc;
1594
1595 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1596 rc = ib_unmap_fmr(&l);
1597 while (seg1->mr_nsegs--)
1598 rpcrdma_unmap_one(ia, seg++);
1599 if (rc)
1600 dprintk("RPC: %s: failed ib_unmap_fmr,"
1601 " status %i\n", __func__, rc);
1602 return rc;
1603 }
1604
1605 static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg * seg,int * nsegs,int writing,struct rpcrdma_ia * ia,struct rpcrdma_xprt * r_xprt)1606 rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1607 int *nsegs, int writing, struct rpcrdma_ia *ia,
1608 struct rpcrdma_xprt *r_xprt)
1609 {
1610 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1611 IB_ACCESS_REMOTE_READ);
1612 struct ib_mw_bind param;
1613 int rc;
1614
1615 *nsegs = 1;
1616 rpcrdma_map_one(ia, seg, writing);
1617 param.mr = ia->ri_bind_mem;
1618 param.wr_id = 0ULL; /* no send cookie */
1619 param.addr = seg->mr_dma;
1620 param.length = seg->mr_len;
1621 param.send_flags = 0;
1622 param.mw_access_flags = mem_priv;
1623
1624 DECR_CQCOUNT(&r_xprt->rx_ep);
1625 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m);
1626 if (rc) {
1627 dprintk("RPC: %s: failed ib_bind_mw "
1628 "%u@0x%llx status %i\n",
1629 __func__, seg->mr_len,
1630 (unsigned long long)seg->mr_dma, rc);
1631 rpcrdma_unmap_one(ia, seg);
1632 } else {
1633 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1634 seg->mr_base = param.addr;
1635 seg->mr_nsegs = 1;
1636 }
1637 return rc;
1638 }
1639
1640 static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg * seg,struct rpcrdma_ia * ia,struct rpcrdma_xprt * r_xprt,void ** r)1641 rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1642 struct rpcrdma_ia *ia,
1643 struct rpcrdma_xprt *r_xprt, void **r)
1644 {
1645 struct ib_mw_bind param;
1646 LIST_HEAD(l);
1647 int rc;
1648
1649 BUG_ON(seg->mr_nsegs != 1);
1650 param.mr = ia->ri_bind_mem;
1651 param.addr = 0ULL; /* unbind */
1652 param.length = 0;
1653 param.mw_access_flags = 0;
1654 if (*r) {
1655 param.wr_id = (u64) (unsigned long) *r;
1656 param.send_flags = IB_SEND_SIGNALED;
1657 INIT_CQCOUNT(&r_xprt->rx_ep);
1658 } else {
1659 param.wr_id = 0ULL;
1660 param.send_flags = 0;
1661 DECR_CQCOUNT(&r_xprt->rx_ep);
1662 }
1663 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m);
1664 rpcrdma_unmap_one(ia, seg);
1665 if (rc)
1666 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1667 " status %i\n", __func__, rc);
1668 else
1669 *r = NULL; /* will upcall on completion */
1670 return rc;
1671 }
1672
1673 static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg * seg,int * nsegs,int writing,struct rpcrdma_ia * ia)1674 rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1675 int *nsegs, int writing, struct rpcrdma_ia *ia)
1676 {
1677 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1678 IB_ACCESS_REMOTE_READ);
1679 struct rpcrdma_mr_seg *seg1 = seg;
1680 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1681 int len, i, rc = 0;
1682
1683 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1684 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1685 for (len = 0, i = 0; i < *nsegs;) {
1686 rpcrdma_map_one(ia, seg, writing);
1687 ipb[i].addr = seg->mr_dma;
1688 ipb[i].size = seg->mr_len;
1689 len += seg->mr_len;
1690 ++seg;
1691 ++i;
1692 /* Check for holes */
1693 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1694 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1695 break;
1696 }
1697 seg1->mr_base = seg1->mr_dma;
1698 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1699 ipb, i, mem_priv, &seg1->mr_base);
1700 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1701 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1702 dprintk("RPC: %s: failed ib_reg_phys_mr "
1703 "%u@0x%llx (%d)... status %i\n",
1704 __func__, len,
1705 (unsigned long long)seg1->mr_dma, i, rc);
1706 while (i--)
1707 rpcrdma_unmap_one(ia, --seg);
1708 } else {
1709 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1710 seg1->mr_nsegs = i;
1711 seg1->mr_len = len;
1712 }
1713 *nsegs = i;
1714 return rc;
1715 }
1716
1717 static int
rpcrdma_deregister_default_external(struct rpcrdma_mr_seg * seg,struct rpcrdma_ia * ia)1718 rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1719 struct rpcrdma_ia *ia)
1720 {
1721 struct rpcrdma_mr_seg *seg1 = seg;
1722 int rc;
1723
1724 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1725 seg1->mr_chunk.rl_mr = NULL;
1726 while (seg1->mr_nsegs--)
1727 rpcrdma_unmap_one(ia, seg++);
1728 if (rc)
1729 dprintk("RPC: %s: failed ib_dereg_mr,"
1730 " status %i\n", __func__, rc);
1731 return rc;
1732 }
1733
1734 int
rpcrdma_register_external(struct rpcrdma_mr_seg * seg,int nsegs,int writing,struct rpcrdma_xprt * r_xprt)1735 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1736 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1737 {
1738 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1739 int rc = 0;
1740
1741 switch (ia->ri_memreg_strategy) {
1742
1743 #if RPCRDMA_PERSISTENT_REGISTRATION
1744 case RPCRDMA_ALLPHYSICAL:
1745 rpcrdma_map_one(ia, seg, writing);
1746 seg->mr_rkey = ia->ri_bind_mem->rkey;
1747 seg->mr_base = seg->mr_dma;
1748 seg->mr_nsegs = 1;
1749 nsegs = 1;
1750 break;
1751 #endif
1752
1753 /* Registration using frmr registration */
1754 case RPCRDMA_FRMR:
1755 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1756 break;
1757
1758 /* Registration using fmr memory registration */
1759 case RPCRDMA_MTHCAFMR:
1760 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1761 break;
1762
1763 /* Registration using memory windows */
1764 case RPCRDMA_MEMWINDOWS_ASYNC:
1765 case RPCRDMA_MEMWINDOWS:
1766 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1767 break;
1768
1769 /* Default registration each time */
1770 default:
1771 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1772 break;
1773 }
1774 if (rc)
1775 return -1;
1776
1777 return nsegs;
1778 }
1779
1780 int
rpcrdma_deregister_external(struct rpcrdma_mr_seg * seg,struct rpcrdma_xprt * r_xprt,void * r)1781 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1782 struct rpcrdma_xprt *r_xprt, void *r)
1783 {
1784 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1785 int nsegs = seg->mr_nsegs, rc;
1786
1787 switch (ia->ri_memreg_strategy) {
1788
1789 #if RPCRDMA_PERSISTENT_REGISTRATION
1790 case RPCRDMA_ALLPHYSICAL:
1791 BUG_ON(nsegs != 1);
1792 rpcrdma_unmap_one(ia, seg);
1793 rc = 0;
1794 break;
1795 #endif
1796
1797 case RPCRDMA_FRMR:
1798 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1799 break;
1800
1801 case RPCRDMA_MTHCAFMR:
1802 rc = rpcrdma_deregister_fmr_external(seg, ia);
1803 break;
1804
1805 case RPCRDMA_MEMWINDOWS_ASYNC:
1806 case RPCRDMA_MEMWINDOWS:
1807 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1808 break;
1809
1810 default:
1811 rc = rpcrdma_deregister_default_external(seg, ia);
1812 break;
1813 }
1814 if (r) {
1815 struct rpcrdma_rep *rep = r;
1816 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1817 rep->rr_func = NULL;
1818 func(rep); /* dereg done, callback now */
1819 }
1820 return nsegs;
1821 }
1822
1823 /*
1824 * Prepost any receive buffer, then post send.
1825 *
1826 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1827 */
1828 int
rpcrdma_ep_post(struct rpcrdma_ia * ia,struct rpcrdma_ep * ep,struct rpcrdma_req * req)1829 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1830 struct rpcrdma_ep *ep,
1831 struct rpcrdma_req *req)
1832 {
1833 struct ib_send_wr send_wr, *send_wr_fail;
1834 struct rpcrdma_rep *rep = req->rl_reply;
1835 int rc;
1836
1837 if (rep) {
1838 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1839 if (rc)
1840 goto out;
1841 req->rl_reply = NULL;
1842 }
1843
1844 send_wr.next = NULL;
1845 send_wr.wr_id = 0ULL; /* no send cookie */
1846 send_wr.sg_list = req->rl_send_iov;
1847 send_wr.num_sge = req->rl_niovs;
1848 send_wr.opcode = IB_WR_SEND;
1849 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1850 ib_dma_sync_single_for_device(ia->ri_id->device,
1851 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1852 DMA_TO_DEVICE);
1853 ib_dma_sync_single_for_device(ia->ri_id->device,
1854 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1855 DMA_TO_DEVICE);
1856 ib_dma_sync_single_for_device(ia->ri_id->device,
1857 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1858 DMA_TO_DEVICE);
1859
1860 if (DECR_CQCOUNT(ep) > 0)
1861 send_wr.send_flags = 0;
1862 else { /* Provider must take a send completion every now and then */
1863 INIT_CQCOUNT(ep);
1864 send_wr.send_flags = IB_SEND_SIGNALED;
1865 }
1866
1867 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1868 if (rc)
1869 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1870 rc);
1871 out:
1872 return rc;
1873 }
1874
1875 /*
1876 * (Re)post a receive buffer.
1877 */
1878 int
rpcrdma_ep_post_recv(struct rpcrdma_ia * ia,struct rpcrdma_ep * ep,struct rpcrdma_rep * rep)1879 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1880 struct rpcrdma_ep *ep,
1881 struct rpcrdma_rep *rep)
1882 {
1883 struct ib_recv_wr recv_wr, *recv_wr_fail;
1884 int rc;
1885
1886 recv_wr.next = NULL;
1887 recv_wr.wr_id = (u64) (unsigned long) rep;
1888 recv_wr.sg_list = &rep->rr_iov;
1889 recv_wr.num_sge = 1;
1890
1891 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1892 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1893
1894 DECR_CQCOUNT(ep);
1895 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1896
1897 if (rc)
1898 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1899 rc);
1900 return rc;
1901 }
1902