1 /*
2 * Copyright(c) 2015, 2016 Intel Corporation.
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * BSD LICENSE
19 *
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
22 * are met:
23 *
24 * - Redistributions of source code must retain the above copyright
25 * notice, this list of conditions and the following disclaimer.
26 * - Redistributions in binary form must reproduce the above copyright
27 * notice, this list of conditions and the following disclaimer in
28 * the documentation and/or other materials provided with the
29 * distribution.
30 * - Neither the name of Intel Corporation nor the names of its
31 * contributors may be used to endorse or promote products derived
32 * from this software without specific prior written permission.
33 *
34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45 *
46 */
47
48 #include <linux/io.h>
49 #include <rdma/rdma_vt.h>
50 #include <rdma/rdmavt_qp.h>
51
52 #include "hfi.h"
53 #include "qp.h"
54 #include "verbs_txreq.h"
55 #include "trace.h"
56
57 /* cut down ridiculously long IB macro names */
58 #define OP(x) RC_OP(x)
59
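/*
 * restart_sge - rewind an SGE state to the offset implied by a PSN
 *
 * Reposition @ss at the point within @wqe that corresponds to @psn,
 * assuming each earlier PSN carried one @pmtu worth of payload, and
 * return the number of payload bytes that remain to be (re)sent.
 */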
60 static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
61 u32 psn, u32 pmtu)
62 {
63 u32 len;
64
65 len = delta_psn(psn, wqe->psn) * pmtu;
66 ss->sge = wqe->sg_list[0];
67 ss->sg_list = wqe->sg_list + 1;
68 ss->num_sge = wqe->wr.num_sge;
69 ss->total_len = wqe->length;
70 rvt_skip_sge(ss, len, false);
71 return wqe->length - len;
72 }
73
74 /**
75 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
76 * @dev: the device for this QP
77 * @qp: a pointer to the QP
78 * @ohdr: a pointer to the IB header being constructed
79 * @ps: the xmit packet state
80 *
81 * Return 1 if constructed; otherwise, return 0.
82 * Note that we are in the responder's side of the QP context.
83 * Note the QP s_lock must be held.
84 */
85 static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
86 struct ib_other_headers *ohdr,
87 struct hfi1_pkt_state *ps)
88 {
89 struct rvt_ack_entry *e;
90 u32 hwords;
91 u32 len;
92 u32 bth0;
93 u32 bth2;
94 int middle = 0;
95 u32 pmtu = qp->pmtu;
96 struct hfi1_qp_priv *priv = qp->priv;
97
98 lockdep_assert_held(&qp->s_lock);
99 /* Don't send an ACK if we aren't supposed to. */
100 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
101 goto bail;
102
103 if (priv->hdr_type == HFI1_PKT_TYPE_9B)
104 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
105 hwords = 5;
106 else
107 /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
108 hwords = 7;
109
110 switch (qp->s_ack_state) {
111 case OP(RDMA_READ_RESPONSE_LAST):
112 case OP(RDMA_READ_RESPONSE_ONLY):
113 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
114 if (e->rdma_sge.mr) {
115 rvt_put_mr(e->rdma_sge.mr);
116 e->rdma_sge.mr = NULL;
117 }
118 /* FALLTHROUGH */
119 case OP(ATOMIC_ACKNOWLEDGE):
120 /*
121 * We can increment the tail pointer now that the last
122 * response has been sent instead of only being
123 * constructed.
124 */
125 if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
126 qp->s_tail_ack_queue = 0;
127 /* FALLTHROUGH */
128 case OP(SEND_ONLY):
129 case OP(ACKNOWLEDGE):
130 /* Check for no next entry in the queue. */
131 if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
132 if (qp->s_flags & RVT_S_ACK_PENDING)
133 goto normal;
134 goto bail;
135 }
136
137 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
138 if (e->opcode == OP(RDMA_READ_REQUEST)) {
139 /*
140 * If an RDMA read response is being resent and
141 * we haven't seen the duplicate request yet,
142 * then stop sending the remaining responses the
143 * responder has seen until the requester re-sends it.
144 */
145 len = e->rdma_sge.sge_length;
146 if (len && !e->rdma_sge.mr) {
147 qp->s_tail_ack_queue = qp->r_head_ack_queue;
148 goto bail;
149 }
150 /* Copy SGE state in case we need to resend */
151 ps->s_txreq->mr = e->rdma_sge.mr;
152 if (ps->s_txreq->mr)
153 rvt_get_mr(ps->s_txreq->mr);
154 qp->s_ack_rdma_sge.sge = e->rdma_sge;
155 qp->s_ack_rdma_sge.num_sge = 1;
156 ps->s_txreq->ss = &qp->s_ack_rdma_sge;
157 if (len > pmtu) {
158 len = pmtu;
159 qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
160 } else {
161 qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
162 e->sent = 1;
163 }
164 ohdr->u.aeth = rvt_compute_aeth(qp);
165 hwords++;
166 qp->s_ack_rdma_psn = e->psn;
167 bth2 = mask_psn(qp->s_ack_rdma_psn++);
168 } else {
169 /* COMPARE_SWAP or FETCH_ADD */
170 ps->s_txreq->ss = NULL;
171 len = 0;
172 qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
173 ohdr->u.at.aeth = rvt_compute_aeth(qp);
174 ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
175 hwords += sizeof(ohdr->u.at) / sizeof(u32);
176 bth2 = mask_psn(e->psn);
177 e->sent = 1;
178 }
179 bth0 = qp->s_ack_state << 24;
180 break;
181
182 case OP(RDMA_READ_RESPONSE_FIRST):
183 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
184 /* FALLTHROUGH */
185 case OP(RDMA_READ_RESPONSE_MIDDLE):
186 ps->s_txreq->ss = &qp->s_ack_rdma_sge;
187 ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
188 if (ps->s_txreq->mr)
189 rvt_get_mr(ps->s_txreq->mr);
190 len = qp->s_ack_rdma_sge.sge.sge_length;
191 if (len > pmtu) {
192 len = pmtu;
193 middle = HFI1_CAP_IS_KSET(SDMA_AHG);
194 } else {
195 ohdr->u.aeth = rvt_compute_aeth(qp);
196 hwords++;
197 qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
198 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
199 e->sent = 1;
200 }
201 bth0 = qp->s_ack_state << 24;
202 bth2 = mask_psn(qp->s_ack_rdma_psn++);
203 break;
204
205 default:
206 normal:
207 /*
208 * Send a regular ACK.
209 * Set the s_ack_state so we wait until after sending
210 * the ACK before setting s_ack_state to ACKNOWLEDGE
211 * (see above).
212 */
213 qp->s_ack_state = OP(SEND_ONLY);
214 qp->s_flags &= ~RVT_S_ACK_PENDING;
215 ps->s_txreq->ss = NULL;
216 if (qp->s_nak_state)
217 ohdr->u.aeth =
218 cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
219 (qp->s_nak_state <<
220 IB_AETH_CREDIT_SHIFT));
221 else
222 ohdr->u.aeth = rvt_compute_aeth(qp);
223 hwords++;
224 len = 0;
225 bth0 = OP(ACKNOWLEDGE) << 24;
226 bth2 = mask_psn(qp->s_ack_psn);
227 }
228 qp->s_rdma_ack_cnt++;
229 qp->s_hdrwords = hwords;
230 ps->s_txreq->sde = priv->s_sde;
231 ps->s_txreq->s_cur_size = len;
232 hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
233 /* pbc */
234 ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
235 return 1;
236
237 bail:
238 qp->s_ack_state = OP(ACKNOWLEDGE);
239 /*
240 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
241 * RVT_S_RESP_PENDING
242 */
243 smp_wmb();
244 qp->s_flags &= ~(RVT_S_RESP_PENDING
245 | RVT_S_ACK_PENDING
246 | RVT_S_AHG_VALID);
247 return 0;
248 }
249
250 /**
251 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
252 * @qp: a pointer to the QP
253 *
254 * Assumes s_lock is held.
255 *
256 * Return 1 if constructed; otherwise, return 0.
257 */
258 int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
259 {
260 struct hfi1_qp_priv *priv = qp->priv;
261 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
262 struct ib_other_headers *ohdr;
263 struct rvt_sge_state *ss;
264 struct rvt_swqe *wqe;
265 u32 hwords;
266 u32 len;
267 u32 bth0 = 0;
268 u32 bth2;
269 u32 pmtu = qp->pmtu;
270 char newreq;
271 int middle = 0;
272 int delta;
273
274 lockdep_assert_held(&qp->s_lock);
275 ps->s_txreq = get_txreq(ps->dev, qp);
276 if (!ps->s_txreq)
277 goto bail_no_tx;
278
279 ps->s_txreq->phdr.hdr.hdr_type = priv->hdr_type;
280 if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
281 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
282 hwords = 5;
283 if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
284 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth;
285 else
286 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
287 } else {
288 /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
289 hwords = 7;
290 if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
291 (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))))
292 ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth;
293 else
294 ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth;
295 }
296
297 /* Sending responses takes priority over sending requests. */
298 if ((qp->s_flags & RVT_S_RESP_PENDING) &&
299 make_rc_ack(dev, qp, ohdr, ps))
300 return 1;
301
302 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
303 if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
304 goto bail;
305 /* We are in the error state, flush the work request. */
306 smp_read_barrier_depends(); /* see post_one_send() */
307 if (qp->s_last == READ_ONCE(qp->s_head))
308 goto bail;
309 /* If DMAs are in progress, we can't flush immediately. */
310 if (iowait_sdma_pending(&priv->s_iowait)) {
311 qp->s_flags |= RVT_S_WAIT_DMA;
312 goto bail;
313 }
314 clear_ahg(qp);
315 wqe = rvt_get_swqe_ptr(qp, qp->s_last);
316 hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
317 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
318 /* will get called again */
319 goto done_free_tx;
320 }
321
322 if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
323 goto bail;
324
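/*
 * s_psn at or below s_sending_hpsn means we would be rebuilding a PSN
 * that may still be in the send engine: wait (RVT_S_WAIT_PSN) while
 * sends are outstanding, otherwise reset the sending window.
 */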
325 if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
326 if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
327 qp->s_flags |= RVT_S_WAIT_PSN;
328 goto bail;
329 }
330 qp->s_sending_psn = qp->s_psn;
331 qp->s_sending_hpsn = qp->s_psn - 1;
332 }
333
334 /* Send a request. */
335 wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
336 switch (qp->s_state) {
337 default:
338 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
339 goto bail;
340 /*
341 * Resend an old request or start a new one.
342 *
343 * We keep track of the current SWQE so that
344 * we don't reset the "furthest progress" state
345 * if we need to back up.
346 */
347 newreq = 0;
348 if (qp->s_cur == qp->s_tail) {
349 /* Check if send work queue is empty. */
350 smp_read_barrier_depends(); /* see post_one_send() */
351 if (qp->s_tail == READ_ONCE(qp->s_head)) {
352 clear_ahg(qp);
353 goto bail;
354 }
355 /*
356 * If a fence is requested, wait for previous
357 * RDMA read and atomic operations to finish.
358 */
359 if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
360 qp->s_num_rd_atomic) {
361 qp->s_flags |= RVT_S_WAIT_FENCE;
362 goto bail;
363 }
364 /*
365 * Local operations are processed immediately
366 * after all prior requests have completed
367 */
368 if (wqe->wr.opcode == IB_WR_REG_MR ||
369 wqe->wr.opcode == IB_WR_LOCAL_INV) {
370 int local_ops = 0;
371 int err = 0;
372
373 if (qp->s_last != qp->s_cur)
374 goto bail;
375 if (++qp->s_cur == qp->s_size)
376 qp->s_cur = 0;
377 if (++qp->s_tail == qp->s_size)
378 qp->s_tail = 0;
379 if (!(wqe->wr.send_flags &
380 RVT_SEND_COMPLETION_ONLY)) {
381 err = rvt_invalidate_rkey(
382 qp,
383 wqe->wr.ex.invalidate_rkey);
384 local_ops = 1;
385 }
386 hfi1_send_complete(qp, wqe,
387 err ? IB_WC_LOC_PROT_ERR
388 : IB_WC_SUCCESS);
389 if (local_ops)
390 atomic_dec(&qp->local_ops_pending);
391 qp->s_hdrwords = 0;
392 goto done_free_tx;
393 }
394
395 newreq = 1;
396 qp->s_psn = wqe->psn;
397 }
398 /*
399 * Note that we have to be careful not to modify the
400 * original work request since we may need to resend
401 * it.
402 */
403 len = wqe->length;
404 ss = &qp->s_sge;
405 bth2 = mask_psn(qp->s_psn);
406 switch (wqe->wr.opcode) {
407 case IB_WR_SEND:
408 case IB_WR_SEND_WITH_IMM:
409 case IB_WR_SEND_WITH_INV:
410 /* If no credit, return. */
411 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
412 rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
413 qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
414 goto bail;
415 }
416 if (len > pmtu) {
417 qp->s_state = OP(SEND_FIRST);
418 len = pmtu;
419 break;
420 }
421 if (wqe->wr.opcode == IB_WR_SEND) {
422 qp->s_state = OP(SEND_ONLY);
423 } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
424 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
425 /* Immediate data comes after the BTH */
426 ohdr->u.imm_data = wqe->wr.ex.imm_data;
427 hwords += 1;
428 } else {
429 qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
430 /* Invalidate rkey comes after the BTH */
431 ohdr->u.ieth = cpu_to_be32(
432 wqe->wr.ex.invalidate_rkey);
433 hwords += 1;
434 }
435 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
436 bth0 |= IB_BTH_SOLICITED;
437 bth2 |= IB_BTH_REQ_ACK;
438 if (++qp->s_cur == qp->s_size)
439 qp->s_cur = 0;
440 break;
441
442 case IB_WR_RDMA_WRITE:
443 if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
444 qp->s_lsn++;
445 goto no_flow_control;
446 case IB_WR_RDMA_WRITE_WITH_IMM:
447 /* If no credit, return. */
448 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
449 rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
450 qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
451 goto bail;
452 }
453 no_flow_control:
454 put_ib_reth_vaddr(
455 wqe->rdma_wr.remote_addr,
456 &ohdr->u.rc.reth);
457 ohdr->u.rc.reth.rkey =
458 cpu_to_be32(wqe->rdma_wr.rkey);
459 ohdr->u.rc.reth.length = cpu_to_be32(len);
460 hwords += sizeof(struct ib_reth) / sizeof(u32);
461 if (len > pmtu) {
462 qp->s_state = OP(RDMA_WRITE_FIRST);
463 len = pmtu;
464 break;
465 }
466 if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
467 qp->s_state = OP(RDMA_WRITE_ONLY);
468 } else {
469 qp->s_state =
470 OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
471 /* Immediate data comes after RETH */
472 ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
473 hwords += 1;
474 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
475 bth0 |= IB_BTH_SOLICITED;
476 }
477 bth2 |= IB_BTH_REQ_ACK;
478 if (++qp->s_cur == qp->s_size)
479 qp->s_cur = 0;
480 break;
481
482 case IB_WR_RDMA_READ:
483 /*
484 * Don't allow more operations to be started
485 * than the QP limits allow.
486 */
487 if (newreq) {
488 if (qp->s_num_rd_atomic >=
489 qp->s_max_rd_atomic) {
490 qp->s_flags |= RVT_S_WAIT_RDMAR;
491 goto bail;
492 }
493 qp->s_num_rd_atomic++;
494 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
495 qp->s_lsn++;
496 }
497 put_ib_reth_vaddr(
498 wqe->rdma_wr.remote_addr,
499 &ohdr->u.rc.reth);
500 ohdr->u.rc.reth.rkey =
501 cpu_to_be32(wqe->rdma_wr.rkey);
502 ohdr->u.rc.reth.length = cpu_to_be32(len);
503 qp->s_state = OP(RDMA_READ_REQUEST);
504 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
505 ss = NULL;
506 len = 0;
507 bth2 |= IB_BTH_REQ_ACK;
508 if (++qp->s_cur == qp->s_size)
509 qp->s_cur = 0;
510 break;
511
512 case IB_WR_ATOMIC_CMP_AND_SWP:
513 case IB_WR_ATOMIC_FETCH_AND_ADD:
514 /*
515 * Don't allow more operations to be started
516 * than the QP limits allow.
517 */
518 if (newreq) {
519 if (qp->s_num_rd_atomic >=
520 qp->s_max_rd_atomic) {
521 qp->s_flags |= RVT_S_WAIT_RDMAR;
522 goto bail;
523 }
524 qp->s_num_rd_atomic++;
525 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
526 qp->s_lsn++;
527 }
528 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
529 qp->s_state = OP(COMPARE_SWAP);
530 put_ib_ateth_swap(wqe->atomic_wr.swap,
531 &ohdr->u.atomic_eth);
532 put_ib_ateth_compare(wqe->atomic_wr.compare_add,
533 &ohdr->u.atomic_eth);
534 } else {
535 qp->s_state = OP(FETCH_ADD);
536 put_ib_ateth_swap(wqe->atomic_wr.compare_add,
537 &ohdr->u.atomic_eth);
538 put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
539 }
540 put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
541 &ohdr->u.atomic_eth);
542 ohdr->u.atomic_eth.rkey = cpu_to_be32(
543 wqe->atomic_wr.rkey);
544 hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
545 ss = NULL;
546 len = 0;
547 bth2 |= IB_BTH_REQ_ACK;
548 if (++qp->s_cur == qp->s_size)
549 qp->s_cur = 0;
550 break;
551
552 default:
553 goto bail;
554 }
555 qp->s_sge.sge = wqe->sg_list[0];
556 qp->s_sge.sg_list = wqe->sg_list + 1;
557 qp->s_sge.num_sge = wqe->wr.num_sge;
558 qp->s_sge.total_len = wqe->length;
559 qp->s_len = wqe->length;
560 if (newreq) {
561 qp->s_tail++;
562 if (qp->s_tail >= qp->s_size)
563 qp->s_tail = 0;
564 }
565 if (wqe->wr.opcode == IB_WR_RDMA_READ)
566 qp->s_psn = wqe->lpsn + 1;
567 else
568 qp->s_psn++;
569 break;
570
571 case OP(RDMA_READ_RESPONSE_FIRST):
572 /*
573 * qp->s_state is normally set to the opcode of the
574 * last packet constructed for new requests and therefore
575 * is never set to RDMA read response.
576 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
577 * thread to indicate a SEND needs to be restarted from an
578 * earlier PSN without interfering with the sending thread.
579 * See restart_rc().
580 */
581 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
582 /* FALLTHROUGH */
583 case OP(SEND_FIRST):
584 qp->s_state = OP(SEND_MIDDLE);
585 /* FALLTHROUGH */
586 case OP(SEND_MIDDLE):
587 bth2 = mask_psn(qp->s_psn++);
588 ss = &qp->s_sge;
589 len = qp->s_len;
590 if (len > pmtu) {
591 len = pmtu;
592 middle = HFI1_CAP_IS_KSET(SDMA_AHG);
593 break;
594 }
595 if (wqe->wr.opcode == IB_WR_SEND) {
596 qp->s_state = OP(SEND_LAST);
597 } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
598 qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
599 /* Immediate data comes after the BTH */
600 ohdr->u.imm_data = wqe->wr.ex.imm_data;
601 hwords += 1;
602 } else {
603 qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
604 /* invalidate data comes after the BTH */
605 ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
606 hwords += 1;
607 }
608 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
609 bth0 |= IB_BTH_SOLICITED;
610 bth2 |= IB_BTH_REQ_ACK;
611 qp->s_cur++;
612 if (qp->s_cur >= qp->s_size)
613 qp->s_cur = 0;
614 break;
615
616 case OP(RDMA_READ_RESPONSE_LAST):
617 /*
618 * qp->s_state is normally set to the opcode of the
619 * last packet constructed for new requests and therefore
620 * is never set to RDMA read response.
621 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
622 * thread to indicate an RDMA write needs to be restarted from
623 * an earlier PSN without interfering with the sending thread.
624 * See restart_rc().
625 */
626 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
627 /* FALLTHROUGH */
628 case OP(RDMA_WRITE_FIRST):
629 qp->s_state = OP(RDMA_WRITE_MIDDLE);
630 /* FALLTHROUGH */
631 case OP(RDMA_WRITE_MIDDLE):
632 bth2 = mask_psn(qp->s_psn++);
633 ss = &qp->s_sge;
634 len = qp->s_len;
635 if (len > pmtu) {
636 len = pmtu;
637 middle = HFI1_CAP_IS_KSET(SDMA_AHG);
638 break;
639 }
640 if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
641 qp->s_state = OP(RDMA_WRITE_LAST);
642 } else {
643 qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
644 /* Immediate data comes after the BTH */
645 ohdr->u.imm_data = wqe->wr.ex.imm_data;
646 hwords += 1;
647 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
648 bth0 |= IB_BTH_SOLICITED;
649 }
650 bth2 |= IB_BTH_REQ_ACK;
651 qp->s_cur++;
652 if (qp->s_cur >= qp->s_size)
653 qp->s_cur = 0;
654 break;
655
656 case OP(RDMA_READ_RESPONSE_MIDDLE):
657 /*
658 * qp->s_state is normally set to the opcode of the
659 * last packet constructed for new requests and therefore
660 * is never set to RDMA read response.
661 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
662 * thread to indicate an RDMA read needs to be restarted from
663 * an earlier PSN without interfering with the sending thread.
664 * See restart_rc().
665 */
666 len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
667 put_ib_reth_vaddr(
668 wqe->rdma_wr.remote_addr + len,
669 &ohdr->u.rc.reth);
670 ohdr->u.rc.reth.rkey =
671 cpu_to_be32(wqe->rdma_wr.rkey);
672 ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
673 qp->s_state = OP(RDMA_READ_REQUEST);
674 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
675 bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
676 qp->s_psn = wqe->lpsn + 1;
677 ss = NULL;
678 len = 0;
679 qp->s_cur++;
680 if (qp->s_cur == qp->s_size)
681 qp->s_cur = 0;
682 break;
683 }
684 qp->s_sending_hpsn = bth2;
685 delta = delta_psn(bth2, wqe->psn);
686 if (delta && delta % HFI1_PSN_CREDIT == 0)
687 bth2 |= IB_BTH_REQ_ACK;
688 if (qp->s_flags & RVT_S_SEND_ONE) {
689 qp->s_flags &= ~RVT_S_SEND_ONE;
690 qp->s_flags |= RVT_S_WAIT_ACK;
691 bth2 |= IB_BTH_REQ_ACK;
692 }
693 qp->s_len -= len;
694 qp->s_hdrwords = hwords;
695 ps->s_txreq->sde = priv->s_sde;
696 ps->s_txreq->ss = ss;
697 ps->s_txreq->s_cur_size = len;
698 hfi1_make_ruc_header(
699 qp,
700 ohdr,
701 bth0 | (qp->s_state << 24),
702 bth2,
703 middle,
704 ps);
705 /* pbc */
706 ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
707 return 1;
708
709 done_free_tx:
710 hfi1_put_txreq(ps->s_txreq);
711 ps->s_txreq = NULL;
712 return 1;
713
714 bail:
715 hfi1_put_txreq(ps->s_txreq);
716
717 bail_no_tx:
718 ps->s_txreq = NULL;
719 qp->s_flags &= ~RVT_S_BUSY;
720 qp->s_hdrwords = 0;
721 return 0;
722 }
723
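/*
 * hfi1_make_bth_aeth - fill in the BTH and AETH for an ACK or NAK
 *
 * The AETH carries either the NAK code or the credit/MSN value, while
 * bth0/bth1 supply the opcode, pkey and migration bits built by the
 * caller; bth2 gets the PSN being acknowledged.
 */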
724 static inline void hfi1_make_bth_aeth(struct rvt_qp *qp,
725 struct ib_other_headers *ohdr,
726 u32 bth0, u32 bth1)
727 {
728 if (qp->r_nak_state)
729 ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
730 (qp->r_nak_state <<
731 IB_AETH_CREDIT_SHIFT));
732 else
733 ohdr->u.aeth = rvt_compute_aeth(qp);
734
735 ohdr->bth[0] = cpu_to_be32(bth0);
736 ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn);
737 ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
738 }
739
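/*
 * hfi1_queue_rc_ack - defer an ACK/NAK to the send engine
 *
 * Mark the ACK as pending in s_flags, snapshot the NAK state and ACK
 * PSN, and schedule the send engine so make_rc_ack() emits the
 * response once resources allow.
 */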
740 static inline void hfi1_queue_rc_ack(struct rvt_qp *qp, bool is_fecn)
741 {
742 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
743 unsigned long flags;
744
745 spin_lock_irqsave(&qp->s_lock, flags);
746 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
747 goto unlock;
748 this_cpu_inc(*ibp->rvp.rc_qacks);
749 qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
750 qp->s_nak_state = qp->r_nak_state;
751 qp->s_ack_psn = qp->r_ack_psn;
752 if (is_fecn)
753 qp->s_flags |= RVT_S_ECN;
754
755 /* Schedule the send tasklet. */
756 hfi1_schedule_send(qp);
757 unlock:
758 spin_unlock_irqrestore(&qp->s_lock, flags);
759 }
760
761 static inline void hfi1_make_rc_ack_9B(struct rvt_qp *qp,
762 struct hfi1_opa_header *opa_hdr,
763 u8 sc5, bool is_fecn,
764 u64 *pbc_flags, u32 *hwords,
765 u32 *nwords)
766 {
767 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
768 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
769 struct ib_header *hdr = &opa_hdr->ibh;
770 struct ib_other_headers *ohdr;
771 u16 lrh0 = HFI1_LRH_BTH;
772 u16 pkey;
773 u32 bth0, bth1;
774
775 opa_hdr->hdr_type = HFI1_PKT_TYPE_9B;
776 ohdr = &hdr->u.oth;
777 /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
778 *hwords = 6;
779
780 if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) {
781 *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
782 rdma_ah_read_grh(&qp->remote_ah_attr),
783 *hwords - 2, SIZE_OF_CRC);
784 ohdr = &hdr->u.l.oth;
785 lrh0 = HFI1_LRH_GRH;
786 }
787 /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
788 *pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
789
790 /* read pkey_index w/o lock (it's atomic) */
791 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
792
793 lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT |
794 (rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) <<
795 IB_SL_SHIFT;
796
797 hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC,
798 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B),
799 ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr));
800
801 bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
802 if (qp->s_mig_state == IB_MIG_MIGRATED)
803 bth0 |= IB_BTH_MIG_REQ;
804 bth1 = (!!is_fecn) << IB_BECN_SHIFT;
805 hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
806 }
807
808 static inline void hfi1_make_rc_ack_16B(struct rvt_qp *qp,
809 struct hfi1_opa_header *opa_hdr,
810 u8 sc5, bool is_fecn,
811 u64 *pbc_flags, u32 *hwords,
812 u32 *nwords)
813 {
814 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
815 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
816 struct hfi1_16b_header *hdr = &opa_hdr->opah;
817 struct ib_other_headers *ohdr;
818 u32 bth0, bth1 = 0;
819 u16 len, pkey;
820 u8 becn = !!is_fecn;
821 u8 l4 = OPA_16B_L4_IB_LOCAL;
822 u8 extra_bytes;
823
824 opa_hdr->hdr_type = HFI1_PKT_TYPE_16B;
825 ohdr = &hdr->u.oth;
826 /* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */
827 *hwords = 8;
828 extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0);
829 *nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2);
830
831 if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
832 hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) {
833 *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
834 rdma_ah_read_grh(&qp->remote_ah_attr),
835 *hwords - 4, *nwords);
836 ohdr = &hdr->u.l.oth;
837 l4 = OPA_16B_L4_IB_GLOBAL;
838 }
839 *pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
840
841 /* read pkey_index w/o lock (it's atomic) */
842 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
843
844 /* Convert dwords to flits */
845 len = (*hwords + *nwords) >> 1;
846
847 hfi1_make_16b_hdr(hdr,
848 ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr),
849 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr),
850 16B),
851 len, pkey, becn, 0, l4, sc5);
852
853 bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
854 bth0 |= extra_bytes << 20;
855 if (qp->s_mig_state == IB_MIG_MIGRATED)
856 bth1 = OPA_BTH_MIG_REQ;
857 hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
858 }
859
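/* Per-packet-type ACK builder, selected by hfi1_send_rc_ack() below */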
860 typedef void (*hfi1_make_rc_ack)(struct rvt_qp *qp,
861 struct hfi1_opa_header *opa_hdr,
862 u8 sc5, bool is_fecn,
863 u64 *pbc_flags, u32 *hwords,
864 u32 *nwords);
865
866 /* We support only two types - 9B and 16B for now */
867 static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = {
868 [HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B,
869 [HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B
870 };
871
872 /**
873 * hfi1_send_rc_ack - Construct an ACK packet and send it
874 * @qp: a pointer to the QP
875 *
876 * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
877 * Note that RDMA reads and atomics are handled in the
878 * send side QP state and send engine.
879 */
880 void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd,
881 struct rvt_qp *qp, bool is_fecn)
882 {
883 struct hfi1_ibport *ibp = rcd_to_iport(rcd);
884 struct hfi1_qp_priv *priv = qp->priv;
885 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
886 u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
887 u64 pbc, pbc_flags = 0;
888 u32 hwords = 0;
889 u32 nwords = 0;
890 u32 plen;
891 struct pio_buf *pbuf;
892 struct hfi1_opa_header opa_hdr;
893
894 /* clear the defer count */
895 qp->r_adefered = 0;
896
897 /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
898 if (qp->s_flags & RVT_S_RESP_PENDING) {
899 hfi1_queue_rc_ack(qp, is_fecn);
900 return;
901 }
902
903 /* Ensure s_rdma_ack_cnt changes are committed */
904 smp_read_barrier_depends();
905 if (qp->s_rdma_ack_cnt) {
906 hfi1_queue_rc_ack(qp, is_fecn);
907 return;
908 }
909
910 /* Don't try to send ACKs if the link isn't ACTIVE */
911 if (driver_lstate(ppd) != IB_PORT_ACTIVE)
912 return;
913
914 /* Make the appropriate header */
915 hfi1_make_rc_ack_tbl[priv->hdr_type](qp, &opa_hdr, sc5, is_fecn,
916 &pbc_flags, &hwords, &nwords);
917
918 plen = 2 /* PBC */ + hwords + nwords;
919 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
920 sc_to_vlt(ppd->dd, sc5), plen);
921 pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
922 if (!pbuf) {
923 /*
924 * We have no room to send at the moment. Pass
925 * responsibility for sending the ACK to the send engine
926 * so that when enough buffer space becomes available,
927 * the ACK is sent ahead of other outgoing packets.
928 */
929 hfi1_queue_rc_ack(qp, is_fecn);
930 return;
931 }
932 trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
933 &opa_hdr, ib_is_sc5(sc5));
934
935 /* write the pbc and data */
936 ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
937 (priv->hdr_type == HFI1_PKT_TYPE_9B ?
938 (void *)&opa_hdr.ibh :
939 (void *)&opa_hdr.opah), hwords);
940 return;
941 }
942
943 /**
944 * reset_psn - reset the QP state to send starting from PSN
945 * @qp: the QP
946 * @psn: the packet sequence number to restart at
947 *
948 * This is called from hfi1_rc_rcv() to process an incoming RC ACK
949 * for the given QP.
950 * Called at interrupt level with the QP s_lock held.
951 */
952 static void reset_psn(struct rvt_qp *qp, u32 psn)
953 {
954 u32 n = qp->s_acked;
955 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
956 u32 opcode;
957
958 lockdep_assert_held(&qp->s_lock);
959 qp->s_cur = n;
960
961 /*
962 * If we are starting the request from the beginning,
963 * let the normal send code handle initialization.
964 */
965 if (cmp_psn(psn, wqe->psn) <= 0) {
966 qp->s_state = OP(SEND_LAST);
967 goto done;
968 }
969
970 /* Find the work request opcode corresponding to the given PSN. */
971 opcode = wqe->wr.opcode;
972 for (;;) {
973 int diff;
974
975 if (++n == qp->s_size)
976 n = 0;
977 if (n == qp->s_tail)
978 break;
979 wqe = rvt_get_swqe_ptr(qp, n);
980 diff = cmp_psn(psn, wqe->psn);
981 if (diff < 0)
982 break;
983 qp->s_cur = n;
984 /*
985 * If we are starting the request from the beginning,
986 * let the normal send code handle initialization.
987 */
988 if (diff == 0) {
989 qp->s_state = OP(SEND_LAST);
990 goto done;
991 }
992 opcode = wqe->wr.opcode;
993 }
994
995 /*
996 * Set the state to restart in the middle of a request.
997 * Don't change the s_sge, s_cur_sge, or s_cur_size.
998 * See hfi1_make_rc_req().
999 */
1000 switch (opcode) {
1001 case IB_WR_SEND:
1002 case IB_WR_SEND_WITH_IMM:
1003 qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
1004 break;
1005
1006 case IB_WR_RDMA_WRITE:
1007 case IB_WR_RDMA_WRITE_WITH_IMM:
1008 qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
1009 break;
1010
1011 case IB_WR_RDMA_READ:
1012 qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
1013 break;
1014
1015 default:
1016 /*
1017 * This case shouldn't happen since there is
1018 * only one PSN per request.
1019 */
1020 qp->s_state = OP(SEND_LAST);
1021 }
1022 done:
1023 qp->s_psn = psn;
1024 /*
1025 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
1026 * asynchronously before the send engine can get scheduled.
1027 * Doing it in hfi1_make_rc_req() is too late.
1028 */
1029 if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
1030 (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
1031 qp->s_flags |= RVT_S_WAIT_PSN;
1032 qp->s_flags &= ~RVT_S_AHG_VALID;
1033 }
1034
1035 /*
1036 * Back up requester to resend the last un-ACKed request.
1037 * The QP r_lock and s_lock should be held and interrupts disabled.
1038 */
1039 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
1040 {
1041 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1042 struct hfi1_ibport *ibp;
1043
1044 lockdep_assert_held(&qp->r_lock);
1045 lockdep_assert_held(&qp->s_lock);
1046 if (qp->s_retry == 0) {
1047 if (qp->s_mig_state == IB_MIG_ARMED) {
1048 hfi1_migrate_qp(qp);
1049 qp->s_retry = qp->s_retry_cnt;
1050 } else if (qp->s_last == qp->s_acked) {
1051 hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
1052 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1053 return;
1054 } else { /* need to handle delayed completion */
1055 return;
1056 }
1057 } else {
1058 qp->s_retry--;
1059 }
1060
1061 ibp = to_iport(qp->ibqp.device, qp->port_num);
1062 if (wqe->wr.opcode == IB_WR_RDMA_READ)
1063 ibp->rvp.n_rc_resends++;
1064 else
1065 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1066
1067 qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
1068 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
1069 RVT_S_WAIT_ACK);
1070 if (wait)
1071 qp->s_flags |= RVT_S_SEND_ONE;
1072 reset_psn(qp, psn);
1073 }
1074
1075 /*
1076 * Set qp->s_sending_psn to the next PSN after the given one.
1077 * This would be psn+1 except when RDMA reads are present.
1078 */
1079 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
1080 {
1081 struct rvt_swqe *wqe;
1082 u32 n = qp->s_last;
1083
1084 lockdep_assert_held(&qp->s_lock);
1085 /* Find the work request corresponding to the given PSN. */
1086 for (;;) {
1087 wqe = rvt_get_swqe_ptr(qp, n);
1088 if (cmp_psn(psn, wqe->lpsn) <= 0) {
1089 if (wqe->wr.opcode == IB_WR_RDMA_READ)
1090 qp->s_sending_psn = wqe->lpsn + 1;
1091 else
1092 qp->s_sending_psn = psn + 1;
1093 break;
1094 }
1095 if (++n == qp->s_size)
1096 n = 0;
1097 if (n == qp->s_tail)
1098 break;
1099 }
1100 }
1101
1102 /*
1103 * This should be called with the QP s_lock held and interrupts disabled.
1104 */
1105 void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
1106 {
1107 struct ib_other_headers *ohdr;
1108 struct hfi1_qp_priv *priv = qp->priv;
1109 struct rvt_swqe *wqe;
1110 struct ib_header *hdr = NULL;
1111 struct hfi1_16b_header *hdr_16b = NULL;
1112 u32 opcode;
1113 u32 psn;
1114
1115 lockdep_assert_held(&qp->s_lock);
1116 if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
1117 return;
1118
1119 /* Find out where the BTH is */
1120 if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
1121 hdr = &opah->ibh;
1122 if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
1123 ohdr = &hdr->u.oth;
1124 else
1125 ohdr = &hdr->u.l.oth;
1126 } else {
1127 u8 l4;
1128
1129 hdr_16b = &opah->opah;
1130 l4 = hfi1_16B_get_l4(hdr_16b);
1131 if (l4 == OPA_16B_L4_IB_LOCAL)
1132 ohdr = &hdr_16b->u.oth;
1133 else
1134 ohdr = &hdr_16b->u.l.oth;
1135 }
1136
1137 opcode = ib_bth_get_opcode(ohdr);
1138 if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1139 opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1140 WARN_ON(!qp->s_rdma_ack_cnt);
1141 qp->s_rdma_ack_cnt--;
1142 return;
1143 }
1144
1145 psn = ib_bth_get_psn(ohdr);
1146 reset_sending_psn(qp, psn);
1147
1148 /*
1149 * Start timer after a packet requesting an ACK has been sent and
1150 * there are still requests that haven't been acked.
1151 */
1152 if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
1153 !(qp->s_flags &
1154 (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
1155 (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
1156 rvt_add_retry_timer(qp);
1157
1158 while (qp->s_last != qp->s_acked) {
1159 u32 s_last;
1160
1161 wqe = rvt_get_swqe_ptr(qp, qp->s_last);
1162 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1163 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1164 break;
1165 rvt_qp_wqe_unreserve(qp, wqe);
1166 s_last = qp->s_last;
1167 trace_hfi1_qp_send_completion(qp, wqe, s_last);
1168 if (++s_last >= qp->s_size)
1169 s_last = 0;
1170 qp->s_last = s_last;
1171 /* see post_send() */
1172 barrier();
1173 rvt_put_swqe(wqe);
1174 rvt_qp_swqe_complete(qp,
1175 wqe,
1176 ib_hfi1_wc_opcode[wqe->wr.opcode],
1177 IB_WC_SUCCESS);
1178 }
1179 /*
1180 * If we were waiting for sends to complete before re-sending,
1181 * and they are now complete, restart sending.
1182 */
1183 trace_hfi1_sendcomplete(qp, psn);
1184 if (qp->s_flags & RVT_S_WAIT_PSN &&
1185 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1186 qp->s_flags &= ~RVT_S_WAIT_PSN;
1187 qp->s_sending_psn = qp->s_psn;
1188 qp->s_sending_hpsn = qp->s_psn - 1;
1189 hfi1_schedule_send(qp);
1190 }
1191 }
1192
1193 static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
1194 {
1195 qp->s_last_psn = psn;
1196 }
1197
1198 /*
1199 * Generate a SWQE completion.
1200 * This is similar to hfi1_send_complete but has to check to be sure
1201 * that the SGEs are not being referenced if the SWQE is being resent.
1202 */
1203 static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
1204 struct rvt_swqe *wqe,
1205 struct hfi1_ibport *ibp)
1206 {
1207 lockdep_assert_held(&qp->s_lock);
1208 /*
1209 * Don't decrement refcount and don't generate a
1210 * completion if the SWQE is being resent until the send
1211 * is finished.
1212 */
1213 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
1214 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1215 u32 s_last;
1216
1217 rvt_put_swqe(wqe);
1218 rvt_qp_wqe_unreserve(qp, wqe);
1219 s_last = qp->s_last;
1220 trace_hfi1_qp_send_completion(qp, wqe, s_last);
1221 if (++s_last >= qp->s_size)
1222 s_last = 0;
1223 qp->s_last = s_last;
1224 /* see post_send() */
1225 barrier();
1226 rvt_qp_swqe_complete(qp,
1227 wqe,
1228 ib_hfi1_wc_opcode[wqe->wr.opcode],
1229 IB_WC_SUCCESS);
1230 } else {
1231 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1232
1233 this_cpu_inc(*ibp->rvp.rc_delayed_comp);
1234 /*
1235 * If send progress not running attempt to progress
1236 * SDMA queue.
1237 */
1238 if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
1239 struct sdma_engine *engine;
1240 u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr);
1241 u8 sc5;
1242
1243 /* For now use sc to find engine */
1244 sc5 = ibp->sl_to_sc[sl];
1245 engine = qp_to_sdma_engine(qp, sc5);
1246 sdma_engine_progress_schedule(engine);
1247 }
1248 }
1249
1250 qp->s_retry = qp->s_retry_cnt;
1251 update_last_psn(qp, wqe->lpsn);
1252
1253 /*
1254 * If we are completing a request which is in the process of
1255 * being resent, we can stop re-sending it since we know the
1256 * responder has already seen it.
1257 */
1258 if (qp->s_acked == qp->s_cur) {
1259 if (++qp->s_cur >= qp->s_size)
1260 qp->s_cur = 0;
1261 qp->s_acked = qp->s_cur;
1262 wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1263 if (qp->s_acked != qp->s_tail) {
1264 qp->s_state = OP(SEND_LAST);
1265 qp->s_psn = wqe->psn;
1266 }
1267 } else {
1268 if (++qp->s_acked >= qp->s_size)
1269 qp->s_acked = 0;
1270 if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1271 qp->s_draining = 0;
1272 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1273 }
1274 return wqe;
1275 }
1276
1277 /**
1278 * do_rc_ack - process an incoming RC ACK
1279 * @qp: the QP the ACK came in on
1280 * @psn: the packet sequence number of the ACK
1281 * @opcode: the opcode of the request that resulted in the ACK
1282 *
1283 * This is called from rc_rcv_resp() to process an incoming RC ACK
1284 * for the given QP.
1285 * May be called at interrupt level, with the QP s_lock held.
1286 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1287 */
1288 static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
1289 u64 val, struct hfi1_ctxtdata *rcd)
1290 {
1291 struct hfi1_ibport *ibp;
1292 enum ib_wc_status status;
1293 struct rvt_swqe *wqe;
1294 int ret = 0;
1295 u32 ack_psn;
1296 int diff;
1297
1298 lockdep_assert_held(&qp->s_lock);
1299 /*
1300 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1301 * requests and implicitly NAK RDMA read and atomic requests issued
1302 * before the NAK'ed request. The MSN won't include the NAK'ed
1303 * request but will include any ACK'ed requests.
1304 */
1305 ack_psn = psn;
1306 if (aeth >> IB_AETH_NAK_SHIFT)
1307 ack_psn--;
1308 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1309 ibp = rcd_to_iport(rcd);
1310
1311 /*
1312 * The MSN might be for a later WQE than the PSN indicates so
1313 * only complete WQEs that the PSN finishes.
1314 */
1315 while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
1316 /*
1317 * RDMA_READ_RESPONSE_ONLY is a special case since
1318 * we want to generate completion events for everything
1319 * before the RDMA read, copy the data, then generate
1320 * the completion for the read.
1321 */
1322 if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1323 opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1324 diff == 0) {
1325 ret = 1;
1326 goto bail_stop;
1327 }
1328 /*
1329 * If this request is an RDMA read or atomic, and the ACK is
1330 * for a later operation, this ACK NAKs the RDMA read or
1331 * atomic. In other words, only an RDMA_READ_LAST or ONLY
1332 * can ACK an RDMA read, and likewise for atomic ops. Note
1333 * that the NAK case can only happen if relaxed ordering is
1334 * used and requests are sent after an RDMA read or atomic
1335 * is sent but before the response is received.
1336 */
1337 if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1338 (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1339 ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1340 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1341 (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1342 /* Retry this request. */
1343 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1344 qp->r_flags |= RVT_R_RDMAR_SEQ;
1345 hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1346 if (list_empty(&qp->rspwait)) {
1347 qp->r_flags |= RVT_R_RSP_SEND;
1348 rvt_get_qp(qp);
1349 list_add_tail(&qp->rspwait,
1350 &rcd->qp_wait_list);
1351 }
1352 }
1353 /*
1354 * No need to process the ACK/NAK since we are
1355 * restarting an earlier request.
1356 */
1357 goto bail_stop;
1358 }
1359 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1360 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1361 u64 *vaddr = wqe->sg_list[0].vaddr;
1362 *vaddr = val;
1363 }
1364 if (qp->s_num_rd_atomic &&
1365 (wqe->wr.opcode == IB_WR_RDMA_READ ||
1366 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1367 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1368 qp->s_num_rd_atomic--;
1369 /* Restart sending task if fence is complete */
1370 if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
1371 !qp->s_num_rd_atomic) {
1372 qp->s_flags &= ~(RVT_S_WAIT_FENCE |
1373 RVT_S_WAIT_ACK);
1374 hfi1_schedule_send(qp);
1375 } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
1376 qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
1377 RVT_S_WAIT_ACK);
1378 hfi1_schedule_send(qp);
1379 }
1380 }
1381 wqe = do_rc_completion(qp, wqe, ibp);
1382 if (qp->s_acked == qp->s_tail)
1383 break;
1384 }
1385
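/* The AETH NAK field selects: 0 = ACK, 1 = RNR NAK, 2 = reserved, 3 = NAK */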
1386 switch (aeth >> IB_AETH_NAK_SHIFT) {
1387 case 0: /* ACK */
1388 this_cpu_inc(*ibp->rvp.rc_acks);
1389 if (qp->s_acked != qp->s_tail) {
1390 /*
1391 * We are expecting more ACKs so
1392 * mod the retry timer.
1393 */
1394 rvt_mod_retry_timer(qp);
1395 /*
1396 * We can stop re-sending the earlier packets and
1397 * continue with the next packet the receiver wants.
1398 */
1399 if (cmp_psn(qp->s_psn, psn) <= 0)
1400 reset_psn(qp, psn + 1);
1401 } else {
1402 /* No more acks - kill all timers */
1403 rvt_stop_rc_timers(qp);
1404 if (cmp_psn(qp->s_psn, psn) <= 0) {
1405 qp->s_state = OP(SEND_LAST);
1406 qp->s_psn = psn + 1;
1407 }
1408 }
1409 if (qp->s_flags & RVT_S_WAIT_ACK) {
1410 qp->s_flags &= ~RVT_S_WAIT_ACK;
1411 hfi1_schedule_send(qp);
1412 }
1413 rvt_get_credit(qp, aeth);
1414 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1415 qp->s_retry = qp->s_retry_cnt;
1416 update_last_psn(qp, psn);
1417 return 1;
1418
1419 case 1: /* RNR NAK */
1420 ibp->rvp.n_rnr_naks++;
1421 if (qp->s_acked == qp->s_tail)
1422 goto bail_stop;
1423 if (qp->s_flags & RVT_S_WAIT_RNR)
1424 goto bail_stop;
1425 if (qp->s_rnr_retry == 0) {
1426 status = IB_WC_RNR_RETRY_EXC_ERR;
1427 goto class_b;
1428 }
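/* An RNR retry count of 7 means "retry forever"; only decrement finite counts */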
1429 if (qp->s_rnr_retry_cnt < 7)
1430 qp->s_rnr_retry--;
1431
1432 /* The last valid PSN is the previous PSN. */
1433 update_last_psn(qp, psn - 1);
1434
1435 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1436
1437 reset_psn(qp, psn);
1438
1439 qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
1440 rvt_stop_rc_timers(qp);
1441 rvt_add_rnr_timer(qp, aeth);
1442 return 0;
1443
1444 case 3: /* NAK */
1445 if (qp->s_acked == qp->s_tail)
1446 goto bail_stop;
1447 /* The last valid PSN is the previous PSN. */
1448 update_last_psn(qp, psn - 1);
1449 switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
1450 IB_AETH_CREDIT_MASK) {
1451 case 0: /* PSN sequence error */
1452 ibp->rvp.n_seq_naks++;
1453 /*
1454 * Back up to the responder's expected PSN.
1455 * Note that we might get a NAK in the middle of an
1456 * RDMA READ response which terminates the RDMA
1457 * READ.
1458 */
1459 hfi1_restart_rc(qp, psn, 0);
1460 hfi1_schedule_send(qp);
1461 break;
1462
1463 case 1: /* Invalid Request */
1464 status = IB_WC_REM_INV_REQ_ERR;
1465 ibp->rvp.n_other_naks++;
1466 goto class_b;
1467
1468 case 2: /* Remote Access Error */
1469 status = IB_WC_REM_ACCESS_ERR;
1470 ibp->rvp.n_other_naks++;
1471 goto class_b;
1472
1473 case 3: /* Remote Operation Error */
1474 status = IB_WC_REM_OP_ERR;
1475 ibp->rvp.n_other_naks++;
1476 class_b:
1477 if (qp->s_last == qp->s_acked) {
1478 hfi1_send_complete(qp, wqe, status);
1479 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1480 }
1481 break;
1482
1483 default:
1484 /* Ignore other reserved NAK error codes */
1485 goto reserved;
1486 }
1487 qp->s_retry = qp->s_retry_cnt;
1488 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1489 goto bail_stop;
1490
1491 default: /* 2: reserved */
1492 reserved:
1493 /* Ignore reserved NAK codes. */
1494 goto bail_stop;
1495 }
1496 /* cannot be reached */
1497 bail_stop:
1498 rvt_stop_rc_timers(qp);
1499 return ret;
1500 }
1501
1502 /*
1503 * We have seen an out of sequence RDMA read middle or last packet.
1504 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1505 */
1506 static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
1507 struct hfi1_ctxtdata *rcd)
1508 {
1509 struct rvt_swqe *wqe;
1510
1511 lockdep_assert_held(&qp->s_lock);
1512 /* Remove QP from retry timer */
1513 rvt_stop_rc_timers(qp);
1514
1515 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1516
1517 while (cmp_psn(psn, wqe->lpsn) > 0) {
1518 if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1519 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1520 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1521 break;
1522 wqe = do_rc_completion(qp, wqe, ibp);
1523 }
1524
1525 ibp->rvp.n_rdma_seq++;
1526 qp->r_flags |= RVT_R_RDMAR_SEQ;
1527 hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1528 if (list_empty(&qp->rspwait)) {
1529 qp->r_flags |= RVT_R_RSP_SEND;
1530 rvt_get_qp(qp);
1531 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1532 }
1533 }
1534
1535 /**
1536 * rc_rcv_resp - process an incoming RC response packet
1537 * @packet: data packet information
1538 *
1539 * This is called from hfi1_rc_rcv() to process an incoming RC response
1540 * packet for the given QP.
1541 * Called at interrupt level.
1542 */
1543 static void rc_rcv_resp(struct hfi1_packet *packet)
1544 {
1545 struct hfi1_ctxtdata *rcd = packet->rcd;
1546 void *data = packet->payload;
1547 u32 tlen = packet->tlen;
1548 struct rvt_qp *qp = packet->qp;
1549 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1550 struct ib_other_headers *ohdr = packet->ohdr;
1551 struct rvt_swqe *wqe;
1552 enum ib_wc_status status;
1553 unsigned long flags;
1554 int diff;
1555 u64 val;
1556 u32 aeth;
1557 u32 psn = ib_bth_get_psn(packet->ohdr);
1558 u32 pmtu = qp->pmtu;
1559 u16 hdrsize = packet->hlen;
1560 u8 opcode = packet->opcode;
1561 u8 pad = packet->pad;
1562 u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
1563
1564 spin_lock_irqsave(&qp->s_lock, flags);
1565 trace_hfi1_ack(qp, psn);
1566
1567 /* Ignore invalid responses. */
1568 smp_read_barrier_depends(); /* see post_one_send */
1569 if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0)
1570 goto ack_done;
1571
1572 /* Ignore duplicate responses. */
1573 diff = cmp_psn(psn, qp->s_last_psn);
1574 if (unlikely(diff <= 0)) {
1575 /* Update credits for "ghost" ACKs */
1576 if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1577 aeth = be32_to_cpu(ohdr->u.aeth);
1578 if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
1579 rvt_get_credit(qp, aeth);
1580 }
1581 goto ack_done;
1582 }
1583
1584 /*
1585 * Skip everything other than the PSN we expect, if we are waiting
1586 * for a reply to a restarted RDMA read or atomic op.
1587 */
1588 if (qp->r_flags & RVT_R_RDMAR_SEQ) {
1589 if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
1590 goto ack_done;
1591 qp->r_flags &= ~RVT_R_RDMAR_SEQ;
1592 }
1593
1594 if (unlikely(qp->s_acked == qp->s_tail))
1595 goto ack_done;
1596 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1597 status = IB_WC_SUCCESS;
1598
1599 switch (opcode) {
1600 case OP(ACKNOWLEDGE):
1601 case OP(ATOMIC_ACKNOWLEDGE):
1602 case OP(RDMA_READ_RESPONSE_FIRST):
1603 aeth = be32_to_cpu(ohdr->u.aeth);
1604 if (opcode == OP(ATOMIC_ACKNOWLEDGE))
1605 val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
1606 else
1607 val = 0;
1608 if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1609 opcode != OP(RDMA_READ_RESPONSE_FIRST))
1610 goto ack_done;
1611 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1612 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1613 goto ack_op_err;
1614 /*
1615 * If this is a response to a resent RDMA read, we
1616 * have to be careful to copy the data to the right
1617 * location.
1618 */
1619 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1620 wqe, psn, pmtu);
1621 goto read_middle;
1622
1623 case OP(RDMA_READ_RESPONSE_MIDDLE):
1624 /* no AETH, no ACK */
1625 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1626 goto ack_seq_err;
1627 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1628 goto ack_op_err;
1629 read_middle:
1630 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
1631 goto ack_len_err;
1632 if (unlikely(pmtu >= qp->s_rdma_read_len))
1633 goto ack_len_err;
1634
1635 /*
1636 * We got a response so update the timeout.
1637 * 4.096 usec. * (1 << qp->timeout)
1638 */
1639 rvt_mod_retry_timer(qp);
1640 if (qp->s_flags & RVT_S_WAIT_ACK) {
1641 qp->s_flags &= ~RVT_S_WAIT_ACK;
1642 hfi1_schedule_send(qp);
1643 }
1644
1645 if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1646 qp->s_retry = qp->s_retry_cnt;
1647
1648 /*
1649 * Update the RDMA receive state but do the copy w/o
1650 * holding the locks and blocking interrupts.
1651 */
1652 qp->s_rdma_read_len -= pmtu;
1653 update_last_psn(qp, psn);
1654 spin_unlock_irqrestore(&qp->s_lock, flags);
1655 hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false);
1656 goto bail;
1657
1658 case OP(RDMA_READ_RESPONSE_ONLY):
1659 aeth = be32_to_cpu(ohdr->u.aeth);
1660 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1661 goto ack_done;
1662 /*
1663 * Check that the data size is >= 0 && <= pmtu.
1664 * Remember to account for ICRC (4).
1665 */
1666 if (unlikely(tlen < (hdrsize + extra_bytes)))
1667 goto ack_len_err;
1668 /*
1669 * If this is a response to a resent RDMA read, we
1670 * have to be careful to copy the data to the right
1671 * location.
1672 */
1673 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1674 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1675 wqe, psn, pmtu);
1676 goto read_last;
1677
1678 case OP(RDMA_READ_RESPONSE_LAST):
1679 /* ACKs READ req. */
1680 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1681 goto ack_seq_err;
1682 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1683 goto ack_op_err;
1684 /*
1685 * Check that the data size is >= 1 && <= pmtu.
1686 * Remember to account for ICRC (4).
1687 */
1688 if (unlikely(tlen <= (hdrsize + extra_bytes)))
1689 goto ack_len_err;
1690 read_last:
1691 tlen -= hdrsize + extra_bytes;
1692 if (unlikely(tlen != qp->s_rdma_read_len))
1693 goto ack_len_err;
1694 aeth = be32_to_cpu(ohdr->u.aeth);
1695 hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false);
1696 WARN_ON(qp->s_rdma_read_sge.num_sge);
1697 (void)do_rc_ack(qp, aeth, psn,
1698 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1699 goto ack_done;
1700 }
1701
1702 ack_op_err:
1703 status = IB_WC_LOC_QP_OP_ERR;
1704 goto ack_err;
1705
1706 ack_seq_err:
1707 rdma_seq_err(qp, ibp, psn, rcd);
1708 goto ack_done;
1709
1710 ack_len_err:
1711 status = IB_WC_LOC_LEN_ERR;
1712 ack_err:
1713 if (qp->s_last == qp->s_acked) {
1714 hfi1_send_complete(qp, wqe, status);
1715 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1716 }
1717 ack_done:
1718 spin_unlock_irqrestore(&qp->s_lock, flags);
1719 bail:
1720 return;
1721 }
1722
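/*
 * rc_defered_ack - defer the NAK/ACK for this QP
 *
 * Put the QP on the receive context's wait list (taking a reference)
 * so the response is generated once the receive loop has drained the
 * queued packets; see the callers for why the send is delayed.
 */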
1723 static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
1724 struct rvt_qp *qp)
1725 {
1726 if (list_empty(&qp->rspwait)) {
1727 qp->r_flags |= RVT_R_RSP_NAK;
1728 rvt_get_qp(qp);
1729 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1730 }
1731 }
1732
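/*
 * rc_cancel_ack - drop a previously deferred NAK for this QP
 *
 * Clear the deferred-ACK count, take the QP back off the wait list
 * and drop the reference taken when it was queued.
 */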
1733 static inline void rc_cancel_ack(struct rvt_qp *qp)
1734 {
1735 qp->r_adefered = 0;
1736 if (list_empty(&qp->rspwait))
1737 return;
1738 list_del_init(&qp->rspwait);
1739 qp->r_flags &= ~RVT_R_RSP_NAK;
1740 rvt_put_qp(qp);
1741 }
1742
1743 /**
1744 * rc_rcv_error - process an incoming duplicate or error RC packet
1745 * @ohdr: the other headers for this packet
1746 * @data: the packet data
1747 * @qp: the QP for this packet
1748 * @opcode: the opcode for this packet
1749 * @psn: the packet sequence number for this packet
1750 * @diff: the difference between the PSN and the expected PSN
1751 *
1752 * This is called from hfi1_rc_rcv() to process an unexpected
1753 * incoming RC packet for the given QP.
1754 * Called at interrupt level.
1755 * Return 1 if no more processing is needed; otherwise return 0 to
1756 * schedule a response to be sent.
1757 */
1758 static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
1759 struct rvt_qp *qp, u32 opcode, u32 psn,
1760 int diff, struct hfi1_ctxtdata *rcd)
1761 {
1762 struct hfi1_ibport *ibp = rcd_to_iport(rcd);
1763 struct rvt_ack_entry *e;
1764 unsigned long flags;
1765 u8 i, prev;
1766 int old_req;
1767
1768 trace_hfi1_rcv_error(qp, psn);
1769 if (diff > 0) {
1770 /*
1771 * Packet sequence error.
1772 * A NAK will ACK earlier sends and RDMA writes.
1773 * Don't queue the NAK if we already sent one.
1774 */
1775 if (!qp->r_nak_state) {
1776 ibp->rvp.n_rc_seqnak++;
1777 qp->r_nak_state = IB_NAK_PSN_ERROR;
1778 /* Use the expected PSN. */
1779 qp->r_ack_psn = qp->r_psn;
1780 /*
1781 * Wait to send the sequence NAK until all packets
1782 * in the receive queue have been processed.
1783 * Otherwise, we end up propagating congestion.
1784 */
1785 rc_defered_ack(rcd, qp);
1786 }
1787 goto done;
1788 }
1789
1790 /*
1791 * Handle a duplicate request. Don't re-execute SEND, RDMA
1792 * write or atomic op. Don't NAK errors, just silently drop
1793 * the duplicate request. Note that r_sge, r_len, and
1794 * r_rcv_len may be in use so don't modify them.
1795 *
1796 * We are supposed to ACK the earliest duplicate PSN but we
1797 * can coalesce an outstanding duplicate ACK. We have to
1798 * send the earliest so that RDMA reads can be restarted at
1799 * the requester's expected PSN.
1800 *
1801 * First, find where this duplicate PSN falls within the
1802 * ACKs previously sent.
1803 * old_req is true if there is an older response that is scheduled
1804 * to be sent before sending this one.
1805 */
1806 e = NULL;
1807 old_req = 1;
1808 ibp->rvp.n_rc_dupreq++;
1809
1810 spin_lock_irqsave(&qp->s_lock, flags);
1811
1812 for (i = qp->r_head_ack_queue; ; i = prev) {
1813 if (i == qp->s_tail_ack_queue)
1814 old_req = 0;
1815 if (i)
1816 prev = i - 1;
1817 else
1818 prev = HFI1_MAX_RDMA_ATOMIC;
1819 if (prev == qp->r_head_ack_queue) {
1820 e = NULL;
1821 break;
1822 }
1823 e = &qp->s_ack_queue[prev];
1824 if (!e->opcode) {
1825 e = NULL;
1826 break;
1827 }
1828 if (cmp_psn(psn, e->psn) >= 0) {
1829 if (prev == qp->s_tail_ack_queue &&
1830 cmp_psn(psn, e->lpsn) <= 0)
1831 old_req = 0;
1832 break;
1833 }
1834 }
1835 switch (opcode) {
1836 case OP(RDMA_READ_REQUEST): {
1837 struct ib_reth *reth;
1838 u32 offset;
1839 u32 len;
1840
1841 /*
1842 * If we didn't find the RDMA read request in the ack queue,
1843 * we can ignore this request.
1844 */
1845 if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1846 goto unlock_done;
1847 /* RETH comes after BTH */
1848 reth = &ohdr->u.rc.reth;
1849 /*
1850 * Address range must be a subset of the original
1851 * request and start on pmtu boundaries.
1852 * We reuse the old ack_queue slot since the requester
1853 * should not back up and request an earlier PSN for the
1854 * same request.
1855 */
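/*
 * For example (illustrative, 4096-byte PMTU): an original 8192-byte
 * read at PSN 100 spans response PSNs 100-101.  A duplicate request
 * restarting at PSN 101 implies offset = 4096 and must ask for the
 * remaining 4096 bytes, so offset + len still equals the original
 * sge_length.
 */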
1856 offset = delta_psn(psn, e->psn) * qp->pmtu;
1857 len = be32_to_cpu(reth->length);
1858 if (unlikely(offset + len != e->rdma_sge.sge_length))
1859 goto unlock_done;
1860 if (e->rdma_sge.mr) {
1861 rvt_put_mr(e->rdma_sge.mr);
1862 e->rdma_sge.mr = NULL;
1863 }
1864 if (len != 0) {
1865 u32 rkey = be32_to_cpu(reth->rkey);
1866 u64 vaddr = get_ib_reth_vaddr(reth);
1867 int ok;
1868
1869 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1870 IB_ACCESS_REMOTE_READ);
1871 if (unlikely(!ok))
1872 goto unlock_done;
1873 } else {
1874 e->rdma_sge.vaddr = NULL;
1875 e->rdma_sge.length = 0;
1876 e->rdma_sge.sge_length = 0;
1877 }
1878 e->psn = psn;
1879 if (old_req)
1880 goto unlock_done;
1881 qp->s_tail_ack_queue = prev;
1882 break;
1883 }
1884
1885 case OP(COMPARE_SWAP):
1886 case OP(FETCH_ADD): {
1887 /*
1888 * If we didn't find the atomic request in the ack queue
1889 * or the send engine is already backed up to send an
1890 * earlier entry, we can ignore this request.
1891 */
1892 if (!e || e->opcode != (u8)opcode || old_req)
1893 goto unlock_done;
1894 qp->s_tail_ack_queue = prev;
1895 break;
1896 }
1897
1898 default:
1899 /*
1900 * Ignore this operation if it doesn't request an ACK
1901 * or if an earlier RDMA read or atomic is going to be resent.
1902 */
1903 if (!(psn & IB_BTH_REQ_ACK) || old_req)
1904 goto unlock_done;
1905 /*
1906 * Resend the most recent ACK if this request is
1907 * after all the previous RDMA reads and atomics.
1908 */
1909 if (i == qp->r_head_ack_queue) {
1910 spin_unlock_irqrestore(&qp->s_lock, flags);
1911 qp->r_nak_state = 0;
1912 qp->r_ack_psn = qp->r_psn - 1;
1913 goto send_ack;
1914 }
1915
1916 /*
1917 * Resend the RDMA read or atomic op which
1918 * ACKs this duplicate request.
1919 */
1920 qp->s_tail_ack_queue = i;
1921 break;
1922 }
1923 qp->s_ack_state = OP(ACKNOWLEDGE);
1924 qp->s_flags |= RVT_S_RESP_PENDING;
1925 qp->r_nak_state = 0;
1926 hfi1_schedule_send(qp);
1927
1928 unlock_done:
1929 spin_unlock_irqrestore(&qp->s_lock, flags);
1930 done:
1931 return 1;
1932
1933 send_ack:
1934 return 0;
1935 }
1936
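/*
 * update_ack_queue - advance the ack-queue tail past entry @n
 *
 * The ack queue holds HFI1_MAX_RDMA_ATOMIC + 1 entries, so the index wraps
 * to 0 instead of using a modulo.  Resetting s_ack_state lets the send
 * engine restart response generation from the new tail entry.
 */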
1937 static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
1938 {
1939 unsigned next;
1940
1941 next = n + 1;
1942 if (next > HFI1_MAX_RDMA_ATOMIC)
1943 next = 0;
1944 qp->s_tail_ack_queue = next;
1945 qp->s_ack_state = OP(ACKNOWLEDGE);
1946 }
1947
1948 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
1949 u32 lqpn, u32 rqpn, u8 svc_type)
1950 {
1951 struct opa_hfi1_cong_log_event_internal *cc_event;
1952 unsigned long flags;
1953
1954 if (sl >= OPA_MAX_SLS)
1955 return;
1956
1957 spin_lock_irqsave(&ppd->cc_log_lock, flags);
1958
1959 ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
1960 ppd->threshold_event_counter++;
1961
1962 cc_event = &ppd->cc_events[ppd->cc_log_idx++];
1963 if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
1964 ppd->cc_log_idx = 0;
1965 cc_event->lqpn = lqpn & RVT_QPN_MASK;
1966 cc_event->rqpn = rqpn & RVT_QPN_MASK;
1967 cc_event->sl = sl;
1968 cc_event->svc_type = svc_type;
1969 cc_event->rlid = rlid;
1970 /* keep timestamp in units of 1.024 usec */
1971 cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
1972
1973 spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
1974 }
1975
1976 void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn,
1977 u32 rqpn, u8 svc_type)
1978 {
1979 struct cca_timer *cca_timer;
1980 u16 ccti, ccti_incr, ccti_timer, ccti_limit;
1981 u8 trigger_threshold;
1982 struct cc_state *cc_state;
1983 unsigned long flags;
1984
1985 if (sl >= OPA_MAX_SLS)
1986 return;
1987
1988 cc_state = get_cc_state(ppd);
1989
1990 if (!cc_state)
1991 return;
1992
1993 /*
1994 * 1) increase CCTI (for this SL)
1995 * 2) select IPG (i.e., call set_link_ipg())
1996 * 3) start timer
1997 */
1998 ccti_limit = cc_state->cct.ccti_limit;
1999 ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
2000 ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
2001 trigger_threshold =
2002 cc_state->cong_setting.entries[sl].trigger_threshold;
2003
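/*
 * ccti indexes the congestion control table; a larger index selects a
 * longer inter-packet gap when set_link_ipg() is called below.
 */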
2004 spin_lock_irqsave(&ppd->cca_timer_lock, flags);
2005
2006 cca_timer = &ppd->cca_timer[sl];
2007 if (cca_timer->ccti < ccti_limit) {
2008 if (cca_timer->ccti + ccti_incr <= ccti_limit)
2009 cca_timer->ccti += ccti_incr;
2010 else
2011 cca_timer->ccti = ccti_limit;
2012 set_link_ipg(ppd);
2013 }
2014
2015 ccti = cca_timer->ccti;
2016
2017 if (!hrtimer_active(&cca_timer->hrtimer)) {
2018 /* ccti_timer is in units of 1.024 usec */
2019 unsigned long nsec = 1024 * ccti_timer;
2020
2021 hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
2022 HRTIMER_MODE_REL);
2023 }
2024
2025 spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
2026
2027 if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
2028 log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
2029 }
2030
2031 /**
2032 * hfi1_rc_rcv - process an incoming RC packet
2033 * @packet: data packet information
2034 *
2035 * This is called from qp_rcv() to process an incoming RC packet
2036 * for the given QP.
2037 * May be called at interrupt level.
2038 */
2039 void hfi1_rc_rcv(struct hfi1_packet *packet)
2040 {
2041 struct hfi1_ctxtdata *rcd = packet->rcd;
2042 void *data = packet->payload;
2043 u32 tlen = packet->tlen;
2044 struct rvt_qp *qp = packet->qp;
2045 struct hfi1_ibport *ibp = rcd_to_iport(rcd);
2046 struct ib_other_headers *ohdr = packet->ohdr;
2047 u32 bth0 = be32_to_cpu(ohdr->bth[0]);
2048 u32 opcode = packet->opcode;
2049 u32 hdrsize = packet->hlen;
2050 u32 psn = ib_bth_get_psn(packet->ohdr);
2051 u32 pad = packet->pad;
2052 struct ib_wc wc;
2053 u32 pmtu = qp->pmtu;
2054 int diff;
2055 struct ib_reth *reth;
2056 unsigned long flags;
2057 int ret;
2058 bool is_fecn = false;
2059 bool copy_last = false;
2060 u32 rkey;
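/* SIZE_OF_CRC is in 32-bit words; << 2 converts it to a byte count */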
2061 u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
2062
2063 lockdep_assert_held(&qp->r_lock);
2064
2065 if (hfi1_ruc_check_hdr(ibp, packet))
2066 return;
2067
2068 is_fecn = process_ecn(qp, packet, false);
2069
2070 /*
2071 * Process responses (ACKs) before anything else. Note that the
2072 * packet sequence number will be for something in the send work
2073 * queue rather than the expected receive packet sequence number.
2074 * In other words, this QP is the requester.
2075 */
2076 if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
2077 opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
2078 rc_rcv_resp(packet);
2079 if (is_fecn)
2080 goto send_ack;
2081 return;
2082 }
2083
2084 /* Compute 24 bits worth of difference. */
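/*
 * delta_psn() sign-extends the 24-bit difference: diff > 0 means the
 * packet is ahead of the expected PSN (a sequence error), diff < 0
 * means an earlier, duplicate PSN.
 */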
2085 diff = delta_psn(psn, qp->r_psn);
2086 if (unlikely(diff)) {
2087 if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
2088 return;
2089 goto send_ack;
2090 }
2091
2092 /* Check for opcode sequence errors. */
2093 switch (qp->r_state) {
2094 case OP(SEND_FIRST):
2095 case OP(SEND_MIDDLE):
2096 if (opcode == OP(SEND_MIDDLE) ||
2097 opcode == OP(SEND_LAST) ||
2098 opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2099 opcode == OP(SEND_LAST_WITH_INVALIDATE))
2100 break;
2101 goto nack_inv;
2102
2103 case OP(RDMA_WRITE_FIRST):
2104 case OP(RDMA_WRITE_MIDDLE):
2105 if (opcode == OP(RDMA_WRITE_MIDDLE) ||
2106 opcode == OP(RDMA_WRITE_LAST) ||
2107 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2108 break;
2109 goto nack_inv;
2110
2111 default:
2112 if (opcode == OP(SEND_MIDDLE) ||
2113 opcode == OP(SEND_LAST) ||
2114 opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2115 opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
2116 opcode == OP(RDMA_WRITE_MIDDLE) ||
2117 opcode == OP(RDMA_WRITE_LAST) ||
2118 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2119 goto nack_inv;
2120 /*
2121 * Note that it is up to the requester to not send a new
2122 * RDMA read or atomic operation before receiving an ACK
2123 * for the previous operation.
2124 */
2125 break;
2126 }
2127
2128 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2129 rvt_comm_est(qp);
2130
2131 /* OK, process the packet. */
2132 switch (opcode) {
2133 case OP(SEND_FIRST):
2134 ret = hfi1_rvt_get_rwqe(qp, 0);
2135 if (ret < 0)
2136 goto nack_op_err;
2137 if (!ret)
2138 goto rnr_nak;
2139 qp->r_rcv_len = 0;
2140 /* FALLTHROUGH */
2141 case OP(SEND_MIDDLE):
2142 case OP(RDMA_WRITE_MIDDLE):
2143 send_middle:
2144 /* Check for invalid length PMTU or posted rwqe len. */
2145 /*
2146 * There will be no padding for 9B packets, but 16B packets
2147 * will come in with some padding since we always add
2148 * CRC and LT bytes, which need to be flit aligned.
2149 */
2150 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
2151 goto nack_inv;
2152 qp->r_rcv_len += pmtu;
2153 if (unlikely(qp->r_rcv_len > qp->r_len))
2154 goto nack_inv;
2155 hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
2156 break;
2157
2158 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
2159 /* consume RWQE */
2160 ret = hfi1_rvt_get_rwqe(qp, 1);
2161 if (ret < 0)
2162 goto nack_op_err;
2163 if (!ret)
2164 goto rnr_nak;
2165 goto send_last_imm;
2166
2167 case OP(SEND_ONLY):
2168 case OP(SEND_ONLY_WITH_IMMEDIATE):
2169 case OP(SEND_ONLY_WITH_INVALIDATE):
2170 ret = hfi1_rvt_get_rwqe(qp, 0);
2171 if (ret < 0)
2172 goto nack_op_err;
2173 if (!ret)
2174 goto rnr_nak;
2175 qp->r_rcv_len = 0;
2176 if (opcode == OP(SEND_ONLY))
2177 goto no_immediate_data;
2178 if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
2179 goto send_last_inv;
2180 /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
2181 case OP(SEND_LAST_WITH_IMMEDIATE):
2182 send_last_imm:
2183 wc.ex.imm_data = ohdr->u.imm_data;
2184 wc.wc_flags = IB_WC_WITH_IMM;
2185 goto send_last;
2186 case OP(SEND_LAST_WITH_INVALIDATE):
2187 send_last_inv:
2188 rkey = be32_to_cpu(ohdr->u.ieth);
2189 if (rvt_invalidate_rkey(qp, rkey))
2190 goto no_immediate_data;
2191 wc.ex.invalidate_rkey = rkey;
2192 wc.wc_flags = IB_WC_WITH_INVALIDATE;
2193 goto send_last;
2194 case OP(RDMA_WRITE_LAST):
2195 copy_last = rvt_is_user_qp(qp);
2196 /* fall through */
2197 case OP(SEND_LAST):
2198 no_immediate_data:
2199 wc.wc_flags = 0;
2200 wc.ex.imm_data = 0;
2201 send_last:
2202 /* Check for invalid length. */
2203 /* LAST len should be >= 1 */
2204 if (unlikely(tlen < (hdrsize + extra_bytes)))
2205 goto nack_inv;
2206 /* Don't count the CRC (and padding and LT byte for 16B). */
2207 tlen -= (hdrsize + extra_bytes);
2208 wc.byte_len = tlen + qp->r_rcv_len;
2209 if (unlikely(wc.byte_len > qp->r_len))
2210 goto nack_inv;
2211 hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last);
2212 rvt_put_ss(&qp->r_sge);
2213 qp->r_msn++;
2214 if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
2215 break;
2216 wc.wr_id = qp->r_wr_id;
2217 wc.status = IB_WC_SUCCESS;
2218 if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
2219 opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
2220 wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
2221 else
2222 wc.opcode = IB_WC_RECV;
2223 wc.qp = &qp->ibqp;
2224 wc.src_qp = qp->remote_qpn;
2225 wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr);
2226 /*
2227 * It seems that IB mandates the presence of an SL in a
2228 * work completion only for the UD transport (see section
2229 * 11.4.2 of IBTA Vol. 1).
2230 *
2231 * However, the way the SL is chosen below is consistent
2232 * with the way that IB/qib works and is trying to avoid
2233 * introducing incompatibilities.
2234 *
2235 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
2236 */
2237 wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
2238 /* zero fields that are N/A */
2239 wc.vendor_err = 0;
2240 wc.pkey_index = 0;
2241 wc.dlid_path_bits = 0;
2242 wc.port_num = 0;
2243 /* Signal completion event if the solicited bit is set. */
2244 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
2245 (bth0 & IB_BTH_SOLICITED) != 0);
2246 break;
2247
2248 case OP(RDMA_WRITE_ONLY):
2249 copy_last = rvt_is_user_qp(qp);
2250 /* fall through */
2251 case OP(RDMA_WRITE_FIRST):
2252 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
2253 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
2254 goto nack_inv;
2255 /* consume RWQE */
2256 reth = &ohdr->u.rc.reth;
2257 qp->r_len = be32_to_cpu(reth->length);
2258 qp->r_rcv_len = 0;
2259 qp->r_sge.sg_list = NULL;
2260 if (qp->r_len != 0) {
2261 u32 rkey = be32_to_cpu(reth->rkey);
2262 u64 vaddr = get_ib_reth_vaddr(reth);
2263 int ok;
2264
2265 /* Check rkey & NAK */
2266 ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
2267 rkey, IB_ACCESS_REMOTE_WRITE);
2268 if (unlikely(!ok))
2269 goto nack_acc;
2270 qp->r_sge.num_sge = 1;
2271 } else {
2272 qp->r_sge.num_sge = 0;
2273 qp->r_sge.sge.mr = NULL;
2274 qp->r_sge.sge.vaddr = NULL;
2275 qp->r_sge.sge.length = 0;
2276 qp->r_sge.sge.sge_length = 0;
2277 }
2278 if (opcode == OP(RDMA_WRITE_FIRST))
2279 goto send_middle;
2280 else if (opcode == OP(RDMA_WRITE_ONLY))
2281 goto no_immediate_data;
2282 ret = hfi1_rvt_get_rwqe(qp, 1);
2283 if (ret < 0)
2284 goto nack_op_err;
2285 if (!ret) {
2286 /* peer will send again */
2287 rvt_put_ss(&qp->r_sge);
2288 goto rnr_nak;
2289 }
2290 wc.ex.imm_data = ohdr->u.rc.imm_data;
2291 wc.wc_flags = IB_WC_WITH_IMM;
2292 goto send_last;
2293
2294 case OP(RDMA_READ_REQUEST): {
2295 struct rvt_ack_entry *e;
2296 u32 len;
2297 u8 next;
2298
2299 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2300 goto nack_inv;
2301 next = qp->r_head_ack_queue + 1;
2302 /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
2303 if (next > HFI1_MAX_RDMA_ATOMIC)
2304 next = 0;
2305 spin_lock_irqsave(&qp->s_lock, flags);
2306 if (unlikely(next == qp->s_tail_ack_queue)) {
2307 if (!qp->s_ack_queue[next].sent)
2308 goto nack_inv_unlck;
2309 update_ack_queue(qp, next);
2310 }
2311 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2312 if (e->rdma_sge.mr) {
2313 rvt_put_mr(e->rdma_sge.mr);
2314 e->rdma_sge.mr = NULL;
2315 }
2316 reth = &ohdr->u.rc.reth;
2317 len = be32_to_cpu(reth->length);
2318 if (len) {
2319 u32 rkey = be32_to_cpu(reth->rkey);
2320 u64 vaddr = get_ib_reth_vaddr(reth);
2321 int ok;
2322
2323 /* Check rkey & NAK */
2324 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
2325 rkey, IB_ACCESS_REMOTE_READ);
2326 if (unlikely(!ok))
2327 goto nack_acc_unlck;
2328 /*
2329 * Update the next expected PSN. We add 1 later
2330 * below, so only add the remainder here.
2331 */
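/*
 * e.g. (illustrative, 4096-byte PMTU): an 8192-byte read needs two
 * response packets, so rvt_div_mtu(qp, 8191) adds 1 here and the
 * r_psn++ below covers the last response PSN.
 */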
2332 qp->r_psn += rvt_div_mtu(qp, len - 1);
2333 } else {
2334 e->rdma_sge.mr = NULL;
2335 e->rdma_sge.vaddr = NULL;
2336 e->rdma_sge.length = 0;
2337 e->rdma_sge.sge_length = 0;
2338 }
2339 e->opcode = opcode;
2340 e->sent = 0;
2341 e->psn = psn;
2342 e->lpsn = qp->r_psn;
2343 /*
2344 * We need to increment the MSN here instead of when we
2345 * finish sending the result since a duplicate request would
2346 * increment it more than once.
2347 */
2348 qp->r_msn++;
2349 qp->r_psn++;
2350 qp->r_state = opcode;
2351 qp->r_nak_state = 0;
2352 qp->r_head_ack_queue = next;
2353
2354 /* Schedule the send engine. */
2355 qp->s_flags |= RVT_S_RESP_PENDING;
2356 hfi1_schedule_send(qp);
2357
2358 spin_unlock_irqrestore(&qp->s_lock, flags);
2359 if (is_fecn)
2360 goto send_ack;
2361 return;
2362 }
2363
2364 case OP(COMPARE_SWAP):
2365 case OP(FETCH_ADD): {
2366 struct ib_atomic_eth *ateth;
2367 struct rvt_ack_entry *e;
2368 u64 vaddr;
2369 atomic64_t *maddr;
2370 u64 sdata;
2371 u32 rkey;
2372 u8 next;
2373
2374 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2375 goto nack_inv;
2376 next = qp->r_head_ack_queue + 1;
2377 if (next > HFI1_MAX_RDMA_ATOMIC)
2378 next = 0;
2379 spin_lock_irqsave(&qp->s_lock, flags);
2380 if (unlikely(next == qp->s_tail_ack_queue)) {
2381 if (!qp->s_ack_queue[next].sent)
2382 goto nack_inv_unlck;
2383 update_ack_queue(qp, next);
2384 }
2385 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2386 if (e->rdma_sge.mr) {
2387 rvt_put_mr(e->rdma_sge.mr);
2388 e->rdma_sge.mr = NULL;
2389 }
2390 ateth = &ohdr->u.atomic_eth;
2391 vaddr = get_ib_ateth_vaddr(ateth);
2392 if (unlikely(vaddr & (sizeof(u64) - 1)))
2393 goto nack_inv_unlck;
2394 rkey = be32_to_cpu(ateth->rkey);
2395 /* Check rkey & NAK */
2396 if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2397 vaddr, rkey,
2398 IB_ACCESS_REMOTE_ATOMIC)))
2399 goto nack_acc_unlck;
2400 /* Perform atomic OP and save result. */
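/*
 * atomic_data holds the value at vaddr before the operation:
 * atomic64_add_return() yields the post-add value, so sdata is
 * subtracted back out; cmpxchg() already returns the prior value.
 */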
2401 maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
2402 sdata = get_ib_ateth_swap(ateth);
2403 e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2404 (u64)atomic64_add_return(sdata, maddr) - sdata :
2405 (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
2406 get_ib_ateth_compare(ateth),
2407 sdata);
2408 rvt_put_mr(qp->r_sge.sge.mr);
2409 qp->r_sge.num_sge = 0;
2410 e->opcode = opcode;
2411 e->sent = 0;
2412 e->psn = psn;
2413 e->lpsn = psn;
2414 qp->r_msn++;
2415 qp->r_psn++;
2416 qp->r_state = opcode;
2417 qp->r_nak_state = 0;
2418 qp->r_head_ack_queue = next;
2419
2420 /* Schedule the send engine. */
2421 qp->s_flags |= RVT_S_RESP_PENDING;
2422 hfi1_schedule_send(qp);
2423
2424 spin_unlock_irqrestore(&qp->s_lock, flags);
2425 if (is_fecn)
2426 goto send_ack;
2427 return;
2428 }
2429
2430 default:
2431 /* NAK unknown opcodes. */
2432 goto nack_inv;
2433 }
2434 qp->r_psn++;
2435 qp->r_state = opcode;
2436 qp->r_ack_psn = psn;
2437 qp->r_nak_state = 0;
2438 /* Send an ACK if requested or required. */
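/*
 * ACK coalescing: up to HFI1_PSN_CREDIT packets may have their ACKs
 * deferred via r_adefered; a zero packet count, a FECN, or reaching
 * the credit limit forces an immediate ACK instead.
 */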
2439 if (psn & IB_BTH_REQ_ACK) {
2440 if (packet->numpkt == 0) {
2441 rc_cancel_ack(qp);
2442 goto send_ack;
2443 }
2444 if (qp->r_adefered >= HFI1_PSN_CREDIT) {
2445 rc_cancel_ack(qp);
2446 goto send_ack;
2447 }
2448 if (unlikely(is_fecn)) {
2449 rc_cancel_ack(qp);
2450 goto send_ack;
2451 }
2452 qp->r_adefered++;
2453 rc_defered_ack(rcd, qp);
2454 }
2455 return;
2456
2457 rnr_nak:
2458 qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
2459 qp->r_ack_psn = qp->r_psn;
2460 /* Queue RNR NAK for later */
2461 rc_defered_ack(rcd, qp);
2462 return;
2463
2464 nack_op_err:
2465 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2466 qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2467 qp->r_ack_psn = qp->r_psn;
2468 /* Queue NAK for later */
2469 rc_defered_ack(rcd, qp);
2470 return;
2471
2472 nack_inv_unlck:
2473 spin_unlock_irqrestore(&qp->s_lock, flags);
2474 nack_inv:
2475 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2476 qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2477 qp->r_ack_psn = qp->r_psn;
2478 /* Queue NAK for later */
2479 rc_defered_ack(rcd, qp);
2480 return;
2481
2482 nack_acc_unlck:
2483 spin_unlock_irqrestore(&qp->s_lock, flags);
2484 nack_acc:
2485 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2486 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2487 qp->r_ack_psn = qp->r_psn;
2488 send_ack:
2489 hfi1_send_rc_ack(rcd, qp, is_fecn);
2490 }
2491
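/**
 * hfi1_rc_hdrerr - handle an RC packet that could not be processed normally
 * @rcd: the receive context
 * @packet: data packet information
 * @qp: the QP for this packet
 *
 * For request opcodes, queue a deferred PSN-error NAK if the packet is at or
 * beyond the expected PSN and no NAK is already outstanding, so that the
 * requester retransmits.
 */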
2492 void hfi1_rc_hdrerr(
2493 struct hfi1_ctxtdata *rcd,
2494 struct hfi1_packet *packet,
2495 struct rvt_qp *qp)
2496 {
2497 struct hfi1_ibport *ibp = rcd_to_iport(rcd);
2498 int diff;
2499 u32 opcode;
2500 u32 psn;
2501
2502 if (hfi1_ruc_check_hdr(ibp, packet))
2503 return;
2504
2505 psn = ib_bth_get_psn(packet->ohdr);
2506 opcode = ib_bth_get_opcode(packet->ohdr);
2507
2508 /* Only deal with RDMA Writes for now */
2509 if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
2510 diff = delta_psn(psn, qp->r_psn);
2511 if (!qp->r_nak_state && diff >= 0) {
2512 ibp->rvp.n_rc_seqnak++;
2513 qp->r_nak_state = IB_NAK_PSN_ERROR;
2514 /* Use the expected PSN. */
2515 qp->r_ack_psn = qp->r_psn;
2516 /*
2517 * Wait to send the sequence
2518 * NAK until all packets
2519 * in the receive queue have
2520 * been processed.
2521 * Otherwise, we end up
2522 * propagating congestion.
2523 */
2524 rc_defered_ack(rcd, qp);
2525 } /* Out of sequence NAK */
2526 } /* QP Request NAKs */
2527 }
2528