// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */

#include <linux/smp.h>
#include "dr_types.h"

#define QUEUE_SIZE 128
#define SIGNAL_PER_DIV_QUEUE 16
#define TH_NUMS_TO_DRAIN 2

enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

struct dr_data_seg {
	u64 addr;
	u32 length;
	u32 lkey;
	unsigned int send_flags;
};

struct postsend_info {
	struct dr_data_seg write;
	struct dr_data_seg read;
	u64 remote_addr;
	u32 rkey;
};

struct dr_qp_rtr_attr {
	struct mlx5dr_cmd_gid_attr dgid_attr;
	enum ib_mtu mtu;
	u32 qp_num;
	u16 port_num;
	u8 min_rnr_timer;
	u8 sgid_index;
	u16 udp_src_port;
	u8 fl:1;
};

struct dr_qp_rts_attr {
	u8 timeout;
	u8 retry_cnt;
	u8 rnr_retry;
};

struct dr_qp_init_attr {
	u32 cqn;
	u32 pdn;
	u32 max_send_wr;
	struct mlx5_uars_page *uar;
	u8 isolate_vl_tc:1;
};

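/* Consume a single CQE: on a successful send completion advance the SQ
 * consumer counter past the corresponding WQE and return CQ_OK; on an
 * error CQE update the counter and return CQ_POLL_ERR.
 */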
static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
{
	unsigned int idx;
	u8 opcode;

	opcode = get_cqe_opcode(cqe64);
	if (opcode == MLX5_CQE_REQ_ERR) {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
	} else if (opcode == MLX5_CQE_RESP_ERR) {
		++dr_cq->qp->sq.cc;
	} else {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;

		return CQ_OK;
	}

	return CQ_POLL_ERR;
}

static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
{
	struct mlx5_cqe64 *cqe64;
	int err;

	cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
	if (!cqe64)
		return CQ_EMPTY;

	mlx5_cqwq_pop(&dr_cq->wq);
	err = dr_parse_cqe(dr_cq, cqe64);
	mlx5_cqwq_update_db_record(&dr_cq->wq);

	return err;
}

static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
{
	int npolled;
	int err = 0;

	for (npolled = 0; npolled < ne; ++npolled) {
		err = dr_cq_poll_one(dr_cq);
		if (err != CQ_OK)
			break;
	}

	return err == CQ_POLL_ERR ? err : npolled;
}

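/* Allocate the RC QP used for writing to ICM: create the SQ/RQ work queue
 * buffer, the wqe_head bookkeeping array and issue the CREATE_QP command.
 * Returns NULL on failure.
 */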
static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
					 struct dr_qp_init_attr *attr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
	struct mlx5_wq_param wqp;
	struct mlx5dr_qp *dr_qp;
	int inlen;
	void *qpc;
	void *in;
	int err;

	dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
	if (!dr_qp)
		return NULL;

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	dr_qp->rq.pc = 0;
	dr_qp->rq.cc = 0;
	dr_qp->rq.wqe_cnt = 4;
	dr_qp->sq.pc = 0;
	dr_qp->sq.cc = 0;
	dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);

	MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
				&dr_qp->wq_ctrl);
	if (err) {
		mlx5_core_warn(mdev, "Can't create QP WQ\n");
		goto err_wq;
	}

	dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
				     sizeof(dr_qp->sq.wqe_head[0]),
				     GFP_KERNEL);

	if (!dr_qp->sq.wqe_head) {
		mlx5_core_warn(mdev, "Can't allocate wqe head\n");
		goto err_wqe_head;
	}

	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		dr_qp->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc);
	MLX5_SET(qpc, qpc, pd, attr->pdn);
	MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, fre, 1);
	MLX5_SET(qpc, qpc, rlky, 1);
	MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
	MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
	MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
	MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
				  (__be64 *)MLX5_ADDR_OF(create_qp_in,
							 in, pas));

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	kvfree(in);
	if (err)
		goto err_in;
	dr_qp->uar = attr->uar;

	return dr_qp;

err_in:
	kfree(dr_qp->sq.wqe_head);
err_wqe_head:
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
err_wq:
	kfree(dr_qp);
	return NULL;
}

static void dr_destroy_qp(struct mlx5_core_dev *mdev,
			  struct mlx5dr_qp *dr_qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	kfree(dr_qp->sq.wqe_head);
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
	kfree(dr_qp);
}

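/* Ring the doorbell for the posted WQEs: publish the new SQ producer counter
 * in the doorbell record and then write the WQE control segment to the UAR,
 * making the posted work visible to the HW.
 */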
static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
{
	dma_wmb();
	*dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xffff);

	/* After the wmb() the HW is aware of the new work */
	wmb();

	mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
}

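/* Build one RDMA WQE on the SQ: a control segment followed by a remote
 * address segment and a single data segment. The WQE index is recorded in
 * wqe_head so completions can be matched back to it; the doorbell is rung
 * only when notify_hw is set.
 */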
static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
			     u32 rkey, struct dr_data_seg *data_seg,
			     u32 opcode, bool notify_hw)
{
	struct mlx5_wqe_raddr_seg *wq_raddr;
	struct mlx5_wqe_ctrl_seg *wq_ctrl;
	struct mlx5_wqe_data_seg *wq_dseg;
	unsigned int size;
	unsigned int idx;

	size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
		sizeof(*wq_raddr) / 16;

	idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);

	wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
	wq_ctrl->imm = 0;
	wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
		MLX5_WQE_CTRL_CQ_UPDATE : 0;
	wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
						opcode);
	wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8);
	wq_raddr = (void *)(wq_ctrl + 1);
	wq_raddr->raddr = cpu_to_be64(remote_addr);
	wq_raddr->rkey = cpu_to_be32(rkey);
	wq_raddr->reserved = 0;

	wq_dseg = (void *)(wq_raddr + 1);
	wq_dseg->byte_count = cpu_to_be32(data_seg->length);
	wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
	wq_dseg->addr = cpu_to_be64(data_seg->addr);

	dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;

	if (notify_hw)
		dr_cmd_notify_hw(dr_qp, wq_ctrl);
}

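/* Each postsend_info is posted as an RDMA_WRITE to the remote ICM address
 * followed by an RDMA_READ of the same area; only the second WQE rings the
 * doorbell.
 */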
static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
{
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->write, MLX5_OPCODE_RDMA_WRITE, false);
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->read, MLX5_OPCODE_RDMA_READ, true);
}

/**
 * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent
 * with send_list parameters:
 *
 * @ste: The STE to which this data is attached
 * @size: Size of the data to write
 * @offset: Offset of the data from the start of the hw_ste entry
 * @data: Data
 * @ste_info: STE info to be sent with send_list
 * @send_list: The list to append to
 * @copy_data: If true, the data is copied and kept, because it is not
 *             backed up anywhere else (e.g. during re-hash).
 *             If false, the data may still be updated after it has been
 *             added to the list.
 */
void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
					       u16 offset, u8 *data,
					       struct mlx5dr_ste_send_info *ste_info,
					       struct list_head *send_list,
					       bool copy_data)
{
	ste_info->size = size;
	ste_info->ste = ste;
	ste_info->offset = offset;

	if (copy_data) {
		memcpy(ste_info->data_cont, data, size);
		ste_info->data = ste_info->data_cont;
	} else {
		ste_info->data = data;
	}

	list_add_tail(&ste_info->send_list, send_list);
}

/* The function tries to consume one wc each time. However, if the queue is
 * full (i.e. the HW is a full queue length behind the SW), the function
 * drains the CQ until it is empty.
 */
static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
				struct mlx5dr_send_ring *send_ring)
{
	bool is_drain = false;
	int ne;

	if (send_ring->pending_wqe < send_ring->signal_th)
		return 0;

	/* The queue is full, start draining it */
	if (send_ring->pending_wqe >=
	    dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
		is_drain = true;

	do {
		ne = dr_poll_cq(send_ring->cq, 1);
		if (unlikely(ne < 0)) {
			mlx5_core_warn_once(dmn->mdev, "SMFS QPN 0x%x is disabled/limited",
					    send_ring->qp->qpn);
			send_ring->err_state = true;
			return ne;
		} else if (ne == 1) {
			send_ring->pending_wqe -= send_ring->signal_th;
		}
	} while (is_drain && send_ring->pending_wqe);

	return 0;
}

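/* Account for the write and read WQEs of a post and request a signaled
 * completion (IB_SEND_SIGNALED) once every signal_th WQEs, so that pending
 * work can be reclaimed in batches by dr_handle_pending_wc().
 */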
static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
			      struct postsend_info *send_info)
{
	send_ring->pending_wqe++;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->write.send_flags |= IB_SEND_SIGNALED;

	send_ring->pending_wqe++;
	send_info->read.length = send_info->write.length;
	/* Read into the same write area */
	send_info->read.addr = (uintptr_t)send_info->write.addr;
	send_info->read.lkey = send_ring->mr->mkey;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->read.send_flags = IB_SEND_SIGNALED;
	else
		send_info->read.send_flags = 0;
}

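/* Post one postsend_info to the device: reclaim completed WQEs first, copy
 * the payload into the registered ring buffer when it exceeds the max inline
 * size, and then post the RDMA write/read pair.
 */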
static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
				struct postsend_info *send_info)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	u32 buff_offset;
	int ret;

	if (unlikely(dmn->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR ||
		     send_ring->err_state)) {
		mlx5_core_dbg_once(dmn->mdev,
				   "Skipping post send: QP err state: %d, device state: %d\n",
				   send_ring->err_state, dmn->mdev->state);
		return 0;
	}

	spin_lock(&send_ring->lock);

	ret = dr_handle_pending_wc(dmn, send_ring);
	if (ret)
		goto out_unlock;

	if (send_info->write.length > dmn->info.max_inline_size) {
		buff_offset = (send_ring->tx_head &
			       (dmn->send_ring->signal_th - 1)) *
			      send_ring->max_post_send_size;
		/* Copy to ring mr */
		memcpy(send_ring->buf + buff_offset,
		       (void *)(uintptr_t)send_info->write.addr,
		       send_info->write.length);
		send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
		send_info->write.lkey = send_ring->mr->mkey;
	}

	send_ring->tx_head++;
	dr_fill_data_segs(send_ring, send_info);
	dr_post_send(send_ring->qp, send_info);

out_unlock:
	spin_unlock(&send_ring->lock);
	return ret;
}

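/* Work out how to copy a whole hash table chunk to ICM: split the copy into
 * iterations bounded by max_post_send_size and allocate a staging buffer of
 * the per-iteration size.
 */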
static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
				   struct mlx5dr_ste_htbl *htbl,
				   u8 **data,
				   u32 *byte_size,
				   int *iterations,
				   int *num_stes)
{
	u32 chunk_byte_size = mlx5dr_icm_pool_get_chunk_byte_size(htbl->chunk);
	int alloc_size;

	if (chunk_byte_size > dmn->send_ring->max_post_send_size) {
		*iterations = chunk_byte_size / dmn->send_ring->max_post_send_size;
		*byte_size = dmn->send_ring->max_post_send_size;
		alloc_size = *byte_size;
		*num_stes = *byte_size / DR_STE_SIZE;
	} else {
		*iterations = 1;
		*num_stes = mlx5dr_icm_pool_get_chunk_num_of_entries(htbl->chunk);
		alloc_size = *num_stes * DR_STE_SIZE;
	}

	*data = kvzalloc(alloc_size, GFP_KERNEL);
	if (!*data)
		return -ENOMEM;

	return 0;
}

/**
 * mlx5dr_send_postsend_ste: write size bytes into offset from the hw ICM.
 *
 * @dmn: Domain
 * @ste: The STE structure that contains the data (at least part of it)
 * @data: The data to write
 * @size: Number of bytes to write
 * @offset: Offset from the start of the ICM-mapped entry at which to start
 *          writing; allows writing only part of the buffer.
 *
 * Return: 0 on success.
 */
int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
			     u8 *data, u16 size, u16 offset)
{
	struct postsend_info send_info = {};

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size);

	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = size;
	send_info.write.lkey = 0;
	send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
	send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(ste->htbl->chunk);

	return dr_postsend_icm_data(dmn, &send_info);
}

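/* Write an entire hash table chunk to ICM. Unused entries are written as the
 * given formatted STE; used entries are rebuilt from the reduced copy in
 * hw_ste_arr plus the bit mask.
 */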
int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
			      struct mlx5dr_ste_htbl *htbl,
			      u8 *formatted_ste, u8 *mask)
{
	u32 byte_size = mlx5dr_icm_pool_get_chunk_byte_size(htbl->chunk);
	int num_stes_per_iter;
	int iterations;
	u8 *data;
	int ret;
	int i;
	int j;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes_per_iter);
	if (ret)
		return ret;

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, formatted_ste, DR_STE_SIZE);

	/* Send the data 'iterations' times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		/* Copy all the STEs into the data buffer;
		 * the bit_mask needs to be added.
		 */
		for (j = 0; j < num_stes_per_iter; j++) {
			struct mlx5dr_ste *ste = &htbl->chunk->ste_arr[ste_index + j];
			u32 ste_off = j * DR_STE_SIZE;

			if (mlx5dr_ste_is_not_used(ste)) {
				memcpy(data + ste_off,
				       formatted_ste, DR_STE_SIZE);
			} else {
				/* Copy data */
				memcpy(data + ste_off,
				       htbl->chunk->hw_ste_arr +
				       DR_STE_SIZE_REDUCED * (ste_index + j),
				       DR_STE_SIZE_REDUCED);
				/* Copy bit_mask */
				memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
				       mask, DR_STE_SIZE_MASK);
				/* Only when we have a mask do we need to re-arrange the STE */
				mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx,
								data + (j * DR_STE_SIZE),
								DR_STE_SIZE);
			}
		}

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->chunk->ste_arr + ste_index);
		send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(htbl->chunk);

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kvfree(data);
	return ret;
}

/* Initialize htbl with default STEs */
int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
					struct mlx5dr_ste_htbl *htbl,
					u8 *ste_init_data,
					bool update_hw_ste)
{
	u32 byte_size = mlx5dr_icm_pool_get_chunk_byte_size(htbl->chunk);
	int iterations;
	int num_stes;
	u8 *copy_dst;
	u8 *data;
	int ret;
	int i;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes);
	if (ret)
		return ret;

	if (update_hw_ste) {
		/* Copy the reduced STE to the hash table's hw_ste_arr */
		for (i = 0; i < num_stes; i++) {
			copy_dst = htbl->chunk->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
			memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
		}
	}

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE);

	/* Copy the same STE across the data buffer */
	for (i = 0; i < num_stes; i++) {
		copy_dst = data + i * DR_STE_SIZE;
		memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
	}

	/* Send the data 'iterations' times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->chunk->ste_arr + ste_index);
		send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(htbl->chunk);

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kvfree(data);
	return ret;
}

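/* Write the modify actions data of a rewrite action to its ICM chunk */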
int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
				struct mlx5dr_action *action)
{
	struct postsend_info send_info = {};
	int ret;

	send_info.write.addr = (uintptr_t)action->rewrite->data;
	send_info.write.length = action->rewrite->num_of_actions *
				 DR_MODIFY_ACTION_SIZE;
	send_info.write.lkey = 0;
	send_info.remote_addr =
		mlx5dr_icm_pool_get_chunk_mr_addr(action->rewrite->chunk);
	send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(action->rewrite->chunk);

	ret = dr_postsend_icm_data(dmn, &send_info);

	return ret;
}

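/* Move the QP from RESET to INIT and enable remote read/write access on it */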
static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
				 struct mlx5dr_qp *dr_qp,
				 int port)
{
	u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);

	MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, rst2init_qp, in);
}

static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev,
				    struct mlx5dr_qp *dr_qp,
				    struct dr_qp_rts_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);

	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);

	MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
	MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);
	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */

	MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in);
}

static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
				     struct mlx5dr_qp *dr_qp,
				     struct dr_qp_rtr_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);

	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);

	MLX5_SET(qpc, qpc, mtu, attr->mtu);
	MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1);
	MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num);
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
	       attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac));
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
	       attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid));
	MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
		 attr->sgid_index);

	if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2)
		MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
			 attr->udp_src_port);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
	MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);

	MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, init2rtr_qp, in);
}

static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps)
{
	/* Check whether RC RoCE QP creation with force loopback is allowed.
	 * There are two separate capability bits for this:
	 * - force loopback when RoCE is enabled
	 * - force loopback when RoCE is disabled
	 */
	return ((caps->roce_caps.roce_en &&
		 caps->roce_caps.fl_rc_qp_when_roce_enabled) ||
		(!caps->roce_caps.roce_en &&
		 caps->roce_caps.fl_rc_qp_when_roce_disabled));
}

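/* Connect the send ring QP to itself (loopback) by walking it through the
 * RST -> INIT -> RTR -> RTS state transitions.
 */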
static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
	struct dr_qp_rts_attr rts_attr = {};
	struct dr_qp_rtr_attr rtr_attr = {};
	enum ib_mtu mtu = IB_MTU_1024;
	u16 gid_index = 0;
	int port = 1;
	int ret;

	/* Init */
	ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rst2init\n");
		return ret;
	}

	/* RTR */
	rtr_attr.mtu = mtu;
	rtr_attr.qp_num = dr_qp->qpn;
	rtr_attr.min_rnr_timer = 12;
	rtr_attr.port_num = port;
	rtr_attr.udp_src_port = dmn->info.caps.roce_min_src_udp;

	/* If QP creation with force loopback is allowed, then there
	 * is no need for GID index when creating the QP.
	 * Otherwise we query GID attributes and use GID index.
	 */
	rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps);
	if (!rtr_attr.fl) {
		ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index,
					   &rtr_attr.dgid_attr);
		if (ret)
			return ret;

		rtr_attr.sgid_index = gid_index;
	}

	ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
		return ret;
	}

	/* RTS */
	rts_attr.timeout = 14;
	rts_attr.retry_cnt = 7;
	rts_attr.rnr_retry = 7;

	ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rtr2rts\n");
		return ret;
	}

	return 0;
}

static void dr_cq_complete(struct mlx5_core_cq *mcq,
			   struct mlx5_eqe *eqe)
{
	pr_err("CQ completion CQ: #%u\n", mcq->cqn);
}

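/* Create the completion queue used by the send ring. The CQ is used in
 * polling mode, so the completion handler only reports unexpected events.
 */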
static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
				      struct mlx5_uars_page *uar,
				      size_t ncqe)
{
	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {};
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_wq_param wqp;
	struct mlx5_cqe64 *cqe;
	struct mlx5dr_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;
	u32 i;

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		return NULL;

	ncqe = roundup_pow_of_two(ncqe);
	MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe));

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq,
			       &cq->wq_ctrl);
	if (err)
		goto out;

	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
		cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
		cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
	}

	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		sizeof(u64) * cq->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		goto err_cqwq;

	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn);
	if (err) {
		kvfree(in);
		goto err_cqwq;
	}

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);

	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);

	cq->mcq.comp = dr_cq_complete;

	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	kvfree(in);

	if (err)
		goto err_cqwq;

	cq->mcq.cqe_sz = 64;
	cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
	cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
	*cq->mcq.set_ci_db = 0;

	/* Set a non-zero value in order to prevent the HW from running
	 * db-recovery on a CQ that is used in polling mode.
	 */
	*cq->mcq.arm_db = cpu_to_be32(2 << 28);

	cq->mcq.vector = 0;
	cq->mcq.uar = uar;

	return cq;

err_cqwq:
	mlx5_wq_destroy(&cq->wq_ctrl);
out:
	kfree(cq);
	return NULL;
}

static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_wq_destroy(&cq->wq_ctrl);
	kfree(cq);
}

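/* Create an mkey in PA (physical address) access mode that spans the whole
 * address range (length64) and allows local and remote read/write; it is
 * used to reference the DMA-mapped send ring buffers.
 */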
static int dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey)
{
	u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, a, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);

	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in));
}

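/* DMA-map the given buffer and create an mkey for it, so the buffer can be
 * referenced by the QP's data segments.
 */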
static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev,
				   u32 pdn, void *buf, size_t size)
{
	struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	struct device *dma_device;
	dma_addr_t dma_addr;
	int err;

	if (!mr)
		return NULL;

	dma_device = mlx5_core_dma_dev(mdev);
	dma_addr = dma_map_single(dma_device, buf, size,
				  DMA_BIDIRECTIONAL);
	err = dma_mapping_error(dma_device, dma_addr);
	if (err) {
		mlx5_core_warn(mdev, "Can't dma buf\n");
		kfree(mr);
		return NULL;
	}

	err = dr_create_mkey(mdev, pdn, &mr->mkey);
	if (err) {
		mlx5_core_warn(mdev, "Can't create mkey\n");
		dma_unmap_single(dma_device, dma_addr, size,
				 DMA_BIDIRECTIONAL);
		kfree(mr);
		return NULL;
	}

	mr->dma_addr = dma_addr;
	mr->size = size;
	mr->addr = buf;

	return mr;
}

static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr)
{
	mlx5_core_destroy_mkey(mdev, mr->mkey);
	dma_unmap_single(mlx5_core_dma_dev(mdev), mr->dma_addr, mr->size,
			 DMA_BIDIRECTIONAL);
	kfree(mr);
}

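/* Allocate the domain's send ring: create the CQ and the loopback RC QP,
 * bring the QP to RTS, and register the staging buffer and the sync buffer
 * used for drain/read-back.
 */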
int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
{
	struct dr_qp_init_attr init_attr = {};
	int cq_size;
	int size;
	int ret;

	dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL);
	if (!dmn->send_ring)
		return -ENOMEM;

	cq_size = QUEUE_SIZE + 1;
	dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size);
	if (!dmn->send_ring->cq) {
		mlx5dr_err(dmn, "Failed creating CQ\n");
		ret = -ENOMEM;
		goto free_send_ring;
	}

	init_attr.cqn = dmn->send_ring->cq->mcq.cqn;
	init_attr.pdn = dmn->pdn;
	init_attr.uar = dmn->uar;
	init_attr.max_send_wr = QUEUE_SIZE;

	/* Isolated VL is applicable only if force loopback is supported */
	if (dr_send_allow_fl(&dmn->info.caps))
		init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc;

	spin_lock_init(&dmn->send_ring->lock);

	dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
	if (!dmn->send_ring->qp) {
		mlx5dr_err(dmn, "Failed creating QP\n");
		ret = -ENOMEM;
		goto clean_cq;
	}

	dmn->send_ring->cq->qp = dmn->send_ring->qp;

	dmn->info.max_send_wr = QUEUE_SIZE;
	dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
					DR_STE_SIZE);

	dmn->send_ring->signal_th = dmn->info.max_send_wr /
		SIGNAL_PER_DIV_QUEUE;

	/* Prepare qp to be used */
	ret = dr_prepare_qp_to_rts(dmn);
	if (ret)
		goto clean_qp;

	dmn->send_ring->max_post_send_size =
		mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K,
						   DR_ICM_TYPE_STE);

	/* Allocating the max size as a buffer for writing */
	size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
	dmn->send_ring->buf = kzalloc(size, GFP_KERNEL);
	if (!dmn->send_ring->buf) {
		ret = -ENOMEM;
		goto clean_qp;
	}

	dmn->send_ring->buf_size = size;

	dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
				       dmn->pdn, dmn->send_ring->buf, size);
	if (!dmn->send_ring->mr) {
		ret = -ENOMEM;
		goto free_mem;
	}

	dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev,
					    dmn->pdn, dmn->send_ring->sync_buff,
					    MIN_READ_SYNC);
	if (!dmn->send_ring->sync_mr) {
		ret = -ENOMEM;
		goto clean_mr;
	}

	return 0;

clean_mr:
	dr_dereg_mr(dmn->mdev, dmn->send_ring->mr);
free_mem:
	kfree(dmn->send_ring->buf);
clean_qp:
	dr_destroy_qp(dmn->mdev, dmn->send_ring->qp);
clean_cq:
	dr_destroy_cq(dmn->mdev, dmn->send_ring->cq);
free_send_ring:
	kfree(dmn->send_ring);

	return ret;
}

void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn,
			   struct mlx5dr_send_ring *send_ring)
{
	dr_destroy_qp(dmn->mdev, send_ring->qp);
	dr_destroy_cq(dmn->mdev, send_ring->cq);
	dr_dereg_mr(dmn->mdev, send_ring->sync_mr);
	dr_dereg_mr(dmn->mdev, send_ring->mr);
	kfree(send_ring->buf);
	kfree(send_ring);
}

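/* Flush all outstanding work on the send ring: post enough dummy RDMA
 * requests to guarantee a signaled completion, then poll the CQ until the
 * pending WQEs have been reclaimed.
 */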
int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	struct postsend_info send_info = {};
	u8 data[DR_STE_SIZE];
	int num_of_sends_req;
	int ret;
	int i;

	/* Sending this number of requests makes sure we will get a drain */
	num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;

	/* Send fake requests, forcing the last one to be signaled */
	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = DR_STE_SIZE;
	send_info.write.lkey = 0;
	/* Using the sync_mr in order to write/read */
	send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
	send_info.rkey = send_ring->sync_mr->mkey;

	for (i = 0; i < num_of_sends_req; i++) {
		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			return ret;
	}

	spin_lock(&send_ring->lock);
	ret = dr_handle_pending_wc(dmn, send_ring);
	spin_unlock(&send_ring->lock);

	return ret;
}