1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6 #include "cmd.h"
7
/*
 * Small set of status codes: zero for success, negative values for the
 * "empty" and "poll error" conditions. The names suggest they describe
 * completion-queue polling results; the users are outside this view.
 */
enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
};
9
10 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
11 u16 *vhca_id);
12 static void
13 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
14
/*
 * Issue the SUSPEND_VHCA command for this device's vhca_id, using the
 * caller-supplied op_mod.
 *
 * The caller must hold mvdev->state_mutex. Returns -ENOTCONN when the
 * device is flagged as detached, otherwise the command execution result.
 */
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 cmd_in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	u32 cmd_out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};

	lockdep_assert_held(&mvdev->state_mutex);

	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(suspend_vhca_in, cmd_in, op_mod, op_mod);
	MLX5_SET(suspend_vhca_in, cmd_in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, cmd_in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);

	return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, cmd_in, cmd_out);
}
30
/*
 * Issue the RESUME_VHCA command for this device's vhca_id, using the
 * caller-supplied op_mod.
 *
 * The caller must hold mvdev->state_mutex. Returns -ENOTCONN when the
 * device is flagged as detached, otherwise the command execution result.
 */
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 cmd_in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
	u32 cmd_out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};

	lockdep_assert_held(&mvdev->state_mutex);

	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, cmd_in, op_mod, op_mod);
	MLX5_SET(resume_vhca_in, cmd_in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, cmd_in, opcode, MLX5_CMD_OP_RESUME_VHCA);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, cmd_in, cmd_out);
}
46
/*
 * Execute QUERY_VHCA_MIGRATION_STATE for this device's vhca_id.
 *
 * The caller must hold mvdev->state_mutex. On success the
 * required_umem_size field of the reply is stored in *state_size and 0
 * is returned. Returns -ENOTCONN when the device is flagged as detached,
 * or the command error; *state_size is left untouched on failure.
 */
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size)
{
	u32 cmd_in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	u32 cmd_out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);

	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(query_vhca_migration_state_in, cmd_in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, cmd_in, vhca_id,
		 mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, cmd_in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);

	err = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state,
				  cmd_in, cmd_out);
	if (!err)
		*state_size = MLX5_GET(query_vhca_migration_state_out, cmd_out,
				       required_umem_size);

	return err;
}
72
set_tracker_error(struct mlx5vf_pci_core_device * mvdev)73 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
74 {
75 /* Mark the tracker under an error and wake it up if it's running */
76 mvdev->tracker.is_err = true;
77 complete(&mvdev->tracker_comp);
78 }
79
mlx5fv_vf_event(struct notifier_block * nb,unsigned long event,void * data)80 static int mlx5fv_vf_event(struct notifier_block *nb,
81 unsigned long event, void *data)
82 {
83 struct mlx5vf_pci_core_device *mvdev =
84 container_of(nb, struct mlx5vf_pci_core_device, nb);
85
86 switch (event) {
87 case MLX5_PF_NOTIFY_ENABLE_VF:
88 mutex_lock(&mvdev->state_mutex);
89 mvdev->mdev_detach = false;
90 mlx5vf_state_mutex_unlock(mvdev);
91 break;
92 case MLX5_PF_NOTIFY_DISABLE_VF:
93 mlx5vf_cmd_close_migratable(mvdev);
94 mutex_lock(&mvdev->state_mutex);
95 mvdev->mdev_detach = true;
96 mlx5vf_state_mutex_unlock(mvdev);
97 break;
98 default:
99 break;
100 }
101
102 return 0;
103 }
104
mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device * mvdev)105 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
106 {
107 if (!mvdev->migrate_cap)
108 return;
109
110 /* Must be done outside the lock to let it progress */
111 set_tracker_error(mvdev);
112 mutex_lock(&mvdev->state_mutex);
113 mlx5vf_disable_fds(mvdev);
114 _mlx5vf_free_page_tracker_resources(mvdev);
115 mlx5vf_state_mutex_unlock(mvdev);
116 }
117
mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device * mvdev)118 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
119 {
120 if (!mvdev->migrate_cap)
121 return;
122
123 mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
124 &mvdev->nb);
125 destroy_workqueue(mvdev->cb_wq);
126 }
127
mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device * mvdev,const struct vfio_migration_ops * mig_ops,const struct vfio_log_ops * log_ops)128 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
129 const struct vfio_migration_ops *mig_ops,
130 const struct vfio_log_ops *log_ops)
131 {
132 struct pci_dev *pdev = mvdev->core_device.pdev;
133 int ret;
134
135 if (!pdev->is_virtfn)
136 return;
137
138 mvdev->mdev = mlx5_vf_get_core_dev(pdev);
139 if (!mvdev->mdev)
140 return;
141
142 if (!MLX5_CAP_GEN(mvdev->mdev, migration))
143 goto end;
144
145 mvdev->vf_id = pci_iov_vf_id(pdev);
146 if (mvdev->vf_id < 0)
147 goto end;
148
149 if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
150 &mvdev->vhca_id))
151 goto end;
152
153 mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
154 if (!mvdev->cb_wq)
155 goto end;
156
157 mutex_init(&mvdev->state_mutex);
158 spin_lock_init(&mvdev->reset_lock);
159 mvdev->nb.notifier_call = mlx5fv_vf_event;
160 ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
161 &mvdev->nb);
162 if (ret) {
163 destroy_workqueue(mvdev->cb_wq);
164 goto end;
165 }
166
167 mvdev->migrate_cap = 1;
168 mvdev->core_device.vdev.migration_flags =
169 VFIO_MIGRATION_STOP_COPY |
170 VFIO_MIGRATION_P2P;
171 mvdev->core_device.vdev.mig_ops = mig_ops;
172 init_completion(&mvdev->tracker_comp);
173 if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
174 mvdev->core_device.vdev.log_ops = log_ops;
175
176 end:
177 mlx5_vf_put_core_dev(mvdev->mdev);
178 }
179
/*
 * Read the vhca_id of another function via QUERY_HCA_CAP (current
 * general device capabilities, other_function = 1).
 *
 * Returns 0 and fills *vhca_id on success, -ENOMEM if the reply buffer
 * cannot be allocated, or the command error.
 */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id)
{
	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
	int caps_len = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	void *caps;
	int err;

	caps = kzalloc(caps_len, GFP_KERNEL);
	if (!caps)
		return -ENOMEM;

	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
	MLX5_SET(query_hca_cap_in, in, other_function, 1);
	MLX5_SET(query_hca_cap_in, in, op_mod,
		 (MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1) |
		 HCA_CAP_OPMOD_GET_CUR);

	err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, caps);
	if (!err)
		*vhca_id = MLX5_GET(query_hca_cap_out, caps,
				    capability.cmd_hca_cap.vhca_id);

	kfree(caps);
	return err;
}
211
/*
 * Create an MTT-mode mkey in protection domain @pdn.
 *
 * Exactly one source describes the pages: when @migf is non-NULL the DMA
 * pages of migf->table.sgt are used and the mkey length is
 * migf->total_length; otherwise @recv_buf supplies npages DMA addresses
 * and the length is npages * PAGE_SIZE.
 *
 * Returns 0 with *mkey filled by mlx5_core_create_mkey(), -ENOMEM if the
 * command buffer cannot be allocated, or the command error.
 */
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vf_migration_file *migf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) :
			       recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	/* MTT entries are counted in octwords, i.e. pairs of entries */
	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));

	/* Fill the translation table with one DMA address per page */
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (!migf) {
		int idx;

		for (idx = 0; idx < npages; idx++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[idx]);
	} else {
		struct sg_dma_page_iter dma_iter;

		for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	}

	/* Memory key context: local and remote read/write access */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len,
		   migf ? migf->total_length : (npages * PAGE_SIZE));

	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}
264
mlx5vf_mig_file_cleanup_cb(struct work_struct * _work)265 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
266 {
267 struct mlx5vf_async_data *async_data = container_of(_work,
268 struct mlx5vf_async_data, work);
269 struct mlx5_vf_migration_file *migf = container_of(async_data,
270 struct mlx5_vf_migration_file, async_data);
271 struct mlx5_core_dev *mdev = migf->mvdev->mdev;
272
273 mutex_lock(&migf->lock);
274 if (async_data->status) {
275 migf->is_err = true;
276 wake_up_interruptible(&migf->poll_wait);
277 }
278 mutex_unlock(&migf->lock);
279
280 mlx5_core_destroy_mkey(mdev, async_data->mkey);
281 dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
282 mlx5_core_dealloc_pd(mdev, async_data->pdn);
283 kvfree(async_data->out);
284 fput(migf->filp);
285 }
286
mlx5vf_save_callback(int status,struct mlx5_async_work * context)287 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
288 {
289 struct mlx5vf_async_data *async_data = container_of(context,
290 struct mlx5vf_async_data, cb_work);
291 struct mlx5_vf_migration_file *migf = container_of(async_data,
292 struct mlx5_vf_migration_file, async_data);
293
294 if (!status) {
295 WRITE_ONCE(migf->total_length,
296 MLX5_GET(save_vhca_state_out, async_data->out,
297 actual_image_size));
298 wake_up_interruptible(&migf->poll_wait);
299 }
300
301 /*
302 * The error and the cleanup flows can't run from an
303 * interrupt context
304 */
305 async_data->status = status;
306 queue_work(migf->mvdev->cb_wq, &async_data->work);
307 }
308
mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device * mvdev,struct mlx5_vf_migration_file * migf)309 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
310 struct mlx5_vf_migration_file *migf)
311 {
312 u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
313 u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
314 struct mlx5vf_async_data *async_data;
315 struct mlx5_core_dev *mdev;
316 u32 pdn, mkey;
317 int err;
318
319 lockdep_assert_held(&mvdev->state_mutex);
320 if (mvdev->mdev_detach)
321 return -ENOTCONN;
322
323 mdev = mvdev->mdev;
324 err = mlx5_core_alloc_pd(mdev, &pdn);
325 if (err)
326 return err;
327
328 err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE,
329 0);
330 if (err)
331 goto err_dma_map;
332
333 err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
334 if (err)
335 goto err_create_mkey;
336
337 MLX5_SET(save_vhca_state_in, in, opcode,
338 MLX5_CMD_OP_SAVE_VHCA_STATE);
339 MLX5_SET(save_vhca_state_in, in, op_mod, 0);
340 MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
341 MLX5_SET(save_vhca_state_in, in, mkey, mkey);
342 MLX5_SET(save_vhca_state_in, in, size, migf->total_length);
343
344 async_data = &migf->async_data;
345 async_data->out = kvzalloc(out_size, GFP_KERNEL);
346 if (!async_data->out) {
347 err = -ENOMEM;
348 goto err_out;
349 }
350
351 /* no data exists till the callback comes back */
352 migf->total_length = 0;
353 get_file(migf->filp);
354 async_data->mkey = mkey;
355 async_data->pdn = pdn;
356 err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
357 async_data->out,
358 out_size, mlx5vf_save_callback,
359 &async_data->cb_work);
360 if (err)
361 goto err_exec;
362
363 return 0;
364
365 err_exec:
366 fput(migf->filp);
367 kvfree(async_data->out);
368 err_out:
369 mlx5_core_destroy_mkey(mdev, mkey);
370 err_create_mkey:
371 dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
372 err_dma_map:
373 mlx5_core_dealloc_pd(mdev, pdn);
374 return err;
375 }
376
mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device * mvdev,struct mlx5_vf_migration_file * migf)377 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
378 struct mlx5_vf_migration_file *migf)
379 {
380 struct mlx5_core_dev *mdev;
381 u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {};
382 u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
383 u32 pdn, mkey;
384 int err;
385
386 lockdep_assert_held(&mvdev->state_mutex);
387 if (mvdev->mdev_detach)
388 return -ENOTCONN;
389
390 mutex_lock(&migf->lock);
391 if (!migf->total_length) {
392 err = -EINVAL;
393 goto end;
394 }
395
396 mdev = mvdev->mdev;
397 err = mlx5_core_alloc_pd(mdev, &pdn);
398 if (err)
399 goto end;
400
401 err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
402 if (err)
403 goto err_reg;
404
405 err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
406 if (err)
407 goto err_mkey;
408
409 MLX5_SET(load_vhca_state_in, in, opcode,
410 MLX5_CMD_OP_LOAD_VHCA_STATE);
411 MLX5_SET(load_vhca_state_in, in, op_mod, 0);
412 MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
413 MLX5_SET(load_vhca_state_in, in, mkey, mkey);
414 MLX5_SET(load_vhca_state_in, in, size, migf->total_length);
415
416 err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out);
417
418 mlx5_core_destroy_mkey(mdev, mkey);
419 err_mkey:
420 dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
421 err_reg:
422 mlx5_core_dealloc_pd(mdev, pdn);
423 end:
424 mutex_unlock(&migf->lock);
425 return err;
426 }
427
/*
 * Reduce the number of ranges in @root from @cur_nodes to @req_nodes by
 * merging neighbours.
 *
 * With @req_nodes == 1 every node is folded into the first one. Otherwise
 * the pair of consecutive nodes with the smallest gap between them is
 * merged repeatedly until @req_nodes remain. Merged-away nodes are only
 * removed from the tree, not freed here.
 *
 * An empty tree, or a tree with no adjacent pair left to merge, triggers
 * a one-time warning and the function stops instead of dereferencing an
 * unset node pointer.
 */
static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
			   u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap;
	unsigned long curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

		/* Empty tree: nothing to combine, and 'last' would be unset */
		if (WARN_ON_ONCE(!comb_start))
			return;

		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		prev = NULL;
		comb_start = NULL;
		comb_end = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}

		/* Fewer than two nodes in the tree: no pair was selected */
		if (WARN_ON_ONCE(!comb_start || !comb_end))
			break;

		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
473
/*
 * Create the firmware PAGE_TRACK general object for this VF.
 *
 * @ranges holds @nnodes address ranges; when that exceeds the device's
 * pg_track_max_num_range they are first merged by combine_ranges(). The
 * object is bound to the tracker's fw_qp as reporting QP and uses the
 * host_qp's page and message sizes. On success the object id is stored
 * in mvdev->tracker.id.
 *
 * Returns 0 on success, -ENOMEM if the command buffer cannot be
 * allocated, -EOPNOTSUPP when the total tracked size falls outside the
 * device's supported log address-space window, or the command error.
 */
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	if (num_ranges > max_num_range) {
		combine_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
		record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		/*
		 * Interval tree nodes are closed intervals [start, last], so
		 * the byte count must include the last address.
		 */
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	/* All nodes are expected to have been consumed by the loop */
	WARN_ON(node);

	/*
	 * Round the total up to a power of two before taking the log, so the
	 * reported address space is never smaller than what is tracked.
	 */
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}
556
/*
 * Destroy the PAGE_TRACK general object identified by @tracker_id.
 * Returns the result of the command execution.
 */
static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
				      u32 tracker_id)
{
	u32 hdr_out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	u32 hdr_in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};

	MLX5_SET(general_obj_in_cmd_hdr, hdr_in, obj_id, tracker_id);
	MLX5_SET(general_obj_in_cmd_hdr, hdr_in, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, hdr_in, opcode,
		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);

	return mlx5_cmd_exec(mdev, hdr_in, sizeof(hdr_in), hdr_out,
			     sizeof(hdr_out));
}
569
/*
 * Modify the PAGE_TRACK object @tracker_id: set its range start address
 * to @iova, its length to @length and its state to @tracker_state, with
 * modify_field_select = 0x3. Returns the result of the command execution.
 */
static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	void *hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in,
				 general_obj_in_cmd_hdr);
	void *ctx = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);

	/* General object header */
	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode,
		 MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, tracker_id);

	/* Page-track object context */
	MLX5_SET64(page_track, ctx, modify_field_select, 0x3);
	MLX5_SET64(page_track, ctx, range_start_address, iova);
	MLX5_SET64(page_track, ctx, length, length);
	MLX5_SET(page_track, ctx, state, tracker_state);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}
592
alloc_cq_frag_buf(struct mlx5_core_dev * mdev,struct mlx5_vhca_cq_buf * buf,int nent,int cqe_size)593 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
594 struct mlx5_vhca_cq_buf *buf, int nent,
595 int cqe_size)
596 {
597 struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
598 u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
599 u8 log_wq_sz = ilog2(cqe_size);
600 int err;
601
602 err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
603 mdev->priv.numa_node);
604 if (err)
605 return err;
606
607 mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
608 buf->cqe_size = cqe_size;
609 buf->nent = nent;
610 return 0;
611 }
612
init_cq_frag_buf(struct mlx5_vhca_cq_buf * buf)613 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
614 {
615 struct mlx5_cqe64 *cqe64;
616 void *cqe;
617 int i;
618
619 for (i = 0; i < buf->nent; i++) {
620 cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
621 cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
622 cqe64->op_own = MLX5_CQE_INVALID << 4;
623 }
624 }
625
mlx5vf_destroy_cq(struct mlx5_core_dev * mdev,struct mlx5_vhca_cq * cq)626 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
627 struct mlx5_vhca_cq *cq)
628 {
629 mlx5_core_destroy_cq(mdev, &cq->mcq);
630 mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
631 mlx5_db_free(mdev, &cq->db);
632 }
633
mlx5vf_cq_event(struct mlx5_core_cq * mcq,enum mlx5_event type)634 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
635 {
636 if (type != MLX5_EVENT_TYPE_CQ_ERROR)
637 return;
638
639 set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
640 tracker.cq.mcq));
641 }
642
mlx5vf_event_notifier(struct notifier_block * nb,unsigned long type,void * data)643 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
644 void *data)
645 {
646 struct mlx5_vhca_page_tracker *tracker =
647 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
648 struct mlx5vf_pci_core_device *mvdev = container_of(
649 tracker, struct mlx5vf_pci_core_device, tracker);
650 struct mlx5_eqe *eqe = data;
651 u8 event_type = (u8)type;
652 u8 queue_type;
653 int qp_num;
654
655 switch (event_type) {
656 case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
657 case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
658 case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
659 queue_type = eqe->data.qp_srq.type;
660 if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
661 break;
662 qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
663 if (qp_num != tracker->host_qp->qpn &&
664 qp_num != tracker->fw_qp->qpn)
665 break;
666 set_tracker_error(mvdev);
667 break;
668 default:
669 break;
670 }
671
672 return NOTIFY_OK;
673 }
674
mlx5vf_cq_complete(struct mlx5_core_cq * mcq,struct mlx5_eqe * eqe)675 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
676 struct mlx5_eqe *eqe)
677 {
678 struct mlx5vf_pci_core_device *mvdev =
679 container_of(mcq, struct mlx5vf_pci_core_device,
680 tracker.cq.mcq);
681
682 complete(&mvdev->tracker_comp);
683 }
684
/*
 * Create the tracker's completion queue with room for at least @ncqe
 * entries (rounded up to a power of two).
 *
 * Allocates the doorbell record and the CQE buffer, marks all CQEs
 * invalid, creates the CQ on an EQ picked from the current CPU and arms
 * it. The CQE size is 128 bytes when the cache line is 128 bytes,
 * otherwise 64.
 *
 * Returns 0 on success or a negative error with all partial allocations
 * released.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	ncqe = roundup_pow_of_two(ncqe);

	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	cq->mcq.cqe_sz = cqe_size;
	/* Consumer-index and arm doorbells are consecutive dwords */
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;

	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto free_db;

	init_cq_frag_buf(&cq->buf);

	/* One physical address entry per buffer page follows the command */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto free_buf;
	}

	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn);
	if (err)
		goto free_in;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size,
		 cq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);

	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);

	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto free_in;

	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

free_in:
	kvfree(in);
free_buf:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
free_db:
	mlx5_db_free(mdev, &cq->db);
	return err;
}
754
755 static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev * mdev,struct mlx5_vhca_page_tracker * tracker,u32 max_recv_wr)756 mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
757 struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
758 {
759 u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
760 struct mlx5_vhca_qp *qp;
761 u8 log_rq_stride;
762 u8 log_rq_sz;
763 void *qpc;
764 int inlen;
765 void *in;
766 int err;
767
768 qp = kzalloc(sizeof(*qp), GFP_KERNEL);
769 if (!qp)
770 return ERR_PTR(-ENOMEM);
771
772 qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
773 log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
774 log_rq_sz = ilog2(qp->rq.wqe_cnt);
775 err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
776 if (err)
777 goto err_free;
778
779 if (max_recv_wr) {
780 err = mlx5_frag_buf_alloc_node(mdev,
781 wq_get_byte_sz(log_rq_sz, log_rq_stride),
782 &qp->buf, mdev->priv.numa_node);
783 if (err)
784 goto err_db_free;
785 mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
786 }
787
788 qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
789 inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
790 MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
791 qp->buf.npages;
792 in = kvzalloc(inlen, GFP_KERNEL);
793 if (!in) {
794 err = -ENOMEM;
795 goto err_in;
796 }
797
798 qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
799 MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
800 MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
801 MLX5_SET(qpc, qpc, pd, tracker->pdn);
802 MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
803 MLX5_SET(qpc, qpc, log_page_size,
804 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
805 MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
806 if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
807 MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
808 MLX5_SET(qpc, qpc, no_sq, 1);
809 if (max_recv_wr) {
810 MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
811 MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
812 MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
813 MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
814 MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
815 mlx5_fill_page_frag_array(&qp->buf,
816 (__be64 *)MLX5_ADDR_OF(create_qp_in,
817 in, pas));
818 } else {
819 MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
820 }
821
822 MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
823 err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
824 kvfree(in);
825 if (err)
826 goto err_in;
827
828 qp->qpn = MLX5_GET(create_qp_out, out, qpn);
829 return qp;
830
831 err_in:
832 if (max_recv_wr)
833 mlx5_frag_buf_free(mdev, &qp->buf);
834 err_db_free:
835 mlx5_db_free(mdev, &qp->db);
836 err_free:
837 kfree(qp);
838 return ERR_PTR(err);
839 }
840
mlx5vf_post_recv(struct mlx5_vhca_qp * qp)841 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
842 {
843 struct mlx5_wqe_data_seg *data;
844 unsigned int ix;
845
846 WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
847 ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
848 data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
849 data->byte_count = cpu_to_be32(qp->max_msg_size);
850 data->lkey = cpu_to_be32(qp->recv_buf.mkey);
851 data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
852 qp->rq.pc++;
853 /* Make sure that descriptors are written before doorbell record. */
854 dma_wmb();
855 *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
856 }
857
/*
 * Walk @qp through the RST -> INIT -> RTR (-> RTS) transitions, connected
 * to @remote_qpn.
 *
 * For the host QP (@host_qp true) the whole receive queue is filled with
 * WQEs after INIT, and the QP is left in RTR. Otherwise the QP is moved
 * on to RTS.
 *
 * Returns 0 on success or the error of the first failing command.
 */
static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
			      bool host_qp)
{
	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;
	int err;

	/* RST -> INIT */
	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);
	err = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
	if (err)
		return err;

	if (host_qp) {
		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
		int wqe;

		/* Fill the RQ, one max_msg_size slice per WQE */
		for (wqe = 0; wqe < qp->rq.wqe_cnt; wqe++) {
			mlx5vf_post_recv(qp);
			recv_buf->next_rq_offset += qp->max_msg_size;
		}
	}

	/* INIT -> RTR */
	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
	err = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
	if (err || host_qp)
		return err;

	/* RTR -> RTS */
	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
	MLX5_SET(qpc, qpc, retry_count, 7);
	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
}
916
mlx5vf_destroy_qp(struct mlx5_core_dev * mdev,struct mlx5_vhca_qp * qp)917 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
918 struct mlx5_vhca_qp *qp)
919 {
920 u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
921
922 MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
923 MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
924 mlx5_cmd_exec_in(mdev, destroy_qp, in);
925
926 mlx5_frag_buf_free(mdev, &qp->buf);
927 mlx5_db_free(mdev, &qp->db);
928 kfree(qp);
929 }
930
free_recv_pages(struct mlx5_vhca_recv_buf * recv_buf)931 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
932 {
933 int i;
934
935 /* Undo alloc_pages_bulk_array() */
936 for (i = 0; i < recv_buf->npages; i++)
937 __free_page(recv_buf->page_list[i]);
938
939 kvfree(recv_buf->page_list);
940 }
941
alloc_recv_pages(struct mlx5_vhca_recv_buf * recv_buf,unsigned int npages)942 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
943 unsigned int npages)
944 {
945 unsigned int filled = 0, done = 0;
946 int i;
947
948 recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
949 GFP_KERNEL);
950 if (!recv_buf->page_list)
951 return -ENOMEM;
952
953 for (;;) {
954 filled = alloc_pages_bulk_array(GFP_KERNEL, npages - done,
955 recv_buf->page_list + done);
956 if (!filled)
957 goto err;
958
959 done += filled;
960 if (done == npages)
961 break;
962 }
963
964 recv_buf->npages = npages;
965 return 0;
966
967 err:
968 for (i = 0; i < npages; i++) {
969 if (recv_buf->page_list[i])
970 __free_page(recv_buf->page_list[i]);
971 }
972
973 kvfree(recv_buf->page_list);
974 return -ENOMEM;
975 }
976
register_dma_recv_pages(struct mlx5_core_dev * mdev,struct mlx5_vhca_recv_buf * recv_buf)977 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
978 struct mlx5_vhca_recv_buf *recv_buf)
979 {
980 int i, j;
981
982 recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
983 sizeof(*recv_buf->dma_addrs),
984 GFP_KERNEL);
985 if (!recv_buf->dma_addrs)
986 return -ENOMEM;
987
988 for (i = 0; i < recv_buf->npages; i++) {
989 recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
990 recv_buf->page_list[i],
991 0, PAGE_SIZE,
992 DMA_FROM_DEVICE);
993 if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
994 goto error;
995 }
996 return 0;
997
998 error:
999 for (j = 0; j < i; j++)
1000 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1001 PAGE_SIZE, DMA_FROM_DEVICE);
1002
1003 kvfree(recv_buf->dma_addrs);
1004 return -ENOMEM;
1005 }
1006
unregister_dma_recv_pages(struct mlx5_core_dev * mdev,struct mlx5_vhca_recv_buf * recv_buf)1007 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1008 struct mlx5_vhca_recv_buf *recv_buf)
1009 {
1010 int i;
1011
1012 for (i = 0; i < recv_buf->npages; i++)
1013 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1014 PAGE_SIZE, DMA_FROM_DEVICE);
1015
1016 kvfree(recv_buf->dma_addrs);
1017 }
1018
mlx5vf_free_qp_recv_resources(struct mlx5_core_dev * mdev,struct mlx5_vhca_qp * qp)1019 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1020 struct mlx5_vhca_qp *qp)
1021 {
1022 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1023
1024 mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1025 unregister_dma_recv_pages(mdev, recv_buf);
1026 free_recv_pages(&qp->recv_buf);
1027 }
1028
/*
 * Set up the receive buffer of @qp.
 *
 * Allocates enough pages to cover @rq_size bytes (rounded up to whole
 * pages), DMA-maps them and creates an mkey over the buffer in protection
 * domain @pdn, stored in qp->recv_buf.mkey.
 *
 * Returns 0 on success or a negative errno; on failure every step that
 * already succeeded is rolled back.
 */
static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp, u32 pdn,
					  u64 rq_size)
{
	struct mlx5_vhca_recv_buf *rbuf = &qp->recv_buf;
	unsigned int page_cnt = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
	int ret;

	ret = alloc_recv_pages(rbuf, page_cnt);
	if (ret < 0)
		return ret;

	ret = register_dma_recv_pages(mdev, rbuf);
	if (ret)
		goto free_pages;

	ret = _create_mkey(mdev, pdn, NULL, rbuf, &rbuf->mkey);
	if (!ret)
		return 0;

	/* mkey creation failed: unwind the DMA mappings, then the pages. */
	unregister_dma_recv_pages(mdev, rbuf);
free_pages:
	free_recv_pages(rbuf);
	return ret;
}
1057
1058 static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device * mvdev)1059 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
1060 {
1061 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1062 struct mlx5_core_dev *mdev = mvdev->mdev;
1063
1064 lockdep_assert_held(&mvdev->state_mutex);
1065
1066 if (!mvdev->log_active)
1067 return;
1068
1069 WARN_ON(mvdev->mdev_detach);
1070
1071 mlx5_eq_notifier_unregister(mdev, &tracker->nb);
1072 mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
1073 mlx5vf_destroy_qp(mdev, tracker->fw_qp);
1074 mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
1075 mlx5vf_destroy_qp(mdev, tracker->host_qp);
1076 mlx5vf_destroy_cq(mdev, &tracker->cq);
1077 mlx5_core_dealloc_pd(mdev, tracker->pdn);
1078 mlx5_put_uars_page(mdev, tracker->uar);
1079 mvdev->log_active = false;
1080 }
1081
mlx5vf_stop_page_tracker(struct vfio_device * vdev)1082 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1083 {
1084 struct mlx5vf_pci_core_device *mvdev = container_of(
1085 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1086
1087 mutex_lock(&mvdev->state_mutex);
1088 if (!mvdev->log_active)
1089 goto end;
1090
1091 _mlx5vf_free_page_tracker_resources(mvdev);
1092 mvdev->log_active = false;
1093 end:
1094 mlx5vf_state_mutex_unlock(mvdev);
1095 return 0;
1096 }
1097
/*
 * Start dirty-page tracking for @vdev over the IOVA @ranges (@nnodes nodes).
 *
 * Under state_mutex this builds, in order: UAR page, PD, CQ, the host QP
 * with its receive buffer, the firmware-side QP, connects the two QPs to
 * each other, creates the tracker object and registers the event notifier.
 *
 * @page_size is in/out: the requested tracking granularity on entry; on
 * success it is overwritten with the granularity actually used, which is
 * the request (rounded down to a power of two by ilog2()) clamped to the
 * device's pg_track_log_min/max_page_size capabilities.
 *
 * Returns 0 on success, -ENOTCONN if the mdev is detached, -EINVAL if
 * tracking is already active, or the error of the step that failed. On
 * failure everything created so far is released in reverse order.
 */
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *trk = &mvdev->tracker;
	u8 log_pg = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp, *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 msg_bytes = PAGE_SIZE;
	u64 rq_bytes = SZ_2M;
	u32 log_min, log_max;
	u32 nr_recv_wr;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		ret = -ENOTCONN;
		goto unlock;
	}

	if (mvdev->log_active) {
		ret = -EINVAL;
		goto unlock;
	}

	mdev = mvdev->mdev;
	memset(trk, 0, sizeof(*trk));

	trk->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(trk->uar)) {
		ret = PTR_ERR(trk->uar);
		goto unlock;
	}

	ret = mlx5_core_alloc_pd(mdev, &trk->pdn);
	if (ret)
		goto put_uar;

	/* One receive WR per message-sized slice of the receive queue. */
	nr_recv_wr = DIV_ROUND_UP_ULL(rq_bytes, msg_bytes);
	ret = mlx5vf_create_cq(mdev, trk, nr_recv_wr);
	if (ret)
		goto dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, trk, nr_recv_wr);
	if (IS_ERR(host_qp)) {
		ret = PTR_ERR(host_qp);
		goto destroy_cq;
	}
	host_qp->max_msg_size = msg_bytes;

	/* Clamp the requested granularity to what the device supports. */
	log_min = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
					      pg_track_log_min_page_size);
	log_max = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
					      pg_track_log_max_page_size);
	if (log_pg < log_min)
		log_pg = log_min;
	else if (log_pg > log_max)
		log_pg = log_max;
	host_qp->tracked_page_size = (1ULL << log_pg);

	ret = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, trk->pdn,
					     rq_bytes);
	if (ret)
		goto destroy_host_qp;

	/* The firmware-side QP is created with no receive WRs. */
	fw_qp = mlx5vf_create_rc_qp(mdev, trk, 0);
	if (IS_ERR(fw_qp)) {
		ret = PTR_ERR(fw_qp);
		goto free_recv;
	}

	/* Bring each QP up with the other one as its remote peer. */
	ret = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (ret)
		goto destroy_fw_qp;

	ret = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (ret)
		goto destroy_fw_qp;

	trk->host_qp = host_qp;
	trk->fw_qp = fw_qp;
	ret = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (ret)
		goto destroy_fw_qp;

	MLX5_NB_INIT(&trk->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &trk->nb);
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	/* ret is 0 here: mlx5vf_create_tracker() succeeded. */
	goto unlock;

destroy_fw_qp:
	mlx5vf_destroy_qp(mdev, fw_qp);
free_recv:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
destroy_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
destroy_cq:
	mlx5vf_destroy_cq(mdev, &trk->cq);
dealloc_pd:
	mlx5_core_dealloc_pd(mdev, trk->pdn);
put_uar:
	mlx5_put_uars_page(mdev, trk->uar);
unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}
1208
1209 static void
set_report_output(u32 size,int index,struct mlx5_vhca_qp * qp,struct iova_bitmap * dirty)1210 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1211 struct iova_bitmap *dirty)
1212 {
1213 u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1214 u32 nent = size / entry_size;
1215 struct page *page;
1216 u64 addr;
1217 u64 *buf;
1218 int i;
1219
1220 if (WARN_ON(index >= qp->recv_buf.npages ||
1221 (nent > qp->max_msg_size / entry_size)))
1222 return;
1223
1224 page = qp->recv_buf.page_list[index];
1225 buf = kmap_local_page(page);
1226 for (i = 0; i < nent; i++) {
1227 addr = MLX5_GET(page_track_report_entry, buf + i,
1228 dirty_address_low);
1229 addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1230 dirty_address_high) << 32;
1231 iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1232 }
1233 kunmap_local(buf);
1234 }
1235
1236 static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp * qp,struct mlx5_cqe64 * cqe,struct iova_bitmap * dirty,int * tracker_status)1237 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
1238 struct iova_bitmap *dirty, int *tracker_status)
1239 {
1240 u32 size;
1241 int ix;
1242
1243 qp->rq.cc++;
1244 *tracker_status = be32_to_cpu(cqe->immediate) >> 28;
1245 size = be32_to_cpu(cqe->byte_cnt);
1246 ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
1247
1248 /* zero length CQE, no data */
1249 WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
1250 if (size)
1251 set_report_output(size, ix, qp, dirty);
1252
1253 qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
1254 mlx5vf_post_recv(qp);
1255 }
1256
get_cqe(struct mlx5_vhca_cq * cq,int n)1257 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1258 {
1259 return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1260 }
1261
get_sw_cqe(struct mlx5_vhca_cq * cq,int n)1262 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1263 {
1264 void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1265 struct mlx5_cqe64 *cqe64;
1266
1267 cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1268
1269 if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1270 !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1271 return cqe64;
1272 } else {
1273 return NULL;
1274 }
1275 }
1276
1277 static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq * cq,struct mlx5_vhca_qp * qp,struct iova_bitmap * dirty,int * tracker_status)1278 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1279 struct iova_bitmap *dirty, int *tracker_status)
1280 {
1281 struct mlx5_cqe64 *cqe;
1282 u8 opcode;
1283
1284 cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1285 if (!cqe)
1286 return CQ_EMPTY;
1287
1288 ++cq->mcq.cons_index;
1289 /*
1290 * Make sure we read CQ entry contents after we've checked the
1291 * ownership bit.
1292 */
1293 rmb();
1294 opcode = get_cqe_opcode(cqe);
1295 switch (opcode) {
1296 case MLX5_CQE_RESP_SEND_IMM:
1297 mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1298 return CQ_OK;
1299 default:
1300 return CQ_POLL_ERR;
1301 }
1302 }
1303
mlx5vf_tracker_read_and_clear(struct vfio_device * vdev,unsigned long iova,unsigned long length,struct iova_bitmap * dirty)1304 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1305 unsigned long length,
1306 struct iova_bitmap *dirty)
1307 {
1308 struct mlx5vf_pci_core_device *mvdev = container_of(
1309 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1310 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1311 struct mlx5_vhca_cq *cq = &tracker->cq;
1312 struct mlx5_core_dev *mdev;
1313 int poll_err, err;
1314
1315 mutex_lock(&mvdev->state_mutex);
1316 if (!mvdev->log_active) {
1317 err = -EINVAL;
1318 goto end;
1319 }
1320
1321 if (mvdev->mdev_detach) {
1322 err = -ENOTCONN;
1323 goto end;
1324 }
1325
1326 mdev = mvdev->mdev;
1327 err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1328 MLX5_PAGE_TRACK_STATE_REPORTING);
1329 if (err)
1330 goto end;
1331
1332 tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
1333 while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1334 !tracker->is_err) {
1335 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1336 &tracker->status);
1337 if (poll_err == CQ_EMPTY) {
1338 mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1339 cq->mcq.cons_index);
1340 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1341 dirty, &tracker->status);
1342 if (poll_err == CQ_EMPTY) {
1343 wait_for_completion(&mvdev->tracker_comp);
1344 continue;
1345 }
1346 }
1347 if (poll_err == CQ_POLL_ERR) {
1348 err = -EIO;
1349 goto end;
1350 }
1351 mlx5_cq_set_ci(&cq->mcq);
1352 }
1353
1354 if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1355 tracker->is_err = true;
1356
1357 if (tracker->is_err)
1358 err = -EIO;
1359 end:
1360 mlx5vf_state_mutex_unlock(mvdev);
1361 return err;
1362 }
1363