1 /*
2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
13 * conditions are met:
14 *
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34
35 #include <linux/kref.h>
36 #include <linux/random.h>
37 #include <linux/debugfs.h>
38 #include <linux/export.h>
39 #include <linux/delay.h>
40 #include <linux/dma-buf.h>
41 #include <linux/dma-resv.h>
42 #include <rdma/ib_umem_odp.h>
43 #include "dm.h"
44 #include "mlx5_ib.h"
45 #include "umr.h"
46 #include "data_direct.h"
47
48 enum {
49 MAX_PENDING_REG_MR = 8,
50 };
51
52 #define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
53 #define MLX5_UMR_ALIGN 2048
54
55 static void
56 create_mkey_callback(int status, struct mlx5_async_work *context);
57 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
58 u64 iova, int access_flags,
59 unsigned long page_size, bool populate,
60 int access_mode);
61 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
62
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
64 struct ib_pd *pd)
65 {
66 struct mlx5_ib_dev *dev = to_mdev(pd->device);
67
68 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
69 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
70 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
71 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
72 MLX5_SET(mkc, mkc, lr, 1);
73
74 if (acc & IB_ACCESS_RELAXED_ORDERING) {
75 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
76 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
77
78 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
79 (MLX5_CAP_GEN(dev->mdev,
80 relaxed_ordering_read_pci_enabled) &&
81 pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
82 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
83 }
84
85 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
86 MLX5_SET(mkc, mkc, qpn, 0xffffff);
87 MLX5_SET64(mkc, mkc, start_addr, start_addr);
88 }
89
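/*
 * Pick a rotating 8-bit "variant" for the new mkey so that a reused mkey
 * index does not immediately produce the same key value again.
 */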
static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
91 {
92 u8 key = atomic_inc_return(&dev->mkey_var);
93 void *mkc;
94
95 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
96 MLX5_SET(mkc, mkc, mkey_7_0, key);
97 *mkey = key;
98 }
99
static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
101 struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
102 {
103 int ret;
104
105 assign_mkey_variant(dev, &mkey->key, in);
106 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
107 if (!ret)
108 init_waitqueue_head(&mkey->wait);
109
110 return ret;
111 }
112
static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
114 {
115 struct mlx5_ib_dev *dev = async_create->ent->dev;
116 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
117 size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
118
119 MLX5_SET(create_mkey_in, async_create->in, opcode,
120 MLX5_CMD_OP_CREATE_MKEY);
121 assign_mkey_variant(dev, &async_create->mkey, async_create->in);
122 return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
123 async_create->out, outlen, create_mkey_callback,
124 &async_create->cb_work);
125 }
126
127 static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
128 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
129
static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
131 {
132 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
133
134 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
135 }
136
static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
138 {
139 if (status == -ENXIO) /* core driver is not available */
140 return;
141
142 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
143 if (status != -EREMOTEIO) /* driver specific failure */
144 return;
145
146 /* Failed in FW, print cmd out failure details */
147 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
148 }
149
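/*
 * Store an mkey in the entry's paged queue, growing the page list when the
 * current pages are full. Caller must hold mkeys_queue.lock.
 */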
static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
151 {
152 unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
153 struct mlx5_mkeys_page *page;
154
155 lockdep_assert_held(&ent->mkeys_queue.lock);
156 if (ent->mkeys_queue.ci >=
157 ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
158 page = kzalloc(sizeof(*page), GFP_ATOMIC);
159 if (!page)
160 return -ENOMEM;
161 ent->mkeys_queue.num_pages++;
162 list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
163 } else {
164 page = list_last_entry(&ent->mkeys_queue.pages_list,
165 struct mlx5_mkeys_page, list);
166 }
167
168 page->mkeys[tmp] = mkey;
169 ent->mkeys_queue.ci++;
170 return 0;
171 }
172
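/*
 * Take the most recently stored mkey out of the queue and free the trailing
 * page once it is emptied (the first page is kept allocated).
 */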
static int pop_mkey_locked(struct mlx5_cache_ent *ent)
174 {
175 unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
176 struct mlx5_mkeys_page *last_page;
177 u32 mkey;
178
179 lockdep_assert_held(&ent->mkeys_queue.lock);
180 last_page = list_last_entry(&ent->mkeys_queue.pages_list,
181 struct mlx5_mkeys_page, list);
182 mkey = last_page->mkeys[tmp];
183 last_page->mkeys[tmp] = 0;
184 ent->mkeys_queue.ci--;
185 if (ent->mkeys_queue.num_pages > 1 && !tmp) {
186 list_del(&last_page->list);
187 ent->mkeys_queue.num_pages--;
188 kfree(last_page);
189 }
190 return mkey;
191 }
192
static void create_mkey_callback(int status, struct mlx5_async_work *context)
194 {
195 struct mlx5r_async_create_mkey *mkey_out =
196 container_of(context, struct mlx5r_async_create_mkey, cb_work);
197 struct mlx5_cache_ent *ent = mkey_out->ent;
198 struct mlx5_ib_dev *dev = ent->dev;
199 unsigned long flags;
200
201 if (status) {
202 create_mkey_warn(dev, status, mkey_out->out);
203 kfree(mkey_out);
204 spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
205 ent->pending--;
206 WRITE_ONCE(dev->fill_delay, 1);
207 spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
208 mod_timer(&dev->delay_timer, jiffies + HZ);
209 return;
210 }
211
212 mkey_out->mkey |= mlx5_idx_to_mkey(
213 MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
214 WRITE_ONCE(dev->cache.last_add, jiffies);
215
216 spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
217 push_mkey_locked(ent, mkey_out->mkey);
218 ent->pending--;
219 /* If we are doing fill_to_high_water then keep going. */
220 queue_adjust_cache_locked(ent);
221 spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
222 kfree(mkey_out);
223 }
224
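/*
 * Number of 16-byte translation octwords needed to hold ndescs descriptors
 * for the given access mode (MTT vs KSM/KLM sized entries).
 */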
static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
226 {
227 int ret = 0;
228
229 switch (access_mode) {
230 case MLX5_MKC_ACCESS_MODE_MTT:
231 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
232 sizeof(struct mlx5_mtt));
233 break;
234 case MLX5_MKC_ACCESS_MODE_KSM:
235 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
236 sizeof(struct mlx5_klm));
237 break;
238 default:
239 WARN_ON(1);
240 }
241 return ret;
242 }
243
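/*
 * Fill the mkey context for a cache mkey: created free and UMR enabled so it
 * can later be bound to a user MR via UMR.
 */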
static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
245 {
246 set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
247 ent->dev->umrc.pd);
248 MLX5_SET(mkc, mkc, free, 1);
249 MLX5_SET(mkc, mkc, umr_en, 1);
250 MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
251 MLX5_SET(mkc, mkc, access_mode_4_2,
252 (ent->rb_key.access_mode >> 2) & 0x7);
253 MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
254
255 MLX5_SET(mkc, mkc, translations_octword_size,
256 get_mkc_octo_size(ent->rb_key.access_mode,
257 ent->rb_key.ndescs));
258 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
259 }
260
261 /* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
263 {
264 struct mlx5r_async_create_mkey *async_create;
265 void *mkc;
266 int err = 0;
267 int i;
268
269 for (i = 0; i < num; i++) {
270 async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
271 GFP_KERNEL);
272 if (!async_create)
273 return -ENOMEM;
274 mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
275 memory_key_mkey_entry);
276 set_cache_mkc(ent, mkc);
277 async_create->ent = ent;
278
279 spin_lock_irq(&ent->mkeys_queue.lock);
280 if (ent->pending >= MAX_PENDING_REG_MR) {
281 err = -EAGAIN;
282 goto free_async_create;
283 }
284 ent->pending++;
285 spin_unlock_irq(&ent->mkeys_queue.lock);
286
287 err = mlx5_ib_create_mkey_cb(async_create);
288 if (err) {
289 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
290 goto err_create_mkey;
291 }
292 }
293
294 return 0;
295
296 err_create_mkey:
297 spin_lock_irq(&ent->mkeys_queue.lock);
298 ent->pending--;
299 free_async_create:
300 spin_unlock_irq(&ent->mkeys_queue.lock);
301 kfree(async_create);
302 return err;
303 }
304
/* Synchronously create an mkey in the cache */
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
307 {
308 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
309 void *mkc;
310 u32 *in;
311 int err;
312
313 in = kzalloc(inlen, GFP_KERNEL);
314 if (!in)
315 return -ENOMEM;
316 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
317 set_cache_mkc(ent, mkc);
318
319 err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
320 if (err)
321 goto free_in;
322
323 WRITE_ONCE(ent->dev->cache.last_add, jiffies);
324 free_in:
325 kfree(in);
326 return err;
327 }
328
static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
330 {
331 u32 mkey;
332
333 lockdep_assert_held(&ent->mkeys_queue.lock);
334 if (!ent->mkeys_queue.ci)
335 return;
336 mkey = pop_mkey_locked(ent);
337 spin_unlock_irq(&ent->mkeys_queue.lock);
338 mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
339 spin_lock_irq(&ent->mkeys_queue.lock);
340 }
341
static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
343 bool limit_fill)
344 __acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
345 {
346 int err;
347
348 lockdep_assert_held(&ent->mkeys_queue.lock);
349
350 while (true) {
351 if (limit_fill)
352 target = ent->limit * 2;
353 if (target == ent->pending + ent->mkeys_queue.ci)
354 return 0;
355 if (target > ent->pending + ent->mkeys_queue.ci) {
356 u32 todo = target - (ent->pending + ent->mkeys_queue.ci);
357
358 spin_unlock_irq(&ent->mkeys_queue.lock);
359 err = add_keys(ent, todo);
360 if (err == -EAGAIN)
361 usleep_range(3000, 5000);
362 spin_lock_irq(&ent->mkeys_queue.lock);
363 if (err) {
364 if (err != -EAGAIN)
365 return err;
366 } else
367 return 0;
368 } else {
369 remove_cache_mr_locked(ent);
370 }
371 }
372 }
373
static ssize_t size_write(struct file *filp, const char __user *buf,
375 size_t count, loff_t *pos)
376 {
377 struct mlx5_cache_ent *ent = filp->private_data;
378 u32 target;
379 int err;
380
381 err = kstrtou32_from_user(buf, count, 0, &target);
382 if (err)
383 return err;
384
        /*
         * Target is the new total number of mkeys the user requests; however,
         * we cannot free MRs that are in use. Compute the target value for
         * stored mkeys.
         */
390 spin_lock_irq(&ent->mkeys_queue.lock);
391 if (target < ent->in_use) {
392 err = -EINVAL;
393 goto err_unlock;
394 }
395 target = target - ent->in_use;
396 if (target < ent->limit || target > ent->limit*2) {
397 err = -EINVAL;
398 goto err_unlock;
399 }
400 err = resize_available_mrs(ent, target, false);
401 if (err)
402 goto err_unlock;
403 spin_unlock_irq(&ent->mkeys_queue.lock);
404
405 return count;
406
407 err_unlock:
408 spin_unlock_irq(&ent->mkeys_queue.lock);
409 return err;
410 }
411
static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
413 loff_t *pos)
414 {
415 struct mlx5_cache_ent *ent = filp->private_data;
416 char lbuf[20];
417 int err;
418
419 err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
420 ent->mkeys_queue.ci + ent->in_use);
421 if (err < 0)
422 return err;
423
424 return simple_read_from_buffer(buf, count, pos, lbuf, err);
425 }
426
427 static const struct file_operations size_fops = {
428 .owner = THIS_MODULE,
429 .open = simple_open,
430 .write = size_write,
431 .read = size_read,
432 };
433
static ssize_t limit_write(struct file *filp, const char __user *buf,
435 size_t count, loff_t *pos)
436 {
437 struct mlx5_cache_ent *ent = filp->private_data;
438 u32 var;
439 int err;
440
441 err = kstrtou32_from_user(buf, count, 0, &var);
442 if (err)
443 return err;
444
445 /*
446 * Upon set we immediately fill the cache to high water mark implied by
447 * the limit.
448 */
449 spin_lock_irq(&ent->mkeys_queue.lock);
450 ent->limit = var;
451 err = resize_available_mrs(ent, 0, true);
452 spin_unlock_irq(&ent->mkeys_queue.lock);
453 if (err)
454 return err;
455 return count;
456 }
457
static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
459 loff_t *pos)
460 {
461 struct mlx5_cache_ent *ent = filp->private_data;
462 char lbuf[20];
463 int err;
464
465 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
466 if (err < 0)
467 return err;
468
469 return simple_read_from_buffer(buf, count, pos, lbuf, err);
470 }
471
472 static const struct file_operations limit_fops = {
473 .owner = THIS_MODULE,
474 .open = simple_open,
475 .write = limit_write,
476 .read = limit_read,
477 };
478
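/* True if any cache entry is currently below its limit, i.e. still refilling. */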
static bool someone_adding(struct mlx5_mkey_cache *cache)
480 {
481 struct mlx5_cache_ent *ent;
482 struct rb_node *node;
483 bool ret;
484
485 mutex_lock(&cache->rb_lock);
486 for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
487 ent = rb_entry(node, struct mlx5_cache_ent, node);
488 spin_lock_irq(&ent->mkeys_queue.lock);
489 ret = ent->mkeys_queue.ci < ent->limit;
490 spin_unlock_irq(&ent->mkeys_queue.lock);
491 if (ret) {
492 mutex_unlock(&cache->rb_lock);
493 return true;
494 }
495 }
496 mutex_unlock(&cache->rb_lock);
497 return false;
498 }
499
500 /*
501 * Check if the bucket is outside the high/low water mark and schedule an async
502 * update. The cache refill has hysteresis, once the low water mark is hit it is
503 * refilled up to the high mark.
504 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
506 {
507 lockdep_assert_held(&ent->mkeys_queue.lock);
508
509 if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
510 return;
511 if (ent->mkeys_queue.ci < ent->limit) {
512 ent->fill_to_high_water = true;
513 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
514 } else if (ent->fill_to_high_water &&
515 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
                /*
                 * Once we start populating due to hitting a low water mark,
                 * continue until we pass the high water mark.
                 */
520 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
521 } else if (ent->mkeys_queue.ci == 2 * ent->limit) {
522 ent->fill_to_high_water = false;
523 } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
524 /* Queue deletion of excess entries */
525 ent->fill_to_high_water = false;
526 if (ent->pending)
527 queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
528 msecs_to_jiffies(1000));
529 else
530 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
531 }
532 }
533
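/*
 * Destroy every mkey stored in the entry, dropping the lock around each
 * destroy command.
 */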
static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
535 {
536 u32 mkey;
537
538 spin_lock_irq(&ent->mkeys_queue.lock);
539 while (ent->mkeys_queue.ci) {
540 mkey = pop_mkey_locked(ent);
541 spin_unlock_irq(&ent->mkeys_queue.lock);
542 mlx5_core_destroy_mkey(dev->mdev, mkey);
543 spin_lock_irq(&ent->mkeys_queue.lock);
544 }
545 ent->tmp_cleanup_scheduled = false;
546 spin_unlock_irq(&ent->mkeys_queue.lock);
547 }
548
static void __cache_work_func(struct mlx5_cache_ent *ent)
550 {
551 struct mlx5_ib_dev *dev = ent->dev;
552 struct mlx5_mkey_cache *cache = &dev->cache;
553 int err;
554
555 spin_lock_irq(&ent->mkeys_queue.lock);
556 if (ent->disabled)
557 goto out;
558
559 if (ent->fill_to_high_water &&
560 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
561 !READ_ONCE(dev->fill_delay)) {
562 spin_unlock_irq(&ent->mkeys_queue.lock);
563 err = add_keys(ent, 1);
564 spin_lock_irq(&ent->mkeys_queue.lock);
565 if (ent->disabled)
566 goto out;
567 if (err) {
568 /*
569 * EAGAIN only happens if there are pending MRs, so we
570 * will be rescheduled when storing them. The only
571 * failure path here is ENOMEM.
572 */
573 if (err != -EAGAIN) {
574 mlx5_ib_warn(
575 dev,
576 "add keys command failed, err %d\n",
577 err);
578 queue_delayed_work(cache->wq, &ent->dwork,
579 msecs_to_jiffies(1000));
580 }
581 }
582 } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
583 bool need_delay;
584
                /*
                 * The remove_cache_mr() logic is performed as a garbage
                 * collection task. Such a task is intended to run when no
                 * other active processes are running.
                 *
                 * need_resched() returns true if there are user tasks to be
                 * activated in the near future.
                 *
                 * In that case, don't execute remove_cache_mr() and postpone
                 * the garbage collection work to the next cycle, in order to
                 * free CPU resources to other tasks.
                 */
597 spin_unlock_irq(&ent->mkeys_queue.lock);
598 need_delay = need_resched() || someone_adding(cache) ||
599 !time_after(jiffies,
600 READ_ONCE(cache->last_add) + 300 * HZ);
601 spin_lock_irq(&ent->mkeys_queue.lock);
602 if (ent->disabled)
603 goto out;
604 if (need_delay) {
605 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
606 goto out;
607 }
608 remove_cache_mr_locked(ent);
609 queue_adjust_cache_locked(ent);
610 }
611 out:
612 spin_unlock_irq(&ent->mkeys_queue.lock);
613 }
614
static void delayed_cache_work_func(struct work_struct *work)
616 {
617 struct mlx5_cache_ent *ent;
618
619 ent = container_of(work, struct mlx5_cache_ent, dwork.work);
620 /* temp entries are never filled, only cleaned */
621 if (ent->is_tmp)
622 clean_keys(ent->dev, ent);
623 else
624 __cache_work_func(ent);
625 }
626
static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
628 struct mlx5r_cache_rb_key key2)
629 {
630 int res;
631
632 res = key1.ats - key2.ats;
633 if (res)
634 return res;
635
636 res = key1.access_mode - key2.access_mode;
637 if (res)
638 return res;
639
640 res = key1.access_flags - key2.access_flags;
641 if (res)
642 return res;
643
        /*
         * Keep ndescs last in the compare order since the find function
         * requires an exact match on all other properties and only the
         * closest match in size.
         */
649 return key1.ndescs - key2.ndescs;
650 }
651
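/*
 * Insert a new entry into the cache rb-tree, keyed by
 * (ats, access_mode, access_flags, ndescs). Returns -EEXIST on a duplicate.
 */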
static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
653 struct mlx5_cache_ent *ent)
654 {
655 struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
656 struct mlx5_cache_ent *cur;
657 int cmp;
658
659 /* Figure out where to put new node */
660 while (*new) {
661 cur = rb_entry(*new, struct mlx5_cache_ent, node);
662 parent = *new;
663 cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
664 if (cmp > 0)
665 new = &((*new)->rb_left);
666 if (cmp < 0)
667 new = &((*new)->rb_right);
668 if (cmp == 0)
669 return -EEXIST;
670 }
671
672 /* Add new node and rebalance tree. */
673 rb_link_node(&ent->node, parent, new);
674 rb_insert_color(&ent->node, &cache->rb_root);
675
676 return 0;
677 }
678
679 static struct mlx5_cache_ent *
mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
681 struct mlx5r_cache_rb_key rb_key)
682 {
683 struct rb_node *node = dev->cache.rb_root.rb_node;
684 struct mlx5_cache_ent *cur, *smallest = NULL;
685 u64 ndescs_limit;
686 int cmp;
687
688 /*
689 * Find the smallest ent with order >= requested_order.
690 */
691 while (node) {
692 cur = rb_entry(node, struct mlx5_cache_ent, node);
693 cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
694 if (cmp > 0) {
695 smallest = cur;
696 node = node->rb_left;
697 }
698 if (cmp < 0)
699 node = node->rb_right;
700 if (cmp == 0)
701 return cur;
702 }
703
704 /*
705 * Limit the usage of mkeys larger than twice the required size while
706 * also allowing the usage of smallest cache entry for small MRs.
707 */
708 ndescs_limit = max_t(u64, rb_key.ndescs * 2,
709 MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
710
711 return (smallest &&
712 smallest->rb_key.access_mode == rb_key.access_mode &&
713 smallest->rb_key.access_flags == rb_key.access_flags &&
714 smallest->rb_key.ats == rb_key.ats &&
715 smallest->rb_key.ndescs <= ndescs_limit) ?
716 smallest :
717 NULL;
718 }
719
static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
721 struct mlx5_cache_ent *ent,
722 int access_flags)
723 {
724 struct mlx5_ib_mr *mr;
725 int err;
726
727 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
728 if (!mr)
729 return ERR_PTR(-ENOMEM);
730
731 spin_lock_irq(&ent->mkeys_queue.lock);
732 ent->in_use++;
733
734 if (!ent->mkeys_queue.ci) {
735 queue_adjust_cache_locked(ent);
736 ent->miss++;
737 spin_unlock_irq(&ent->mkeys_queue.lock);
738 err = create_cache_mkey(ent, &mr->mmkey.key);
739 if (err) {
740 spin_lock_irq(&ent->mkeys_queue.lock);
741 ent->in_use--;
742 spin_unlock_irq(&ent->mkeys_queue.lock);
743 kfree(mr);
744 return ERR_PTR(err);
745 }
746 } else {
747 mr->mmkey.key = pop_mkey_locked(ent);
748 queue_adjust_cache_locked(ent);
749 spin_unlock_irq(&ent->mkeys_queue.lock);
750 }
751 mr->mmkey.cache_ent = ent;
752 mr->mmkey.type = MLX5_MKEY_MR;
753 mr->mmkey.rb_key = ent->rb_key;
754 mr->mmkey.cacheable = true;
755 init_waitqueue_head(&mr->mmkey.wait);
756 return mr;
757 }
758
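/*
 * The subset of access flags that UMR cannot reconfigure on this device; MRs
 * differing in these bits must come from different cache entries.
 */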
static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
760 int access_flags)
761 {
762 int ret = 0;
763
764 if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
765 MLX5_CAP_GEN(dev->mdev, atomic) &&
766 MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
767 ret |= IB_ACCESS_REMOTE_ATOMIC;
768
769 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
770 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
771 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
772 ret |= IB_ACCESS_RELAXED_ORDERING;
773
774 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
775 (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
776 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
777 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
778 ret |= IB_ACCESS_RELAXED_ORDERING;
779
780 return ret;
781 }
782
struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
784 int access_flags, int access_mode,
785 int ndescs)
786 {
787 struct mlx5r_cache_rb_key rb_key = {
788 .ndescs = ndescs,
789 .access_mode = access_mode,
790 .access_flags = get_unchangeable_access_flags(dev, access_flags)
791 };
792 struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
793
794 if (!ent)
795 return ERR_PTR(-EOPNOTSUPP);
796
797 return _mlx5_mr_cache_alloc(dev, ent, access_flags);
798 }
799
static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
801 {
802 if (!mlx5_debugfs_root || dev->is_rep)
803 return;
804
805 debugfs_remove_recursive(dev->cache.fs_root);
806 dev->cache.fs_root = NULL;
807 }
808
static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
810 struct mlx5_cache_ent *ent)
811 {
812 int order = order_base_2(ent->rb_key.ndescs);
813 struct dentry *dir;
814
815 if (!mlx5_debugfs_root || dev->is_rep)
816 return;
817
818 if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
819 order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
820
821 sprintf(ent->name, "%d", order);
822 dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
823 debugfs_create_file("size", 0600, dir, ent, &size_fops);
824 debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
825 debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
826 debugfs_create_u32("miss", 0600, dir, &ent->miss);
827 }
828
static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
830 {
831 struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
832 struct mlx5_mkey_cache *cache = &dev->cache;
833
834 if (!mlx5_debugfs_root || dev->is_rep)
835 return;
836
837 cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
838 }
839
static void delay_time_func(struct timer_list *t)
841 {
842 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
843
844 WRITE_ONCE(dev->fill_delay, 0);
845 }
846
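/* Allocate the first page of the per-entry mkey queue and init its lock and list. */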
static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
848 {
849 struct mlx5_mkeys_page *page;
850
851 page = kzalloc(sizeof(*page), GFP_KERNEL);
852 if (!page)
853 return -ENOMEM;
854 INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
855 spin_lock_init(&ent->mkeys_queue.lock);
856 list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
857 ent->mkeys_queue.num_pages++;
858 return 0;
859 }
860
static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
862 {
863 struct mlx5_mkeys_page *page;
864
865 WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
866 page = list_last_entry(&ent->mkeys_queue.pages_list,
867 struct mlx5_mkeys_page, list);
868 list_del(&page->list);
869 kfree(page);
870 }
871
872 struct mlx5_cache_ent *
mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
874 struct mlx5r_cache_rb_key rb_key,
875 bool persistent_entry)
876 {
877 struct mlx5_cache_ent *ent;
878 int order;
879 int ret;
880
881 ent = kzalloc(sizeof(*ent), GFP_KERNEL);
882 if (!ent)
883 return ERR_PTR(-ENOMEM);
884
885 ret = mlx5r_mkeys_init(ent);
886 if (ret)
887 goto mkeys_err;
888 ent->rb_key = rb_key;
889 ent->dev = dev;
890 ent->is_tmp = !persistent_entry;
891
892 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
893
894 ret = mlx5_cache_ent_insert(&dev->cache, ent);
895 if (ret)
896 goto ent_insert_err;
897
898 if (persistent_entry) {
899 if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
900 order = MLX5_IMR_KSM_CACHE_ENTRY;
901 else
902 order = order_base_2(rb_key.ndescs) - 2;
903
904 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
905 !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
906 mlx5r_umr_can_load_pas(dev, 0))
907 ent->limit = dev->mdev->profile.mr_cache[order].limit;
908 else
909 ent->limit = 0;
910
911 mlx5_mkey_cache_debugfs_add_ent(dev, ent);
912 }
913
914 return ent;
915 ent_insert_err:
916 mlx5r_mkeys_uninit(ent);
917 mkeys_err:
918 kfree(ent);
919 return ERR_PTR(ret);
920 }
921
static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev)
923 {
924 struct rb_root *root = &dev->cache.rb_root;
925 struct mlx5_cache_ent *ent;
926 struct rb_node *node;
927
928 mutex_lock(&dev->cache.rb_lock);
929 node = rb_first(root);
930 while (node) {
931 ent = rb_entry(node, struct mlx5_cache_ent, node);
932 node = rb_next(node);
933 clean_keys(dev, ent);
934 rb_erase(&ent->node, root);
935 mlx5r_mkeys_uninit(ent);
936 kfree(ent);
937 }
938 mutex_unlock(&dev->cache.rb_lock);
939 }
940
int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
942 {
943 struct mlx5_mkey_cache *cache = &dev->cache;
944 struct rb_root *root = &dev->cache.rb_root;
945 struct mlx5r_cache_rb_key rb_key = {
946 .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
947 };
948 struct mlx5_cache_ent *ent;
949 struct rb_node *node;
950 int ret;
951 int i;
952
953 mutex_init(&dev->slow_path_mutex);
954 mutex_init(&dev->cache.rb_lock);
955 dev->cache.rb_root = RB_ROOT;
956 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
957 if (!cache->wq) {
958 mlx5_ib_warn(dev, "failed to create work queue\n");
959 return -ENOMEM;
960 }
961
962 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
963 timer_setup(&dev->delay_timer, delay_time_func, 0);
964 mlx5_mkey_cache_debugfs_init(dev);
965 mutex_lock(&cache->rb_lock);
966 for (i = 0; i <= mkey_cache_max_order(dev); i++) {
967 rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
968 ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
969 if (IS_ERR(ent)) {
970 ret = PTR_ERR(ent);
971 goto err;
972 }
973 }
974
975 ret = mlx5_odp_init_mkey_cache(dev);
976 if (ret)
977 goto err;
978
979 mutex_unlock(&cache->rb_lock);
980 for (node = rb_first(root); node; node = rb_next(node)) {
981 ent = rb_entry(node, struct mlx5_cache_ent, node);
982 spin_lock_irq(&ent->mkeys_queue.lock);
983 queue_adjust_cache_locked(ent);
984 spin_unlock_irq(&ent->mkeys_queue.lock);
985 }
986
987 return 0;
988
989 err:
990 mutex_unlock(&cache->rb_lock);
991 mlx5_mkey_cache_debugfs_cleanup(dev);
992 mlx5r_destroy_cache_entries(dev);
993 destroy_workqueue(cache->wq);
994 mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
995 return ret;
996 }
997
void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
999 {
1000 struct rb_root *root = &dev->cache.rb_root;
1001 struct mlx5_cache_ent *ent;
1002 struct rb_node *node;
1003
1004 if (!dev->cache.wq)
1005 return;
1006
1007 mutex_lock(&dev->cache.rb_lock);
1008 for (node = rb_first(root); node; node = rb_next(node)) {
1009 ent = rb_entry(node, struct mlx5_cache_ent, node);
1010 spin_lock_irq(&ent->mkeys_queue.lock);
1011 ent->disabled = true;
1012 spin_unlock_irq(&ent->mkeys_queue.lock);
1013 cancel_delayed_work(&ent->dwork);
1014 }
1015 mutex_unlock(&dev->cache.rb_lock);
1016
1017 /*
1018 * After all entries are disabled and will not reschedule on WQ,
1019 * flush it and all async commands.
1020 */
1021 flush_workqueue(dev->cache.wq);
1022
1023 mlx5_mkey_cache_debugfs_cleanup(dev);
1024 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
1025
1026 /* At this point all entries are disabled and have no concurrent work. */
1027 mlx5r_destroy_cache_entries(dev);
1028
1029 destroy_workqueue(dev->cache.wq);
1030 del_timer_sync(&dev->delay_timer);
1031 }
1032
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
1034 {
1035 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1036 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1037 struct mlx5_ib_mr *mr;
1038 void *mkc;
1039 u32 *in;
1040 int err;
1041
1042 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1043 if (!mr)
1044 return ERR_PTR(-ENOMEM);
1045
1046 in = kzalloc(inlen, GFP_KERNEL);
1047 if (!in) {
1048 err = -ENOMEM;
1049 goto err_free;
1050 }
1051
1052 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1053
1054 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
1055 MLX5_SET(mkc, mkc, length64, 1);
1056 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
1057 pd);
1058 MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
1059
1060 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1061 if (err)
1062 goto err_in;
1063
1064 kfree(in);
1065 mr->mmkey.type = MLX5_MKEY_MR;
1066 mr->ibmr.lkey = mr->mmkey.key;
1067 mr->ibmr.rkey = mr->mmkey.key;
1068 mr->umem = NULL;
1069
1070 return &mr->ibmr;
1071
1072 err_in:
1073 kfree(in);
1074
1075 err_free:
1076 kfree(mr);
1077
1078 return ERR_PTR(err);
1079 }
1080
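/*
 * Number of translation octwords (two 8-byte MTT entries each) needed to map
 * len bytes starting at addr with the given page shift.
 */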
static int get_octo_len(u64 addr, u64 len, int page_shift)
1082 {
1083 u64 page_size = 1ULL << page_shift;
1084 u64 offset;
1085 int npages;
1086
1087 offset = addr & (page_size - 1);
1088 npages = ALIGN(len + offset, page_size) >> page_shift;
1089 return (npages + 1) / 2;
1090 }
1091
static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
1093 {
1094 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
1095 return MKEY_CACHE_LAST_STD_ENTRY;
1096 return MLX5_MAX_UMR_SHIFT;
1097 }
1098
static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1100 u64 length, int access_flags, u64 iova)
1101 {
1102 mr->ibmr.lkey = mr->mmkey.key;
1103 mr->ibmr.rkey = mr->mmkey.key;
1104 mr->ibmr.length = length;
1105 mr->ibmr.device = &dev->ib_dev;
1106 mr->ibmr.iova = iova;
1107 mr->access_flags = access_flags;
1108 }
1109
static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
1111 u64 iova)
1112 {
1113 /*
1114 * The alignment of iova has already been checked upon entering
1115 * UVERBS_METHOD_REG_DMABUF_MR
1116 */
1117 umem->iova = iova;
1118 return PAGE_SIZE;
1119 }
1120
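/*
 * Register an MR backed by umem, preferring a pre-created mkey from the
 * cache; see the slow-path fallback below when no suitable entry exists.
 */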
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
1122 struct ib_umem *umem, u64 iova,
1123 int access_flags, int access_mode)
1124 {
1125 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1126 struct mlx5r_cache_rb_key rb_key = {};
1127 struct mlx5_cache_ent *ent;
1128 struct mlx5_ib_mr *mr;
1129 unsigned long page_size;
1130
1131 if (umem->is_dmabuf)
1132 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
1133 else
1134 page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
1135 if (WARN_ON(!page_size))
1136 return ERR_PTR(-EINVAL);
1137
1138 rb_key.access_mode = access_mode;
1139 rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
1140 rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
1141 rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
1142 ent = mkey_cache_ent_from_rb_key(dev, rb_key);
1143 /*
1144 * If the MR can't come from the cache then synchronously create an uncached
1145 * one.
1146 */
1147 if (!ent) {
1148 mutex_lock(&dev->slow_path_mutex);
1149 mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
1150 mutex_unlock(&dev->slow_path_mutex);
1151 if (IS_ERR(mr))
1152 return mr;
1153 mr->mmkey.rb_key = rb_key;
1154 mr->mmkey.cacheable = true;
1155 return mr;
1156 }
1157
1158 mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
1159 if (IS_ERR(mr))
1160 return mr;
1161
1162 mr->ibmr.pd = pd;
1163 mr->umem = umem;
1164 mr->page_shift = order_base_2(page_size);
1165 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1166
1167 return mr;
1168 }
1169
1170 static struct ib_mr *
reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
1172 u32 crossed_lkey)
1173 {
1174 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1175 int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
1176 struct mlx5_ib_mr *mr;
1177 void *mkc;
1178 int inlen;
1179 u32 *in;
1180 int err;
1181
1182 if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
1183 return ERR_PTR(-EOPNOTSUPP);
1184
1185 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1186 if (!mr)
1187 return ERR_PTR(-ENOMEM);
1188
1189 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1190 in = kvzalloc(inlen, GFP_KERNEL);
1191 if (!in) {
1192 err = -ENOMEM;
1193 goto err_1;
1194 }
1195
1196 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1197 MLX5_SET(mkc, mkc, crossing_target_vhca_id,
1198 MLX5_CAP_GEN(dev->mdev, vhca_id));
1199 MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
1200 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1201 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1202
1203 /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
1204 set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
1205 MLX5_SET64(mkc, mkc, len, iova + length);
1206
1207 MLX5_SET(mkc, mkc, free, 0);
1208 MLX5_SET(mkc, mkc, umr_en, 0);
1209 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1210 if (err)
1211 goto err_2;
1212
1213 mr->mmkey.type = MLX5_MKEY_MR;
1214 set_mr_fields(dev, mr, length, access_flags, iova);
1215 mr->ibmr.pd = pd;
1216 kvfree(in);
1217 mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);
1218
1219 return &mr->ibmr;
1220 err_2:
1221 kvfree(in);
1222 err_1:
1223 kfree(mr);
1224 return ERR_PTR(err);
1225 }
1226
/*
 * Create an mkey for the umem directly with a FW command (no mkey cache).
 * When populate is set the page list is written as part of the create
 * command; otherwise the mkey is created free/disabled and is expected to be
 * programmed later via UMR.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1232 u64 iova, int access_flags,
1233 unsigned long page_size, bool populate,
1234 int access_mode)
1235 {
1236 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1237 struct mlx5_ib_mr *mr;
1238 __be64 *pas;
1239 void *mkc;
1240 int inlen;
1241 u32 *in;
1242 int err;
1243 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
1244 (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
1245 bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1246
1247 if (!page_size)
1248 return ERR_PTR(-EINVAL);
1249 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1250 if (!mr)
1251 return ERR_PTR(-ENOMEM);
1252
1253 mr->ibmr.pd = pd;
1254 mr->access_flags = access_flags;
1255 mr->page_shift = order_base_2(page_size);
1256
1257 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1258 if (populate)
1259 inlen += sizeof(*pas) *
1260 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1261 in = kvzalloc(inlen, GFP_KERNEL);
1262 if (!in) {
1263 err = -ENOMEM;
1264 goto err_1;
1265 }
1266 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1267 if (populate) {
1268 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
1269 err = -EINVAL;
1270 goto err_2;
1271 }
1272 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1273 pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1274 }
1275
1276 /* The pg_access bit allows setting the access flags
1277 * in the page list submitted with the command.
1278 */
1279 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1280
1281 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1282 set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1283 populate ? pd : dev->umrc.pd);
        /* In the data direct flow, overwrite the pdn field with the internal kernel PD */
1285 if (umem->is_dmabuf && ksm_mode)
1286 MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
1287
1288 MLX5_SET(mkc, mkc, free, !populate);
1289 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
1290 MLX5_SET(mkc, mkc, umr_en, 1);
1291
1292 MLX5_SET64(mkc, mkc, len, umem->length);
1293 MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1294 if (ksm_mode)
1295 MLX5_SET(mkc, mkc, translations_octword_size,
1296 get_octo_len(iova, umem->length, mr->page_shift) * 2);
1297 else
1298 MLX5_SET(mkc, mkc, translations_octword_size,
1299 get_octo_len(iova, umem->length, mr->page_shift));
1300 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1301 if (mlx5_umem_needs_ats(dev, umem, access_flags))
1302 MLX5_SET(mkc, mkc, ma_translation_mode, 1);
1303 if (populate) {
1304 MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1305 get_octo_len(iova, umem->length, mr->page_shift));
1306 }
1307
1308 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1309 if (err) {
1310 mlx5_ib_warn(dev, "create mkey failed\n");
1311 goto err_2;
1312 }
1313 mr->mmkey.type = MLX5_MKEY_MR;
1314 mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
1315 mr->umem = umem;
1316 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1317 kvfree(in);
1318
1319 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1320
1321 return mr;
1322
1323 err_2:
1324 kvfree(in);
1325 err_1:
1326 kfree(mr);
1327 return ERR_PTR(err);
1328 }
1329
static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1331 u64 length, int acc, int mode)
1332 {
1333 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1334 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1335 struct mlx5_ib_mr *mr;
1336 void *mkc;
1337 u32 *in;
1338 int err;
1339
1340 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1341 if (!mr)
1342 return ERR_PTR(-ENOMEM);
1343
1344 in = kzalloc(inlen, GFP_KERNEL);
1345 if (!in) {
1346 err = -ENOMEM;
1347 goto err_free;
1348 }
1349
1350 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1351
1352 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1353 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1354 MLX5_SET64(mkc, mkc, len, length);
1355 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1356
1357 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1358 if (err)
1359 goto err_in;
1360
1361 kfree(in);
1362
1363 set_mr_fields(dev, mr, length, acc, start_addr);
1364
1365 return &mr->ibmr;
1366
1367 err_in:
1368 kfree(in);
1369
1370 err_free:
1371 kfree(mr);
1372
1373 return ERR_PTR(err);
1374 }
1375
int mlx5_ib_advise_mr(struct ib_pd *pd,
1377 enum ib_uverbs_advise_mr_advice advice,
1378 u32 flags,
1379 struct ib_sge *sg_list,
1380 u32 num_sge,
1381 struct uverbs_attr_bundle *attrs)
1382 {
1383 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1384 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1385 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1386 return -EOPNOTSUPP;
1387
1388 return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1389 sg_list, num_sge);
1390 }
1391
struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1393 struct ib_dm_mr_attr *attr,
1394 struct uverbs_attr_bundle *attrs)
1395 {
1396 struct mlx5_ib_dm *mdm = to_mdm(dm);
1397 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1398 u64 start_addr = mdm->dev_addr + attr->offset;
1399 int mode;
1400
1401 switch (mdm->type) {
1402 case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1403 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1404 return ERR_PTR(-EINVAL);
1405
1406 mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1407 start_addr -= pci_resource_start(dev->pdev, 0);
1408 break;
1409 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1410 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1411 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
1412 case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
1413 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1414 return ERR_PTR(-EINVAL);
1415
1416 mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1417 break;
1418 default:
1419 return ERR_PTR(-EINVAL);
1420 }
1421
1422 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1423 attr->access_flags, mode);
1424 }
1425
static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1427 u64 iova, int access_flags)
1428 {
1429 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1430 struct mlx5_ib_mr *mr = NULL;
1431 bool xlt_with_umr;
1432 int err;
1433
1434 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
1435 if (xlt_with_umr) {
1436 mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
1437 MLX5_MKC_ACCESS_MODE_MTT);
1438 } else {
1439 unsigned long page_size =
1440 mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
1441
1442 mutex_lock(&dev->slow_path_mutex);
1443 mr = reg_create(pd, umem, iova, access_flags, page_size,
1444 true, MLX5_MKC_ACCESS_MODE_MTT);
1445 mutex_unlock(&dev->slow_path_mutex);
1446 }
1447 if (IS_ERR(mr)) {
1448 ib_umem_release(umem);
1449 return ERR_CAST(mr);
1450 }
1451
1452 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1453
1454 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1455
1456 if (xlt_with_umr) {
1457 /*
1458 * If the MR was created with reg_create then it will be
1459 * configured properly but left disabled. It is safe to go ahead
1460 * and configure it again via UMR while enabling it.
1461 */
1462 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1463 if (err) {
1464 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1465 return ERR_PTR(err);
1466 }
1467 }
1468 return &mr->ibmr;
1469 }
1470
static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1472 u64 iova, int access_flags,
1473 struct ib_udata *udata)
1474 {
1475 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1476 struct ib_umem_odp *odp;
1477 struct mlx5_ib_mr *mr;
1478 int err;
1479
1480 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1481 return ERR_PTR(-EOPNOTSUPP);
1482
1483 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1484 if (err)
1485 return ERR_PTR(err);
1486 if (!start && length == U64_MAX) {
1487 if (iova != 0)
1488 return ERR_PTR(-EINVAL);
1489 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1490 return ERR_PTR(-EINVAL);
1491
1492 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1493 if (IS_ERR(mr))
1494 return ERR_CAST(mr);
1495 return &mr->ibmr;
1496 }
1497
1498 /* ODP requires xlt update via umr to work. */
1499 if (!mlx5r_umr_can_load_pas(dev, length))
1500 return ERR_PTR(-EINVAL);
1501
1502 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1503 &mlx5_mn_ops);
1504 if (IS_ERR(odp))
1505 return ERR_CAST(odp);
1506
1507 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
1508 MLX5_MKC_ACCESS_MODE_MTT);
1509 if (IS_ERR(mr)) {
1510 ib_umem_release(&odp->umem);
1511 return ERR_CAST(mr);
1512 }
1513 xa_init(&mr->implicit_children);
1514
1515 odp->private = mr;
1516 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1517 if (err)
1518 goto err_dereg_mr;
1519
1520 err = mlx5_ib_init_odp_mr(mr);
1521 if (err)
1522 goto err_dereg_mr;
1523 return &mr->ibmr;
1524
1525 err_dereg_mr:
1526 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1527 return ERR_PTR(err);
1528 }
1529
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1531 u64 iova, int access_flags,
1532 struct ib_udata *udata)
1533 {
1534 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1535 struct ib_umem *umem;
1536 int err;
1537
1538 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1539 return ERR_PTR(-EOPNOTSUPP);
1540
1541 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1542 start, iova, length, access_flags);
1543
1544 err = mlx5r_umr_resource_init(dev);
1545 if (err)
1546 return ERR_PTR(err);
1547
1548 if (access_flags & IB_ACCESS_ON_DEMAND)
1549 return create_user_odp_mr(pd, start, length, iova, access_flags,
1550 udata);
1551 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1552 if (IS_ERR(umem))
1553 return ERR_CAST(umem);
1554 return create_real_mr(pd, umem, iova, access_flags);
1555 }
1556
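/*
 * dma-buf move_notify callback: zap the MR's translation entries via UMR and
 * unmap the dma-buf pages while the reservation lock is held.
 */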
static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1558 {
1559 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1560 struct mlx5_ib_mr *mr = umem_dmabuf->private;
1561
1562 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1563
1564 if (!umem_dmabuf->sgt || !mr)
1565 return;
1566
1567 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1568 ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1569 }
1570
1571 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1572 .allow_peer2peer = 1,
1573 .move_notify = mlx5_ib_dmabuf_invalidate_cb,
1574 };
1575
1576 static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
1578 u64 offset, u64 length, u64 virt_addr,
1579 int fd, int access_flags, int access_mode)
1580 {
1581 bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1582 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1583 struct mlx5_ib_mr *mr = NULL;
1584 struct ib_umem_dmabuf *umem_dmabuf;
1585 int err;
1586
1587 err = mlx5r_umr_resource_init(dev);
1588 if (err)
1589 return ERR_PTR(err);
1590
1591 if (!pinned_mode)
1592 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
1593 offset, length, fd,
1594 access_flags,
1595 &mlx5_ib_dmabuf_attach_ops);
1596 else
1597 umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
1598 dma_device, offset, length,
1599 fd, access_flags);
1600
1601 if (IS_ERR(umem_dmabuf)) {
1602 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1603 PTR_ERR(umem_dmabuf));
1604 return ERR_CAST(umem_dmabuf);
1605 }
1606
1607 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1608 access_flags, access_mode);
1609 if (IS_ERR(mr)) {
1610 ib_umem_release(&umem_dmabuf->umem);
1611 return ERR_CAST(mr);
1612 }
1613
1614 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1615
1616 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1617 umem_dmabuf->private = mr;
1618 if (!pinned_mode) {
1619 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1620 if (err)
1621 goto err_dereg_mr;
1622 } else {
1623 mr->data_direct = true;
1624 }
1625
1626 err = mlx5_ib_init_dmabuf_mr(mr);
1627 if (err)
1628 goto err_dereg_mr;
1629 return &mr->ibmr;
1630
1631 err_dereg_mr:
1632 __mlx5_ib_dereg_mr(&mr->ibmr);
1633 return ERR_PTR(err);
1634 }
1635
1636 static struct ib_mr *
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
1638 u64 length, u64 virt_addr,
1639 int fd, int access_flags)
1640 {
1641 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1642 struct mlx5_data_direct_dev *data_direct_dev;
1643 struct ib_mr *crossing_mr;
1644 struct ib_mr *crossed_mr;
1645 int ret = 0;
1646
        /* Per HW behaviour, the IOVA must be page aligned in KSM mode */
1648 if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
1649 return ERR_PTR(-EOPNOTSUPP);
1650
1651 mutex_lock(&dev->data_direct_lock);
1652 data_direct_dev = dev->data_direct_dev;
1653 if (!data_direct_dev) {
1654 ret = -EINVAL;
1655 goto end;
1656 }
1657
1658 /* The device's 'data direct mkey' was created without RO flags to
1659 * simplify things and allow for a single mkey per device.
1660 * Since RO is not a must, mask it out accordingly.
1661 */
1662 access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
1663 crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
1664 offset, length, virt_addr, fd,
1665 access_flags, MLX5_MKC_ACCESS_MODE_KSM);
1666 if (IS_ERR(crossed_mr)) {
1667 ret = PTR_ERR(crossed_mr);
1668 goto end;
1669 }
1670
1671 mutex_lock(&dev->slow_path_mutex);
1672 crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
1673 crossed_mr->lkey);
1674 mutex_unlock(&dev->slow_path_mutex);
1675 if (IS_ERR(crossing_mr)) {
1676 __mlx5_ib_dereg_mr(crossed_mr);
1677 ret = PTR_ERR(crossing_mr);
1678 goto end;
1679 }
1680
1681 list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
1682 to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
1683 to_mmr(crossing_mr)->data_direct = true;
1684 end:
1685 mutex_unlock(&dev->data_direct_lock);
1686 return ret ? ERR_PTR(ret) : crossing_mr;
1687 }
1688
struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1690 u64 length, u64 virt_addr,
1691 int fd, int access_flags,
1692 struct uverbs_attr_bundle *attrs)
1693 {
1694 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1695 int mlx5_access_flags = 0;
1696 int err;
1697
1698 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1699 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1700 return ERR_PTR(-EOPNOTSUPP);
1701
1702 if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
1703 err = uverbs_get_flags32(&mlx5_access_flags, attrs,
1704 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
1705 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
1706 if (err)
1707 return ERR_PTR(err);
1708 }
1709
1710 mlx5_ib_dbg(dev,
1711 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
1712 offset, virt_addr, length, fd, access_flags, mlx5_access_flags);
1713
1714 /* dmabuf requires xlt update via umr to work. */
1715 if (!mlx5r_umr_can_load_pas(dev, length))
1716 return ERR_PTR(-EINVAL);
1717
1718 if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
1719 return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
1720 fd, access_flags);
1721
1722 return reg_user_mr_dmabuf(pd, pd->device->dma_device,
1723 offset, length, virt_addr,
1724 fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
1725 }
1726
/*
 * True if the change in access flags can be done via UMR; only some access
 * flags can be updated this way.
 */
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1732 unsigned int current_access_flags,
1733 unsigned int target_access_flags)
1734 {
1735 unsigned int diffs = current_access_flags ^ target_access_flags;
1736
1737 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1738 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
1739 IB_ACCESS_REMOTE_ATOMIC))
1740 return false;
1741 return mlx5r_umr_can_reconfig(dev, current_access_flags,
1742 target_access_flags);
1743 }
1744
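/*
 * A UMR rereg of the translation can only reuse the existing mkey when the
 * cached mkey's translation table is large enough for the new umem.
 */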
static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1746 struct ib_umem *new_umem,
1747 int new_access_flags, u64 iova,
1748 unsigned long *page_size)
1749 {
1750 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1751
1752 /* We only track the allocated sizes of MRs from the cache */
1753 if (!mr->mmkey.cache_ent)
1754 return false;
1755 if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
1756 return false;
1757
1758 *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
1759 if (WARN_ON(!*page_size))
1760 return false;
1761 return (mr->mmkey.cache_ent->rb_key.ndescs) >=
1762 ib_umem_num_dma_blocks(new_umem, *page_size);
1763 }
1764
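/*
 * Replace the PAS list (and optionally the PD and access flags) of an MR
 * via UMR: revoke the MR, rewrite its translation for the new umem, and
 * release the old umem on success.
 */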
1765 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1766 int access_flags, int flags, struct ib_umem *new_umem,
1767 u64 iova, unsigned long page_size)
1768 {
1769 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1770 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1771 struct ib_umem *old_umem = mr->umem;
1772 int err;
1773
1774 /*
1775 * To keep everything simple the MR is revoked before we start to mess
1776 * with it. This ensures the change is atomic relative to any use of the
1777 * MR.
1778 */
1779 err = mlx5r_umr_revoke_mr(mr);
1780 if (err)
1781 return err;
1782
1783 if (flags & IB_MR_REREG_PD) {
1784 mr->ibmr.pd = pd;
1785 upd_flags |= MLX5_IB_UPD_XLT_PD;
1786 }
1787 if (flags & IB_MR_REREG_ACCESS) {
1788 mr->access_flags = access_flags;
1789 upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1790 }
1791
1792 mr->ibmr.iova = iova;
1793 mr->ibmr.length = new_umem->length;
1794 mr->page_shift = order_base_2(page_size);
1795 mr->umem = new_umem;
1796 err = mlx5r_umr_update_mr_pas(mr, upd_flags);
1797 if (err) {
1798 /*
1799 * The MR is revoked at this point, so it is safe to free
1800 * new_umem.
1801 */
1802 mr->umem = old_umem;
1803 return err;
1804 }
1805
1806 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1807 ib_umem_release(old_umem);
1808 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1809 return 0;
1810 }
1811
1812 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1813 u64 length, u64 iova, int new_access_flags,
1814 struct ib_pd *new_pd,
1815 struct ib_udata *udata)
1816 {
1817 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1818 struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1819 int err;
1820
1821 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
1822 return ERR_PTR(-EOPNOTSUPP);
1823
1824 mlx5_ib_dbg(
1825 dev,
1826 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1827 start, iova, length, new_access_flags);
1828
1829 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1830 return ERR_PTR(-EOPNOTSUPP);
1831
1832 if (!(flags & IB_MR_REREG_ACCESS))
1833 new_access_flags = mr->access_flags;
1834 if (!(flags & IB_MR_REREG_PD))
1835 new_pd = ib_mr->pd;
1836
1837 if (!(flags & IB_MR_REREG_TRANS)) {
1838 struct ib_umem *umem;
1839
1840 /* Fast path for PD/access change */
1841 if (can_use_umr_rereg_access(dev, mr->access_flags,
1842 new_access_flags)) {
1843 err = mlx5r_umr_rereg_pd_access(mr, new_pd,
1844 new_access_flags);
1845 if (err)
1846 return ERR_PTR(err);
1847 return NULL;
1848 }
1849 /* DM, ODP or dmabuf MRs don't have a normal umem, so we can't re-use it */
1850 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1851 goto recreate;
1852
1853 /*
1854 * Only one active MR can refer to a umem at a time, so revoke
1855 * the old MR before assigning the umem to the new one.
1856 */
1857 err = mlx5r_umr_revoke_mr(mr);
1858 if (err)
1859 return ERR_PTR(err);
1860 umem = mr->umem;
1861 mr->umem = NULL;
1862 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1863
1864 return create_real_mr(new_pd, umem, mr->ibmr.iova,
1865 new_access_flags);
1866 }
1867
1868 /*
1869 * DM doesn't have a PAS list, so we can't re-use it. ODP/dmabuf MRs do,
1870 * but the logic around releasing the umem is different.
1871 */
1872 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1873 goto recreate;
1874
1875 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1876 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1877 struct ib_umem *new_umem;
1878 unsigned long page_size;
1879
1880 new_umem = ib_umem_get(&dev->ib_dev, start, length,
1881 new_access_flags);
1882 if (IS_ERR(new_umem))
1883 return ERR_CAST(new_umem);
1884
1885 /* Fast path for PAS change */
1886 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1887 &page_size)) {
1888 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1889 new_umem, iova, page_size);
1890 if (err) {
1891 ib_umem_release(new_umem);
1892 return ERR_PTR(err);
1893 }
1894 return NULL;
1895 }
1896 return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1897 }
1898
1899 /*
1900 * Everything else has no state we can preserve, so just create a new MR
1901 * from scratch.
1902 */
1903 recreate:
1904 return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1905 new_access_flags, udata);
1906 }
1907
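/*
 * Allocate and DMA-map the private descriptor buffer used by fast
 * registration MRs; the buffer is over-allocated so the descriptors can be
 * aligned to MLX5_UMR_ALIGN.
 */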
1908 static int
1909 mlx5_alloc_priv_descs(struct ib_device *device,
1910 struct mlx5_ib_mr *mr,
1911 int ndescs,
1912 int desc_size)
1913 {
1914 struct mlx5_ib_dev *dev = to_mdev(device);
1915 struct device *ddev = &dev->mdev->pdev->dev;
1916 int size = ndescs * desc_size;
1917 int add_size;
1918 int ret;
1919
1920 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1921 if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
1922 int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));
1923
1924 add_size = min_t(int, end - size, add_size);
1925 }
1926
1927 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1928 if (!mr->descs_alloc)
1929 return -ENOMEM;
1930
1931 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1932
1933 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1934 if (dma_mapping_error(ddev, mr->desc_map)) {
1935 ret = -ENOMEM;
1936 goto err;
1937 }
1938
1939 return 0;
1940 err:
1941 kfree(mr->descs_alloc);
1942
1943 return ret;
1944 }
1945
1946 static void
1947 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1948 {
1949 if (!mr->umem && !mr->data_direct &&
1950 mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
1951 struct ib_device *device = mr->ibmr.device;
1952 int size = mr->max_descs * mr->desc_size;
1953 struct mlx5_ib_dev *dev = to_mdev(device);
1954
1955 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1956 DMA_TO_DEVICE);
1957 kfree(mr->descs_alloc);
1958 mr->descs = NULL;
1959 }
1960 }
1961
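/*
 * Find (or create) the cache entry matching this MR's rb_key and push the
 * mkey onto that entry's queue so it can be reused later.
 */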
1962 static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
1963 struct mlx5_ib_mr *mr)
1964 {
1965 struct mlx5_mkey_cache *cache = &dev->cache;
1966 struct mlx5_cache_ent *ent;
1967 int ret;
1968
1969 if (mr->mmkey.cache_ent) {
1970 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1971 goto end;
1972 }
1973
1974 mutex_lock(&cache->rb_lock);
1975 ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
1976 if (ent) {
1977 if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
1978 if (ent->disabled) {
1979 mutex_unlock(&cache->rb_lock);
1980 return -EOPNOTSUPP;
1981 }
1982 mr->mmkey.cache_ent = ent;
1983 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1984 mutex_unlock(&cache->rb_lock);
1985 goto end;
1986 }
1987 }
1988
1989 ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
1990 mutex_unlock(&cache->rb_lock);
1991 if (IS_ERR(ent))
1992 return PTR_ERR(ent);
1993
1994 mr->mmkey.cache_ent = ent;
1995 spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1996
1997 end:
1998 ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
1999 spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
2000 return ret;
2001 }
2002
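/*
 * Revoke a data-direct MR and its dmabuf umem; the caller must hold
 * data_direct_lock.
 */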
2003 static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
2004 {
2005 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
2006 struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
2007 int err;
2008
2009 lockdep_assert_held(&dev->data_direct_lock);
2010 mr->revoked = true;
2011 err = mlx5r_umr_revoke_mr(mr);
2012 if (WARN_ON(err))
2013 return err;
2014
2015 ib_umem_dmabuf_revoke(umem_dmabuf);
2016 return 0;
2017 }
2018
2019 void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
2020 {
2021 struct mlx5_ib_mr *mr, *next;
2022
2023 lockdep_assert_held(&dev->data_direct_lock);
2024
2025 list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
2026 list_del(&mr->dd_node);
2027 mlx5_ib_revoke_data_direct_mr(mr);
2028 }
2029 }
2030
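/*
 * Revoke the MR via UMR while holding the locks required for ODP and
 * non-pinned dmabuf MRs, clearing the umem's private pointer on success.
 */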
2031 static int mlx5_umr_revoke_mr_with_lock(struct mlx5_ib_mr *mr)
2032 {
2033 bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
2034 !to_ib_umem_dmabuf(mr->umem)->pinned;
2035 bool is_odp = is_odp_mr(mr);
2036 int ret;
2037
2038 if (is_odp)
2039 mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2040
2041 if (is_odp_dma_buf)
2042 dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
2043 NULL);
2044
2045 ret = mlx5r_umr_revoke_mr(mr);
2046
2047 if (is_odp) {
2048 if (!ret)
2049 to_ib_umem_odp(mr->umem)->private = NULL;
2050 mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2051 }
2052
2053 if (is_odp_dma_buf) {
2054 if (!ret)
2055 to_ib_umem_dmabuf(mr->umem)->private = NULL;
2056 dma_resv_unlock(
2057 to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
2058 }
2059
2060 return ret;
2061 }
2062
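/*
 * Tear down the mkey of an MR being deregistered: cacheable mkeys are
 * revoked and returned to the mkey cache (scheduling cleanup of temporary
 * entries), while all others are destroyed with the locking required for
 * ODP and non-pinned dmabuf MRs.
 */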
2063 static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
2064 {
2065 bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
2066 !to_ib_umem_dmabuf(mr->umem)->pinned;
2067 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
2068 struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
2069 bool is_odp = is_odp_mr(mr);
2070 bool from_cache = !!ent;
2071 int ret;
2072
2073 if (mr->mmkey.cacheable && !mlx5_umr_revoke_mr_with_lock(mr) &&
2074 !cache_ent_find_and_store(dev, mr)) {
2075 ent = mr->mmkey.cache_ent;
2076 /* Upon storing into a clean temporary entry, schedule its cleanup. */
2077 spin_lock_irq(&ent->mkeys_queue.lock);
2078 if (from_cache)
2079 ent->in_use--;
2080 if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
2081 mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
2082 msecs_to_jiffies(30 * 1000));
2083 ent->tmp_cleanup_scheduled = true;
2084 }
2085 spin_unlock_irq(&ent->mkeys_queue.lock);
2086 return 0;
2087 }
2088
2089 if (ent) {
2090 spin_lock_irq(&ent->mkeys_queue.lock);
2091 ent->in_use--;
2092 mr->mmkey.cache_ent = NULL;
2093 spin_unlock_irq(&ent->mkeys_queue.lock);
2094 }
2095
2096 if (is_odp)
2097 mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2098
2099 if (is_odp_dma_buf)
2100 dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv,
2101 NULL);
2102 ret = destroy_mkey(dev, mr);
2103 if (is_odp) {
2104 if (!ret)
2105 to_ib_umem_odp(mr->umem)->private = NULL;
2106 mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2107 }
2108
2109 if (is_odp_dma_buf) {
2110 if (!ret)
2111 to_ib_umem_dmabuf(mr->umem)->private = NULL;
2112 dma_resv_unlock(
2113 to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
2114 }
2115 return ret;
2116 }
2117
2118 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
2119 {
2120 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2121 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
2122 int rc;
2123
2124 /*
2125 * Any async use of the MR must hold the refcount. Once the refcount
2126 * goes to zero, no other thread (such as ODP page faults, prefetch or
2127 * UMR activity) can touch the mkey, so it is safe to destroy it.
2128 */
2129 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2130 refcount_read(&mr->mmkey.usecount) != 0 &&
2131 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
2132 mlx5r_deref_wait_odp_mkey(&mr->mmkey);
2133
2134 if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
2135 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2136 mr->sig, NULL, GFP_KERNEL);
2137
2138 if (mr->mtt_mr) {
2139 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2140 if (rc)
2141 return rc;
2142 mr->mtt_mr = NULL;
2143 }
2144 if (mr->klm_mr) {
2145 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2146 if (rc)
2147 return rc;
2148 mr->klm_mr = NULL;
2149 }
2150
2151 if (mlx5_core_destroy_psv(dev->mdev,
2152 mr->sig->psv_memory.psv_idx))
2153 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2154 mr->sig->psv_memory.psv_idx);
2155 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2156 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2157 mr->sig->psv_wire.psv_idx);
2158 kfree(mr->sig);
2159 mr->sig = NULL;
2160 }
2161
2162 /* Stop DMA */
2163 rc = mlx5r_handle_mkey_cleanup(mr);
2164 if (rc)
2165 return rc;
2166
2167 if (mr->umem) {
2168 bool is_odp = is_odp_mr(mr);
2169
2170 if (!is_odp)
2171 atomic_sub(ib_umem_num_pages(mr->umem),
2172 &dev->mdev->priv.reg_pages);
2173 ib_umem_release(mr->umem);
2174 if (is_odp)
2175 mlx5_ib_free_odp_mr(mr);
2176 }
2177
2178 if (!mr->mmkey.cache_ent)
2179 mlx5_free_priv_descs(mr);
2180
2181 kfree(mr);
2182 return 0;
2183 }
2184
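/*
 * Deregister a crossing (data-direct) MR and then its underlying crossed
 * MR under data_direct_lock.
 */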
2185 static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
2186 struct mlx5_ib_mr *mr)
2187 {
2188 struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
2189 int ret;
2190
2191 ret = __mlx5_ib_dereg_mr(&mr->ibmr);
2192 if (ret)
2193 return ret;
2194
2195 mutex_lock(&dev->data_direct_lock);
2196 if (!dd_crossed_mr->revoked)
2197 list_del(&dd_crossed_mr->dd_node);
2198
2199 ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
2200 mutex_unlock(&dev->data_direct_lock);
2201 return ret;
2202 }
2203
2204 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
2205 {
2206 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2207 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
2208
2209 if (mr->data_direct)
2210 return dereg_crossing_data_direct_mr(dev, mr);
2211
2212 return __mlx5_ib_dereg_mr(ibmr);
2213 }
2214
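/*
 * Initialize the mkey context for a UMR-enabled, initially free mkey used
 * by fast registration MRs.
 */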
2215 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
2216 int access_mode, int page_shift)
2217 {
2218 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2219 void *mkc;
2220
2221 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2222
2223 /* This is only used from the kernel, so setting the PD is OK. */
2224 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
2225 MLX5_SET(mkc, mkc, free, 1);
2226 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2227 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
2228 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
2229 MLX5_SET(mkc, mkc, umr_en, 1);
2230 MLX5_SET(mkc, mkc, log_page_size, page_shift);
2231 if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
2232 access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2233 MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
2234 }
2235
2236 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2237 int ndescs, int desc_size, int page_shift,
2238 int access_mode, u32 *in, int inlen)
2239 {
2240 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2241 int err;
2242
2243 mr->access_mode = access_mode;
2244 mr->desc_size = desc_size;
2245 mr->max_descs = ndescs;
2246
2247 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
2248 if (err)
2249 return err;
2250
2251 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
2252
2253 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
2254 if (err)
2255 goto err_free_descs;
2256
2257 mr->mmkey.type = MLX5_MKEY_MR;
2258 mr->ibmr.lkey = mr->mmkey.key;
2259 mr->ibmr.rkey = mr->mmkey.key;
2260
2261 return 0;
2262
2263 err_free_descs:
2264 mlx5_free_priv_descs(mr);
2265 return err;
2266 }
2267
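/*
 * Allocate an internal MR used by an integrity (PI) MR to map the data and
 * metadata scatterlists with either MTT or KLM descriptors.
 */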
2268 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
2269 u32 max_num_sg, u32 max_num_meta_sg,
2270 int desc_size, int access_mode)
2271 {
2272 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2273 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2274 int page_shift = 0;
2275 struct mlx5_ib_mr *mr;
2276 u32 *in;
2277 int err;
2278
2279 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2280 if (!mr)
2281 return ERR_PTR(-ENOMEM);
2282
2283 mr->ibmr.pd = pd;
2284 mr->ibmr.device = pd->device;
2285
2286 in = kzalloc(inlen, GFP_KERNEL);
2287 if (!in) {
2288 err = -ENOMEM;
2289 goto err_free;
2290 }
2291
2292 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2293 page_shift = PAGE_SHIFT;
2294
2295 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2296 access_mode, in, inlen);
2297 if (err)
2298 goto err_free_in;
2299
2300 mr->umem = NULL;
2301 kfree(in);
2302
2303 return mr;
2304
2305 err_free_in:
2306 kfree(in);
2307 err_free:
2308 kfree(mr);
2309 return ERR_PTR(err);
2310 }
2311
2312 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2313 int ndescs, u32 *in, int inlen)
2314 {
2315 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2316 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2317 inlen);
2318 }
2319
2320 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2321 int ndescs, u32 *in, int inlen)
2322 {
2323 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2324 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2325 }
2326
2327 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2328 int max_num_sg, int max_num_meta_sg,
2329 u32 *in, int inlen)
2330 {
2331 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2332 u32 psv_index[2];
2333 void *mkc;
2334 int err;
2335
2336 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2337 if (!mr->sig)
2338 return -ENOMEM;
2339
2340 /* create mem & wire PSVs */
2341 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2342 if (err)
2343 goto err_free_sig;
2344
2345 mr->sig->psv_memory.psv_idx = psv_index[0];
2346 mr->sig->psv_wire.psv_idx = psv_index[1];
2347
2348 mr->sig->sig_status_checked = true;
2349 mr->sig->sig_err_exists = false;
2350 /* Arm SIGERR on the next UMR */
2351 ++mr->sig->sigerr_count;
2352 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2353 sizeof(struct mlx5_klm),
2354 MLX5_MKC_ACCESS_MODE_KLMS);
2355 if (IS_ERR(mr->klm_mr)) {
2356 err = PTR_ERR(mr->klm_mr);
2357 goto err_destroy_psv;
2358 }
2359 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2360 sizeof(struct mlx5_mtt),
2361 MLX5_MKC_ACCESS_MODE_MTT);
2362 if (IS_ERR(mr->mtt_mr)) {
2363 err = PTR_ERR(mr->mtt_mr);
2364 goto err_free_klm_mr;
2365 }
2366
2367 /* Set bsf descriptors for mkey */
2368 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2369 MLX5_SET(mkc, mkc, bsf_en, 1);
2370 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2371
2372 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2373 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2374 if (err)
2375 goto err_free_mtt_mr;
2376
2377 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2378 mr->sig, GFP_KERNEL));
2379 if (err)
2380 goto err_free_descs;
2381 return 0;
2382
2383 err_free_descs:
2384 destroy_mkey(dev, mr);
2385 mlx5_free_priv_descs(mr);
2386 err_free_mtt_mr:
2387 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2388 mr->mtt_mr = NULL;
2389 err_free_klm_mr:
2390 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2391 mr->klm_mr = NULL;
2392 err_destroy_psv:
2393 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2394 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2395 mr->sig->psv_memory.psv_idx);
2396 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2397 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2398 mr->sig->psv_wire.psv_idx);
2399 err_free_sig:
2400 kfree(mr->sig);
2401
2402 return err;
2403 }
2404
2405 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2406 enum ib_mr_type mr_type, u32 max_num_sg,
2407 u32 max_num_meta_sg)
2408 {
2409 struct mlx5_ib_dev *dev = to_mdev(pd->device);
2410 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2411 int ndescs = ALIGN(max_num_sg, 4);
2412 struct mlx5_ib_mr *mr;
2413 u32 *in;
2414 int err;
2415
2416 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2417 if (!mr)
2418 return ERR_PTR(-ENOMEM);
2419
2420 in = kzalloc(inlen, GFP_KERNEL);
2421 if (!in) {
2422 err = -ENOMEM;
2423 goto err_free;
2424 }
2425
2426 mr->ibmr.device = pd->device;
2427 mr->umem = NULL;
2428
2429 switch (mr_type) {
2430 case IB_MR_TYPE_MEM_REG:
2431 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2432 break;
2433 case IB_MR_TYPE_SG_GAPS:
2434 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2435 break;
2436 case IB_MR_TYPE_INTEGRITY:
2437 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2438 max_num_meta_sg, in, inlen);
2439 break;
2440 default:
2441 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2442 err = -EINVAL;
2443 }
2444
2445 if (err)
2446 goto err_free_in;
2447
2448 kfree(in);
2449
2450 return &mr->ibmr;
2451
2452 err_free_in:
2453 kfree(in);
2454 err_free:
2455 kfree(mr);
2456 return ERR_PTR(err);
2457 }
2458
2459 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2460 u32 max_num_sg)
2461 {
2462 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2463 }
2464
2465 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2466 u32 max_num_sg, u32 max_num_meta_sg)
2467 {
2468 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2469 max_num_meta_sg);
2470 }
2471
2472 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2473 {
2474 struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2475 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2476 struct mlx5_ib_mw *mw = to_mmw(ibmw);
2477 unsigned int ndescs;
2478 u32 *in = NULL;
2479 void *mkc;
2480 int err;
2481 struct mlx5_ib_alloc_mw req = {};
2482 struct {
2483 __u32 comp_mask;
2484 __u32 response_length;
2485 } resp = {};
2486
2487 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2488 if (err)
2489 return err;
2490
2491 if (req.comp_mask || req.reserved1 || req.reserved2)
2492 return -EOPNOTSUPP;
2493
2494 if (udata->inlen > sizeof(req) &&
2495 !ib_is_udata_cleared(udata, sizeof(req),
2496 udata->inlen - sizeof(req)))
2497 return -EOPNOTSUPP;
2498
2499 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2500
2501 in = kzalloc(inlen, GFP_KERNEL);
2502 if (!in)
2503 return -ENOMEM;
2504
2505 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2506
2507 MLX5_SET(mkc, mkc, free, 1);
2508 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2509 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2510 MLX5_SET(mkc, mkc, umr_en, 1);
2511 MLX5_SET(mkc, mkc, lr, 1);
2512 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2513 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2514 MLX5_SET(mkc, mkc, qpn, 0xffffff);
2515
2516 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2517 if (err)
2518 goto free;
2519
2520 mw->mmkey.type = MLX5_MKEY_MW;
2521 ibmw->rkey = mw->mmkey.key;
2522 mw->mmkey.ndescs = ndescs;
2523
2524 resp.response_length =
2525 min(offsetofend(typeof(resp), response_length), udata->outlen);
2526 if (resp.response_length) {
2527 err = ib_copy_to_udata(udata, &resp, resp.response_length);
2528 if (err)
2529 goto free_mkey;
2530 }
2531
2532 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2533 err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2534 if (err)
2535 goto free_mkey;
2536 }
2537
2538 kfree(in);
2539 return 0;
2540
2541 free_mkey:
2542 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2543 free:
2544 kfree(in);
2545 return err;
2546 }
2547
2548 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2549 {
2550 struct mlx5_ib_dev *dev = to_mdev(mw->device);
2551 struct mlx5_ib_mw *mmw = to_mmw(mw);
2552
2553 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2554 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2555 /*
2556 * pagefault_single_data_segment() may be accessing mmw
2557 * if the user bound an ODP MR to this MW.
2558 */
2559 mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2560
2561 return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2562 }
2563
2564 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2565 struct ib_mr_status *mr_status)
2566 {
2567 struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2568 int ret = 0;
2569
2570 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2571 pr_err("Invalid status check mask\n");
2572 ret = -EINVAL;
2573 goto done;
2574 }
2575
2576 mr_status->fail_status = 0;
2577 if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2578 if (!mmr->sig) {
2579 ret = -EINVAL;
2580 pr_err("signature status check requested on a non-signature enabled MR\n");
2581 goto done;
2582 }
2583
2584 mmr->sig->sig_status_checked = true;
2585 if (!mmr->sig->sig_err_exists)
2586 goto done;
2587
2588 if (ibmr->lkey == mmr->sig->err_item.key)
2589 memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2590 sizeof(mr_status->sig_err));
2591 else {
2592 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2593 mr_status->sig_err.sig_err_offset = 0;
2594 mr_status->sig_err.key = mmr->sig->err_item.key;
2595 }
2596
2597 mmr->sig->sig_err_exists = false;
2598 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2599 }
2600
2601 done:
2602 return ret;
2603 }
2604
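/*
 * Try to map single-entry data/metadata sg lists directly (PA mode, no
 * UMR needed); returns the number of entries mapped so the caller can fall
 * back to MTT/KLM mapping when not everything was covered.
 */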
2605 static int
2606 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2607 int data_sg_nents, unsigned int *data_sg_offset,
2608 struct scatterlist *meta_sg, int meta_sg_nents,
2609 unsigned int *meta_sg_offset)
2610 {
2611 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2612 unsigned int sg_offset = 0;
2613 int n = 0;
2614
2615 mr->meta_length = 0;
2616 if (data_sg_nents == 1) {
2617 n++;
2618 mr->mmkey.ndescs = 1;
2619 if (data_sg_offset)
2620 sg_offset = *data_sg_offset;
2621 mr->data_length = sg_dma_len(data_sg) - sg_offset;
2622 mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2623 if (meta_sg_nents == 1) {
2624 n++;
2625 mr->meta_ndescs = 1;
2626 if (meta_sg_offset)
2627 sg_offset = *meta_sg_offset;
2628 else
2629 sg_offset = 0;
2630 mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2631 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2632 }
2633 ibmr->length = mr->data_length + mr->meta_length;
2634 }
2635
2636 return n;
2637 }
2638
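/*
 * Convert the data (and optional metadata) sg lists into KLM descriptors
 * referencing the PD's local_dma_lkey, up to the MR's max_descs limit.
 */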
2639 static int
2640 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2641 struct scatterlist *sgl,
2642 unsigned short sg_nents,
2643 unsigned int *sg_offset_p,
2644 struct scatterlist *meta_sgl,
2645 unsigned short meta_sg_nents,
2646 unsigned int *meta_sg_offset_p)
2647 {
2648 struct scatterlist *sg = sgl;
2649 struct mlx5_klm *klms = mr->descs;
2650 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2651 u32 lkey = mr->ibmr.pd->local_dma_lkey;
2652 int i, j = 0;
2653
2654 mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2655 mr->ibmr.length = 0;
2656
2657 for_each_sg(sgl, sg, sg_nents, i) {
2658 if (unlikely(i >= mr->max_descs))
2659 break;
2660 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2661 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2662 klms[i].key = cpu_to_be32(lkey);
2663 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2664
2665 sg_offset = 0;
2666 }
2667
2668 if (sg_offset_p)
2669 *sg_offset_p = sg_offset;
2670
2671 mr->mmkey.ndescs = i;
2672 mr->data_length = mr->ibmr.length;
2673
2674 if (meta_sg_nents) {
2675 sg = meta_sgl;
2676 sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2677 for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2678 if (unlikely(i + j >= mr->max_descs))
2679 break;
2680 klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2681 sg_offset);
2682 klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2683 sg_offset);
2684 klms[i + j].key = cpu_to_be32(lkey);
2685 mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2686
2687 sg_offset = 0;
2688 }
2689 if (meta_sg_offset_p)
2690 *meta_sg_offset_p = sg_offset;
2691
2692 mr->meta_ndescs = j;
2693 mr->meta_length = mr->ibmr.length - mr->data_length;
2694 }
2695
2696 return i + j;
2697 }
2698
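/*
 * ib_sg_to_pages() callback: append one page address to the MR's
 * descriptor list, failing once max_descs is reached.
 */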
2699 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2700 {
2701 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2702 __be64 *descs;
2703
2704 if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2705 return -ENOMEM;
2706
2707 descs = mr->descs;
2708 descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2709
2710 return 0;
2711 }
2712
2713 static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2714 {
2715 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2716 __be64 *descs;
2717
2718 if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2719 return -ENOMEM;
2720
2721 descs = mr->descs;
2722 descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2723 cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2724
2725 return 0;
2726 }
2727
2728 static int
2729 mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2730 int data_sg_nents, unsigned int *data_sg_offset,
2731 struct scatterlist *meta_sg, int meta_sg_nents,
2732 unsigned int *meta_sg_offset)
2733 {
2734 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2735 struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2736 int n;
2737
2738 pi_mr->mmkey.ndescs = 0;
2739 pi_mr->meta_ndescs = 0;
2740 pi_mr->meta_length = 0;
2741
2742 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2743 pi_mr->desc_size * pi_mr->max_descs,
2744 DMA_TO_DEVICE);
2745
2746 pi_mr->ibmr.page_size = ibmr->page_size;
2747 n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2748 mlx5_set_page);
2749 if (n != data_sg_nents)
2750 return n;
2751
2752 pi_mr->data_iova = pi_mr->ibmr.iova;
2753 pi_mr->data_length = pi_mr->ibmr.length;
2754 pi_mr->ibmr.length = pi_mr->data_length;
2755 ibmr->length = pi_mr->data_length;
2756
2757 if (meta_sg_nents) {
2758 u64 page_mask = ~((u64)ibmr->page_size - 1);
2759 u64 iova = pi_mr->data_iova;
2760
2761 n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2762 meta_sg_offset, mlx5_set_page_pi);
2763
2764 pi_mr->meta_length = pi_mr->ibmr.length;
2765 /*
2766 * The PI address for the HW is the offset of the metadata address
2767 * relative to the first data page address.
2768 * It equals the first data page address + the size of the data pages +
2769 * the metadata offset within the first metadata page.
2770 */
2771 pi_mr->pi_iova = (iova & page_mask) +
2772 pi_mr->mmkey.ndescs * ibmr->page_size +
2773 (pi_mr->ibmr.iova & ~page_mask);
2774 /*
2775 * In order to use one MTT MR for data and metadata, we also
2776 * register the gaps between the end of the data and the start of
2777 * the metadata (the sig MR will verify that the HW accesses the
2778 * right addresses). This mapping is safe because we use an
2779 * internal mkey for the registration.
2780 */
2781 pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2782 pi_mr->ibmr.iova = iova;
2783 ibmr->length += pi_mr->meta_length;
2784 }
2785
2786 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2787 pi_mr->desc_size * pi_mr->max_descs,
2788 DMA_TO_DEVICE);
2789
2790 return n;
2791 }
2792
2793 static int
2794 mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2795 int data_sg_nents, unsigned int *data_sg_offset,
2796 struct scatterlist *meta_sg, int meta_sg_nents,
2797 unsigned int *meta_sg_offset)
2798 {
2799 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2800 struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2801 int n;
2802
2803 pi_mr->mmkey.ndescs = 0;
2804 pi_mr->meta_ndescs = 0;
2805 pi_mr->meta_length = 0;
2806
2807 ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2808 pi_mr->desc_size * pi_mr->max_descs,
2809 DMA_TO_DEVICE);
2810
2811 n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2812 meta_sg, meta_sg_nents, meta_sg_offset);
2813
2814 ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2815 pi_mr->desc_size * pi_mr->max_descs,
2816 DMA_TO_DEVICE);
2817
2818 /* This is a zero-based memory region */
2819 pi_mr->data_iova = 0;
2820 pi_mr->ibmr.iova = 0;
2821 pi_mr->pi_iova = pi_mr->data_length;
2822 ibmr->length = pi_mr->ibmr.length;
2823
2824 return n;
2825 }
2826
2827 int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2828 int data_sg_nents, unsigned int *data_sg_offset,
2829 struct scatterlist *meta_sg, int meta_sg_nents,
2830 unsigned int *meta_sg_offset)
2831 {
2832 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2833 struct mlx5_ib_mr *pi_mr = NULL;
2834 int n;
2835
2836 WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2837
2838 mr->mmkey.ndescs = 0;
2839 mr->data_length = 0;
2840 mr->data_iova = 0;
2841 mr->meta_ndescs = 0;
2842 mr->pi_iova = 0;
2843 /*
2844 * As a performance optimization, avoid a UMR operation to register the
2845 * data/metadata buffers when possible.
2846 * First try to map the sg lists to PA descriptors with local_dma_lkey.
2847 * Fall back to UMR only in case of a failure.
2848 */
2849 n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2850 data_sg_offset, meta_sg, meta_sg_nents,
2851 meta_sg_offset);
2852 if (n == data_sg_nents + meta_sg_nents)
2853 goto out;
2854 /*
2855 * As a performance optimization, avoid mapping the sg lists to KLM
2856 * descriptors when possible. First try to map the sg lists to MTT
2857 * descriptors and fall back to KLM only in case of a failure.
2858 * The HW works more efficiently with MTT descriptors
2859 * (especially under high load).
2860 * Use KLM (indirect access) only if it is mandatory.
2861 */
2862 pi_mr = mr->mtt_mr;
2863 n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2864 data_sg_offset, meta_sg, meta_sg_nents,
2865 meta_sg_offset);
2866 if (n == data_sg_nents + meta_sg_nents)
2867 goto out;
2868
2869 pi_mr = mr->klm_mr;
2870 n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2871 data_sg_offset, meta_sg, meta_sg_nents,
2872 meta_sg_offset);
2873 if (unlikely(n != data_sg_nents + meta_sg_nents))
2874 return -ENOMEM;
2875
2876 out:
2877 /* This is a zero-based memory region */
2878 ibmr->iova = 0;
2879 mr->pi_mr = pi_mr;
2880 if (pi_mr)
2881 ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2882 else
2883 ibmr->sig_attrs->meta_length = mr->meta_length;
2884
2885 return 0;
2886 }
2887
2888 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2889 unsigned int *sg_offset)
2890 {
2891 struct mlx5_ib_mr *mr = to_mmr(ibmr);
2892 int n;
2893
2894 mr->mmkey.ndescs = 0;
2895
2896 ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2897 mr->desc_size * mr->max_descs,
2898 DMA_TO_DEVICE);
2899
2900 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2901 n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2902 NULL);
2903 else
2904 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2905 mlx5_set_page);
2906
2907 ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2908 mr->desc_size * mr->max_descs,
2909 DMA_TO_DEVICE);
2910
2911 return n;
2912 }
2913