1 /*
2 * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33 #include <linux/netdevice.h>
34 #include <net/bonding.h>
35 #include <linux/mlx5/driver.h>
36 #include <linux/mlx5/eswitch.h>
37 #include <linux/mlx5/vport.h>
38 #include "lib/devcom.h"
39 #include "mlx5_core.h"
40 #include "eswitch.h"
41 #include "esw/acl/ofld.h"
42 #include "lag.h"
43 #include "mp.h"
44 #include "mpesw.h"
45
46 enum {
47 MLX5_LAG_EGRESS_PORT_1 = 1,
48 MLX5_LAG_EGRESS_PORT_2,
49 };
50
51 /* General purpose, use for short periods of time.
52 * Beware of lock dependencies (preferably, no locks should be acquired
53 * under it).
54 */
55 static DEFINE_SPINLOCK(lag_lock);
56
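/* Resolve the firmware port selection mode for a LAG instance:
 * hash-based LAG uses the port selection flow table, MPESW has its own
 * dedicated mode, and everything else falls back to queue affinity.
 */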
57 static int get_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
58 {
59 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
60 return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT;
61
62 if (mode == MLX5_LAG_MODE_MPESW)
63 return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW;
64
65 return MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY;
66 }
67
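/* Build a bitmask of the currently active (tx-enabled and link-up) ports,
 * one bit per port, for use in the lag context active_port field.
 */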
68 static u8 lag_active_port_bits(struct mlx5_lag *ldev)
69 {
70 u8 enabled_ports[MLX5_MAX_PORTS] = {};
71 u8 active_port = 0;
72 int num_enabled;
73 int idx;
74
75 mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, enabled_ports,
76 &num_enabled);
77 for (idx = 0; idx < num_enabled; idx++)
78 active_port |= BIT_MASK(enabled_ports[idx]);
79
80 return active_port;
81 }
82
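/* Issue the CREATE_LAG command. For queue affinity the per-port
 * tx_remap_affinity fields are taken from @ports; for hash-based LAG the
 * active_port bitmap is programmed as well when the device can bypass the
 * port selection flow table.
 */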
83 static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, int mode,
84 unsigned long flags)
85 {
86 bool fdb_sel_mode = test_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE,
87 &flags);
88 int port_sel_mode = get_port_sel_mode(mode, flags);
89 u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {};
90 void *lag_ctx;
91
92 lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);
93 MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);
94 MLX5_SET(lagc, lag_ctx, fdb_selection_mode, fdb_sel_mode);
95
96 switch (port_sel_mode) {
97 case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY:
98 MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
99 MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
100 break;
101 case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT:
102 if (!MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass))
103 break;
104
105 MLX5_SET(lagc, lag_ctx, active_port,
106 lag_active_port_bits(mlx5_lag_dev(dev)));
107 break;
108 default:
109 break;
110 }
111 MLX5_SET(lagc, lag_ctx, port_select_mode, port_sel_mode);
112
113 return mlx5_cmd_exec_in(dev, create_lag, in);
114 }
115
116 static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 num_ports,
117 u8 *ports)
118 {
119 u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
120 void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);
121
122 MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
123 MLX5_SET(modify_lag_in, in, field_select, 0x1);
124
125 MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
126 MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
127
128 return mlx5_cmd_exec_in(dev, modify_lag, in);
129 }
130
131 int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
132 {
133 u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {};
134
135 MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);
136
137 return mlx5_cmd_exec_in(dev, create_vport_lag, in);
138 }
139 EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);
140
141 int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
142 {
143 u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {};
144
145 MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);
146
147 return mlx5_cmd_exec_in(dev, destroy_vport_lag, in);
148 }
149 EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);
150
151 static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports,
152 u8 *ports, int *num_disabled)
153 {
154 int i;
155
156 *num_disabled = 0;
157 for (i = 0; i < num_ports; i++) {
158 if (!tracker->netdev_state[i].tx_enabled ||
159 !tracker->netdev_state[i].link_up)
160 ports[(*num_disabled)++] = i;
161 }
162 }
163
164 void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports,
165 u8 *ports, int *num_enabled)
166 {
167 int i;
168
169 *num_enabled = 0;
170 for (i = 0; i < num_ports; i++) {
171 if (tracker->netdev_state[i].tx_enabled &&
172 tracker->netdev_state[i].link_up)
173 ports[(*num_enabled)++] = i;
174 }
175
176 if (*num_enabled == 0)
177 mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled);
178 }
179
180 static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev,
181 struct mlx5_lag *ldev,
182 struct lag_tracker *tracker,
183 unsigned long flags)
184 {
185 char buf[MLX5_MAX_PORTS * 10 + 1] = {};
186 u8 enabled_ports[MLX5_MAX_PORTS] = {};
187 int written = 0;
188 int num_enabled;
189 int idx;
190 int err;
191 int i;
192 int j;
193
194 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
195 mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports,
196 &num_enabled);
197 for (i = 0; i < num_enabled; i++) {
198 err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1);
199 if (err != 3)
200 return;
201 written += err;
202 }
203 buf[written - 2] = 0;
204 mlx5_core_info(dev, "lag map active ports: %s\n", buf);
205 } else {
206 for (i = 0; i < ldev->ports; i++) {
207 for (j = 0; j < ldev->buckets; j++) {
208 idx = i * ldev->buckets + j;
209 err = scnprintf(buf + written, 10,
210 " port %d:%d", i + 1, ldev->v2p_map[idx]);
211 if (err != 9)
212 return;
213 written += err;
214 }
215 }
216 mlx5_core_info(dev, "lag map:%s\n", buf);
217 }
218 }
219
220 static int mlx5_lag_netdev_event(struct notifier_block *this,
221 unsigned long event, void *ptr);
222 static void mlx5_do_bond_work(struct work_struct *work);
223
224 static void mlx5_ldev_free(struct kref *ref)
225 {
226 struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref);
227
228 if (ldev->nb.notifier_call)
229 unregister_netdevice_notifier_net(&init_net, &ldev->nb);
230 mlx5_lag_mp_cleanup(ldev);
231 cancel_delayed_work_sync(&ldev->bond_work);
232 destroy_workqueue(ldev->wq);
233 mutex_destroy(&ldev->lock);
234 kfree(ldev);
235 }
236
237 static void mlx5_ldev_put(struct mlx5_lag *ldev)
238 {
239 kref_put(&ldev->ref, mlx5_ldev_free);
240 }
241
242 static void mlx5_ldev_get(struct mlx5_lag *ldev)
243 {
244 kref_get(&ldev->ref);
245 }
246
247 static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
248 {
249 struct mlx5_lag *ldev;
250 int err;
251
252 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
253 if (!ldev)
254 return NULL;
255
256 ldev->wq = create_singlethread_workqueue("mlx5_lag");
257 if (!ldev->wq) {
258 kfree(ldev);
259 return NULL;
260 }
261
262 kref_init(&ldev->ref);
263 mutex_init(&ldev->lock);
264 INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
265
266 ldev->nb.notifier_call = mlx5_lag_netdev_event;
267 if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
268 ldev->nb.notifier_call = NULL;
269 mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
270 }
271 ldev->mode = MLX5_LAG_MODE_NONE;
272
273 err = mlx5_lag_mp_init(ldev);
274 if (err)
275 mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
276 err);
277
278 ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports);
279 ldev->buckets = 1;
280
281 return ldev;
282 }
283
284 int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
285 struct net_device *ndev)
286 {
287 int i;
288
289 for (i = 0; i < ldev->ports; i++)
290 if (ldev->pf[i].netdev == ndev)
291 return i;
292
293 return -ENOENT;
294 }
295
296 static bool __mlx5_lag_is_roce(struct mlx5_lag *ldev)
297 {
298 return ldev->mode == MLX5_LAG_MODE_ROCE;
299 }
300
301 static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
302 {
303 return ldev->mode == MLX5_LAG_MODE_SRIOV;
304 }
305
306 /* Create a mapping between steering slots and active ports.
307 * As there are ldev->buckets slots per port, first assume the native
308 * mapping should be used.
309 * If some ports are disabled, fill their slots with a mapping that
310 * points to active ports instead.
311 */
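/* Illustrative example (2 ports, 2 buckets per port):
 * native mapping          -> ports[] = { 1, 1, 2, 2 }
 * port 2 down, port 1 up  -> ports[] = { 1, 1, 1, 1 }
 * (each disabled port's buckets are remapped to a randomly chosen
 * active port).
 */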
312 static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
313 u8 num_ports,
314 u8 buckets,
315 u8 *ports)
316 {
317 int disabled[MLX5_MAX_PORTS] = {};
318 int enabled[MLX5_MAX_PORTS] = {};
319 int disabled_ports_num = 0;
320 int enabled_ports_num = 0;
321 int idx;
322 u32 rand;
323 int i;
324 int j;
325
326 for (i = 0; i < num_ports; i++) {
327 if (tracker->netdev_state[i].tx_enabled &&
328 tracker->netdev_state[i].link_up)
329 enabled[enabled_ports_num++] = i;
330 else
331 disabled[disabled_ports_num++] = i;
332 }
333
334 /* Use native mapping by default where each port's buckets
335 * point to the native port: 1 1 1 .. 1 2 2 2 ... 2 3 3 3 ... 3 etc
336 */
337 for (i = 0; i < num_ports; i++)
338 for (j = 0; j < buckets; j++) {
339 idx = i * buckets + j;
340 ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i;
341 }
342
343 /* If all ports are enabled, or all are disabled, keep the native mapping */
344 if (enabled_ports_num == num_ports ||
345 disabled_ports_num == num_ports)
346 return;
347
348 /* Go over the disabled ports and for each assign a random active port */
349 for (i = 0; i < disabled_ports_num; i++) {
350 for (j = 0; j < buckets; j++) {
351 get_random_bytes(&rand, 4);
352 ports[disabled[i] * buckets + j] = enabled[rand % enabled_ports_num] + 1;
353 }
354 }
355 }
356
357 static bool mlx5_lag_has_drop_rule(struct mlx5_lag *ldev)
358 {
359 int i;
360
361 for (i = 0; i < ldev->ports; i++)
362 if (ldev->pf[i].has_drop)
363 return true;
364 return false;
365 }
366
367 static void mlx5_lag_drop_rule_cleanup(struct mlx5_lag *ldev)
368 {
369 int i;
370
371 for (i = 0; i < ldev->ports; i++) {
372 if (!ldev->pf[i].has_drop)
373 continue;
374
375 mlx5_esw_acl_ingress_vport_drop_rule_destroy(ldev->pf[i].dev->priv.eswitch,
376 MLX5_VPORT_UPLINK);
377 ldev->pf[i].has_drop = false;
378 }
379 }
380
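/* Re-evaluate the drop rules used in active-backup mode: clear any existing
 * rules first, then install an ingress drop rule on the uplink vport of
 * every currently inactive port.
 */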
381 static void mlx5_lag_drop_rule_setup(struct mlx5_lag *ldev,
382 struct lag_tracker *tracker)
383 {
384 u8 disabled_ports[MLX5_MAX_PORTS] = {};
385 struct mlx5_core_dev *dev;
386 int disabled_index;
387 int num_disabled;
388 int err;
389 int i;
390
391 /* First delete the current drop rule so there won't be any dropped
392 * packets
393 */
394 mlx5_lag_drop_rule_cleanup(ldev);
395
396 if (!ldev->tracker.has_inactive)
397 return;
398
399 mlx5_infer_tx_disabled(tracker, ldev->ports, disabled_ports, &num_disabled);
400
401 for (i = 0; i < num_disabled; i++) {
402 disabled_index = disabled_ports[i];
403 dev = ldev->pf[disabled_index].dev;
404 err = mlx5_esw_acl_ingress_vport_drop_rule_create(dev->priv.eswitch,
405 MLX5_VPORT_UPLINK);
406 if (!err)
407 ldev->pf[disabled_index].has_drop = true;
408 else
409 mlx5_core_err(dev,
410 "Failed to create lag drop rule, error: %d", err);
411 }
412 }
413
414 static int mlx5_cmd_modify_active_port(struct mlx5_core_dev *dev, u8 ports)
415 {
416 u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
417 void *lag_ctx;
418
419 lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);
420
421 MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
422 MLX5_SET(modify_lag_in, in, field_select, 0x2);
423
424 MLX5_SET(lagc, lag_ctx, active_port, ports);
425
426 return mlx5_cmd_exec_in(dev, modify_lag, in);
427 }
428
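/* Apply a new port mapping to an existing LAG: hash-based LAG is updated
 * through the port selection flow table (plus the active_port field when the
 * bypass capability is present), while queue-affinity LAG is updated with a
 * MODIFY_LAG command.
 */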
429 static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports)
430 {
431 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
432 u8 active_ports;
433 int ret;
434
435 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags)) {
436 ret = mlx5_lag_port_sel_modify(ldev, ports);
437 if (ret ||
438 !MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table_bypass))
439 return ret;
440
441 active_ports = lag_active_port_bits(ldev);
442
443 return mlx5_cmd_modify_active_port(dev0, active_ports);
444 }
445 return mlx5_cmd_modify_lag(dev0, ldev->ports, ports);
446 }
447
448 static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev *dev)
449 {
450 struct net_device *ndev = NULL;
451 struct mlx5_lag *ldev;
452 unsigned long flags;
453 int i;
454
455 spin_lock_irqsave(&lag_lock, flags);
456 ldev = mlx5_lag_dev(dev);
457
458 if (!ldev)
459 goto unlock;
460
461 for (i = 0; i < ldev->ports; i++)
462 if (ldev->tracker.netdev_state[i].tx_enabled)
463 ndev = ldev->pf[i].netdev;
464 if (!ndev)
465 ndev = ldev->pf[ldev->ports - 1].netdev;
466
467 if (ndev)
468 dev_hold(ndev);
469
470 unlock:
471 spin_unlock_irqrestore(&lag_lock, flags);
472
473 return ndev;
474 }
475
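/* Recompute the tx affinity mapping from the current bond state and, if any
 * slot changed, push the whole map to firmware once. In active-backup mode
 * also refresh the drop rules (non-RoCE LAG) and notify listeners about the
 * new lower state netdev.
 */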
476 void mlx5_modify_lag(struct mlx5_lag *ldev,
477 struct lag_tracker *tracker)
478 {
479 u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {};
480 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
481 int idx;
482 int err;
483 int i;
484 int j;
485
486 mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports);
487
488 for (i = 0; i < ldev->ports; i++) {
489 for (j = 0; j < ldev->buckets; j++) {
490 idx = i * ldev->buckets + j;
491 if (ports[idx] == ldev->v2p_map[idx])
492 continue;
493 err = _mlx5_modify_lag(ldev, ports);
494 if (err) {
495 mlx5_core_err(dev0,
496 "Failed to modify LAG (%d)\n",
497 err);
498 return;
499 }
500 memcpy(ldev->v2p_map, ports, sizeof(ports));
501
502 mlx5_lag_print_mapping(dev0, ldev, tracker,
503 ldev->mode_flags);
504 break;
505 }
506 }
507
508 if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
509 struct net_device *ndev = mlx5_lag_active_backup_get_netdev(dev0);
510
511 if (ldev->mode != MLX5_LAG_MODE_ROCE)
512 mlx5_lag_drop_rule_setup(ldev, tracker);
513 /* Only SRIOV and RoCE LAG should have tracker->tx_type set,
514 * so there is no need to check the mode.
515 */
516 blocking_notifier_call_chain(&dev0->priv.lag_nh,
517 MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE,
518 ndev);
519 dev_put(ndev);
520 }
521 }
522
523 static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev,
524 unsigned long *flags)
525 {
526 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
527
528 if (!MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table)) {
529 if (ldev->ports > 2)
530 return -EINVAL;
531 return 0;
532 }
533
534 if (ldev->ports > 2)
535 ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS;
536
537 set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
538
539 return 0;
540 }
541
542 static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
543 struct lag_tracker *tracker,
544 enum mlx5_lag_mode mode,
545 unsigned long *flags)
546 {
547 struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];
548
549 if (mode == MLX5_LAG_MODE_MPESW)
550 return;
551
552 if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) &&
553 tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH) {
554 if (ldev->ports > 2)
555 ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS;
556 set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
557 }
558 }
559
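/* Translate the requested LAG mode and bond state into mode flags:
 * shared FDB and MPESW force native FDB selection, RoCE LAG picks its port
 * selection mode separately, and offloads LAG may switch to hash-based
 * selection when the bond uses a hash tx policy.
 */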
560 static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode,
561 struct lag_tracker *tracker, bool shared_fdb,
562 unsigned long *flags)
563 {
564 bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
565
566 *flags = 0;
567 if (shared_fdb) {
568 set_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, flags);
569 set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);
570 }
571
572 if (mode == MLX5_LAG_MODE_MPESW)
573 set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);
574
575 if (roce_lag)
576 return mlx5_lag_set_port_sel_mode_roce(ldev, flags);
577
578 mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, mode, flags);
579 return 0;
580 }
581
582 char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
583 {
584 int port_sel_mode = get_port_sel_mode(mode, flags);
585
586 switch (port_sel_mode) {
587 case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: return "queue_affinity";
588 case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: return "hash";
589 case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW: return "mpesw";
590 default: return "invalid";
591 }
592 }
593
594 static int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
595 {
596 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
597 struct mlx5_eswitch *master_esw = dev0->priv.eswitch;
598 int err;
599 int i;
600
601 for (i = MLX5_LAG_P1 + 1; i < ldev->ports; i++) {
602 struct mlx5_eswitch *slave_esw = ldev->pf[i].dev->priv.eswitch;
603
604 err = mlx5_eswitch_offloads_single_fdb_add_one(master_esw,
605 slave_esw, ldev->ports);
606 if (err)
607 goto err;
608 }
609 return 0;
610 err:
611 for (; i > MLX5_LAG_P1; i--)
612 mlx5_eswitch_offloads_single_fdb_del_one(master_esw,
613 ldev->pf[i].dev->priv.eswitch);
614 return err;
615 }
616
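/* Create the LAG object in firmware: print the chosen mapping, issue
 * CREATE_LAG and, for shared FDB, chain the slave eswitches to the master
 * FDB. On failure the half-created LAG is destroyed again.
 */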
617 static int mlx5_create_lag(struct mlx5_lag *ldev,
618 struct lag_tracker *tracker,
619 enum mlx5_lag_mode mode,
620 unsigned long flags)
621 {
622 bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
623 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
624 u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
625 int err;
626
627 if (tracker)
628 mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
629 mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n",
630 shared_fdb, mlx5_get_str_port_sel_mode(mode, flags));
631
632 err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags);
633 if (err) {
634 mlx5_core_err(dev0,
635 "Failed to create LAG (%d)\n",
636 err);
637 return err;
638 }
639
640 if (shared_fdb) {
641 err = mlx5_lag_create_single_fdb(ldev);
642 if (err)
643 mlx5_core_err(dev0, "Can't enable single FDB mode\n");
644 else
645 mlx5_core_info(dev0, "Operation mode is single FDB\n");
646 }
647
648 if (err) {
649 MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
650 if (mlx5_cmd_exec_in(dev0, destroy_lag, in))
651 mlx5_core_err(dev0,
652 "Failed to deactivate RoCE LAG; driver restart required\n");
653 }
654 BLOCKING_INIT_NOTIFIER_HEAD(&dev0->priv.lag_nh);
655
656 return err;
657 }
658
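/* Activate LAG in the requested mode: derive the mode flags, build the
 * initial tx affinity mapping (and the port selection flow table for
 * hash-based LAG), create the LAG in firmware and, for active-backup
 * non-RoCE LAG, install drop rules on inactive ports.
 */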
659 int mlx5_activate_lag(struct mlx5_lag *ldev,
660 struct lag_tracker *tracker,
661 enum mlx5_lag_mode mode,
662 bool shared_fdb)
663 {
664 bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
665 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
666 unsigned long flags = 0;
667 int err;
668
669 err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags);
670 if (err)
671 return err;
672
673 if (mode != MLX5_LAG_MODE_MPESW) {
674 mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
675 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
676 err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
677 ldev->v2p_map);
678 if (err) {
679 mlx5_core_err(dev0,
680 "Failed to create LAG port selection(%d)\n",
681 err);
682 return err;
683 }
684 }
685 }
686
687 err = mlx5_create_lag(ldev, tracker, mode, flags);
688 if (err) {
689 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
690 mlx5_lag_port_sel_destroy(ldev);
691 if (roce_lag)
692 mlx5_core_err(dev0,
693 "Failed to activate RoCE LAG\n");
694 else
695 mlx5_core_err(dev0,
696 "Failed to activate VF LAG\n"
697 "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
698 return err;
699 }
700
701 if (tracker && tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
702 !roce_lag)
703 mlx5_lag_drop_rule_setup(ldev, tracker);
704
705 ldev->mode = mode;
706 ldev->mode_flags = flags;
707 return 0;
708 }
709
710 int mlx5_deactivate_lag(struct mlx5_lag *ldev)
711 {
712 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
713 struct mlx5_eswitch *master_esw = dev0->priv.eswitch;
714 u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
715 bool roce_lag = __mlx5_lag_is_roce(ldev);
716 unsigned long flags = ldev->mode_flags;
717 int err;
718 int i;
719
720 ldev->mode = MLX5_LAG_MODE_NONE;
721 ldev->mode_flags = 0;
722 mlx5_lag_mp_reset(ldev);
723
724 if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) {
725 for (i = MLX5_LAG_P1 + 1; i < ldev->ports; i++)
726 mlx5_eswitch_offloads_single_fdb_del_one(master_esw,
727 ldev->pf[i].dev->priv.eswitch);
728 clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
729 }
730
731 MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
732 err = mlx5_cmd_exec_in(dev0, destroy_lag, in);
733 if (err) {
734 if (roce_lag) {
735 mlx5_core_err(dev0,
736 "Failed to deactivate RoCE LAG; driver restart required\n");
737 } else {
738 mlx5_core_err(dev0,
739 "Failed to deactivate VF LAG; driver restart required\n"
740 "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
741 }
742 return err;
743 }
744
745 if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
746 mlx5_lag_port_sel_destroy(ldev);
747 ldev->buckets = 1;
748 }
749 if (mlx5_lag_has_drop_rule(ldev))
750 mlx5_lag_drop_rule_cleanup(ldev);
751
752 return 0;
753 }
754
755 bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
756 {
757 #ifdef CONFIG_MLX5_ESWITCH
758 struct mlx5_core_dev *dev;
759 u8 mode;
760 #endif
761 bool roce_support;
762 int i;
763
764 for (i = 0; i < ldev->ports; i++)
765 if (!ldev->pf[i].dev)
766 return false;
767
768 #ifdef CONFIG_MLX5_ESWITCH
769 for (i = 0; i < ldev->ports; i++) {
770 dev = ldev->pf[i].dev;
771 if (mlx5_eswitch_num_vfs(dev->priv.eswitch) && !is_mdev_switchdev_mode(dev))
772 return false;
773 }
774
775 dev = ldev->pf[MLX5_LAG_P1].dev;
776 mode = mlx5_eswitch_mode(dev);
777 for (i = 0; i < ldev->ports; i++)
778 if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode)
779 return false;
780
781 #else
782 for (i = 0; i < ldev->ports; i++)
783 if (mlx5_sriov_is_enabled(ldev->pf[i].dev))
784 return false;
785 #endif
786 roce_support = mlx5_get_roce_state(ldev->pf[MLX5_LAG_P1].dev);
787 for (i = 1; i < ldev->ports; i++)
788 if (mlx5_get_roce_state(ldev->pf[i].dev) != roce_support)
789 return false;
790
791 return true;
792 }
793
794 void mlx5_lag_add_devices(struct mlx5_lag *ldev)
795 {
796 int i;
797
798 for (i = 0; i < ldev->ports; i++) {
799 if (!ldev->pf[i].dev)
800 continue;
801
802 if (ldev->pf[i].dev->priv.flags &
803 MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
804 continue;
805
806 ldev->pf[i].dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
807 mlx5_rescan_drivers_locked(ldev->pf[i].dev);
808 }
809 }
810
811 void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
812 {
813 int i;
814
815 for (i = 0; i < ldev->ports; i++) {
816 if (!ldev->pf[i].dev)
817 continue;
818
819 if (ldev->pf[i].dev->priv.flags &
820 MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
821 continue;
822
823 ldev->pf[i].dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
824 mlx5_rescan_drivers_locked(ldev->pf[i].dev);
825 }
826 }
827
828 void mlx5_disable_lag(struct mlx5_lag *ldev)
829 {
830 bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
831 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
832 bool roce_lag;
833 int err;
834 int i;
835
836 roce_lag = __mlx5_lag_is_roce(ldev);
837
838 if (shared_fdb) {
839 mlx5_lag_remove_devices(ldev);
840 } else if (roce_lag) {
841 if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
842 dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
843 mlx5_rescan_drivers_locked(dev0);
844 }
845 for (i = 1; i < ldev->ports; i++)
846 mlx5_nic_vport_disable_roce(ldev->pf[i].dev);
847 }
848
849 err = mlx5_deactivate_lag(ldev);
850 if (err)
851 return;
852
853 if (shared_fdb || roce_lag)
854 mlx5_lag_add_devices(ldev);
855
856 if (shared_fdb)
857 for (i = 0; i < ldev->ports; i++)
858 if (!(ldev->pf[i].dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
859 mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch);
860 }
861
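/* Shared FDB is supported only when every port is in switchdev mode with
 * vport match metadata, the devices expose the required native FDB
 * selection / shared ingress ACL capabilities, and each eswitch sees all of
 * its LAG peers over devcom.
 */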
862 bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
863 {
864 struct mlx5_core_dev *dev;
865 int i;
866
867 for (i = MLX5_LAG_P1 + 1; i < ldev->ports; i++) {
868 dev = ldev->pf[i].dev;
869 if (is_mdev_switchdev_mode(dev) &&
870 mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) &&
871 MLX5_CAP_GEN(dev, lag_native_fdb_selection) &&
872 MLX5_CAP_ESW(dev, root_ft_on_other_esw) &&
873 mlx5_eswitch_get_npeers(dev->priv.eswitch) ==
874 MLX5_CAP_GEN(dev, num_lag_ports) - 1)
875 continue;
876 return false;
877 }
878
879 dev = ldev->pf[MLX5_LAG_P1].dev;
880 if (is_mdev_switchdev_mode(dev) &&
881 mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) &&
882 mlx5_esw_offloads_devcom_is_ready(dev->priv.eswitch) &&
883 MLX5_CAP_ESW(dev, esw_shared_ingress_acl) &&
884 mlx5_eswitch_get_npeers(dev->priv.eswitch) == MLX5_CAP_GEN(dev, num_lag_ports) - 1)
885 return true;
886
887 return false;
888 }
889
890 static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev)
891 {
892 bool roce_lag = true;
893 int i;
894
895 for (i = 0; i < ldev->ports; i++)
896 roce_lag = roce_lag && !mlx5_sriov_is_enabled(ldev->pf[i].dev);
897
898 #ifdef CONFIG_MLX5_ESWITCH
899 for (i = 0; i < ldev->ports; i++)
900 roce_lag = roce_lag && is_mdev_legacy_mode(ldev->pf[i].dev);
901 #endif
902
903 return roce_lag;
904 }
905
906 static bool mlx5_lag_should_modify_lag(struct mlx5_lag *ldev, bool do_bond)
907 {
908 return do_bond && __mlx5_lag_is_active(ldev) &&
909 ldev->mode != MLX5_LAG_MODE_MPESW;
910 }
911
912 static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
913 {
914 return !do_bond && __mlx5_lag_is_active(ldev) &&
915 ldev->mode != MLX5_LAG_MODE_MPESW;
916 }
917
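/* Central bond-state handler: decide from the tracker whether the ports
 * should be bonded, then activate (RoCE or SRIOV mode, optionally with
 * shared FDB), modify or disable the LAG accordingly. Multipath VF LAG
 * ignores bond change requests here.
 */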
918 static void mlx5_do_bond(struct mlx5_lag *ldev)
919 {
920 struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
921 struct lag_tracker tracker = { };
922 struct net_device *ndev;
923 bool do_bond, roce_lag;
924 int err;
925 int i;
926
927 if (!mlx5_lag_is_ready(ldev)) {
928 do_bond = false;
929 } else {
930 /* VF LAG is in multipath mode, ignore bond change requests */
931 if (mlx5_lag_is_multipath(dev0))
932 return;
933
934 tracker = ldev->tracker;
935
936 do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev);
937 }
938
939 if (do_bond && !__mlx5_lag_is_active(ldev)) {
940 bool shared_fdb = mlx5_lag_shared_fdb_supported(ldev);
941
942 roce_lag = mlx5_lag_is_roce_lag(ldev);
943
944 if (shared_fdb || roce_lag)
945 mlx5_lag_remove_devices(ldev);
946
947 err = mlx5_activate_lag(ldev, &tracker,
948 roce_lag ? MLX5_LAG_MODE_ROCE :
949 MLX5_LAG_MODE_SRIOV,
950 shared_fdb);
951 if (err) {
952 if (shared_fdb || roce_lag)
953 mlx5_lag_add_devices(ldev);
954
955 return;
956 } else if (roce_lag) {
957 dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
958 mlx5_rescan_drivers_locked(dev0);
959 for (i = 1; i < ldev->ports; i++) {
960 if (mlx5_get_roce_state(ldev->pf[i].dev))
961 mlx5_nic_vport_enable_roce(ldev->pf[i].dev);
962 }
963 } else if (shared_fdb) {
964 int i;
965
966 dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
967 mlx5_rescan_drivers_locked(dev0);
968
969 for (i = 0; i < ldev->ports; i++) {
970 err = mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch);
971 if (err)
972 break;
973 }
974
975 if (err) {
976 dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
977 mlx5_rescan_drivers_locked(dev0);
978 mlx5_deactivate_lag(ldev);
979 mlx5_lag_add_devices(ldev);
980 for (i = 0; i < ldev->ports; i++)
981 mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch);
982 mlx5_core_err(dev0, "Failed to enable lag\n");
983 return;
984 }
985 }
986 if (tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
987 ndev = mlx5_lag_active_backup_get_netdev(dev0);
988 /* Only SRIOV and RoCE LAG should have tracker->tx_type
989 * set, so there is no need to check the mode.
990 */
991 blocking_notifier_call_chain(&dev0->priv.lag_nh,
992 MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE,
993 ndev);
994 dev_put(ndev);
995 }
996 } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
997 mlx5_modify_lag(ldev, &tracker);
998 } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
999 mlx5_disable_lag(ldev);
1000 }
1001 }
1002
1003 /* The last mdev to unregister will destroy the workqueue before removing the
1004 * devcom component, and as all the mdevs use the same devcom component we are
1005 * guaranteed that the devcom is valid while the calling work is running.
1006 */
1007 struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev)
1008 {
1009 struct mlx5_devcom_comp_dev *devcom = NULL;
1010 int i;
1011
1012 mutex_lock(&ldev->lock);
1013 for (i = 0; i < ldev->ports; i++) {
1014 if (ldev->pf[i].dev) {
1015 devcom = ldev->pf[i].dev->priv.hca_devcom_comp;
1016 break;
1017 }
1018 }
1019 mutex_unlock(&ldev->lock);
1020 return devcom;
1021 }
1022
1023 static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
1024 {
1025 queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
1026 }
1027
1028 static void mlx5_do_bond_work(struct work_struct *work)
1029 {
1030 struct delayed_work *delayed_work = to_delayed_work(work);
1031 struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
1032 bond_work);
1033 struct mlx5_devcom_comp_dev *devcom;
1034 int status;
1035
1036 devcom = mlx5_lag_get_devcom_comp(ldev);
1037 if (!devcom)
1038 return;
1039
1040 status = mlx5_devcom_comp_trylock(devcom);
1041 if (!status) {
1042 mlx5_queue_bond_work(ldev, HZ);
1043 return;
1044 }
1045
1046 mutex_lock(&ldev->lock);
1047 if (ldev->mode_changes_in_progress) {
1048 mutex_unlock(&ldev->lock);
1049 mlx5_devcom_comp_unlock(devcom);
1050 mlx5_queue_bond_work(ldev, HZ);
1051 return;
1052 }
1053
1054 mlx5_do_bond(ldev);
1055 mutex_unlock(&ldev->lock);
1056 mlx5_devcom_comp_unlock(devcom);
1057 }
1058
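/* Track CHANGEUPPER events: record the bond's tx type, hash policy and
 * whether any slave is inactive, and mark the tracker as bonded only when
 * all of our netdevs (and no others) are enslaved to the same LAG master
 * with a supported tx type.
 */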
1059 static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
1060 struct lag_tracker *tracker,
1061 struct netdev_notifier_changeupper_info *info)
1062 {
1063 struct net_device *upper = info->upper_dev, *ndev_tmp;
1064 struct netdev_lag_upper_info *lag_upper_info = NULL;
1065 bool is_bonded, is_in_lag, mode_supported;
1066 bool has_inactive = 0;
1067 struct slave *slave;
1068 u8 bond_status = 0;
1069 int num_slaves = 0;
1070 int changed = 0;
1071 int idx;
1072
1073 if (!netif_is_lag_master(upper))
1074 return 0;
1075
1076 if (info->linking)
1077 lag_upper_info = info->upper_info;
1078
1079 /* The event may still be of interest if the slave does not belong to
1080 * us, but is enslaved to a master which has one or more of our netdevs
1081 * as slaves (e.g., if a new slave is added to a master that bonds two
1082 * of our netdevs, we should unbond).
1083 */
1084 rcu_read_lock();
1085 for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
1086 idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
1087 if (idx >= 0) {
1088 slave = bond_slave_get_rcu(ndev_tmp);
1089 if (slave)
1090 has_inactive |= bond_is_slave_inactive(slave);
1091 bond_status |= (1 << idx);
1092 }
1093
1094 num_slaves++;
1095 }
1096 rcu_read_unlock();
1097
1098 /* None of this lagdev's netdevs are slaves of this master. */
1099 if (!(bond_status & GENMASK(ldev->ports - 1, 0)))
1100 return 0;
1101
1102 if (lag_upper_info) {
1103 tracker->tx_type = lag_upper_info->tx_type;
1104 tracker->hash_type = lag_upper_info->hash_type;
1105 }
1106
1107 tracker->has_inactive = has_inactive;
1108 /* Determine bonding status:
1109 * A device is considered bonded if all of its physical ports are
1110 * slaves of the same LAG master, and only them.
1111 */
1112 is_in_lag = num_slaves == ldev->ports &&
1113 bond_status == GENMASK(ldev->ports - 1, 0);
1114
1115 /* Lag mode must be activebackup or hash. */
1116 mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP ||
1117 tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH;
1118
1119 is_bonded = is_in_lag && mode_supported;
1120 if (tracker->is_bonded != is_bonded) {
1121 tracker->is_bonded = is_bonded;
1122 changed = 1;
1123 }
1124
1125 if (!is_in_lag)
1126 return changed;
1127
1128 if (!mlx5_lag_is_ready(ldev))
1129 NL_SET_ERR_MSG_MOD(info->info.extack,
1130 "Can't activate LAG offload, PF is configured with more than 64 VFs");
1131 else if (!mode_supported)
1132 NL_SET_ERR_MSG_MOD(info->info.extack,
1133 "Can't activate LAG offload, TX type isn't supported");
1134
1135 return changed;
1136 }
1137
1138 static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
1139 struct lag_tracker *tracker,
1140 struct net_device *ndev,
1141 struct netdev_notifier_changelowerstate_info *info)
1142 {
1143 struct netdev_lag_lower_state_info *lag_lower_info;
1144 int idx;
1145
1146 if (!netif_is_lag_port(ndev))
1147 return 0;
1148
1149 idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
1150 if (idx < 0)
1151 return 0;
1152
1153 /* This information is used to determine virtual to physical
1154 * port mapping.
1155 */
1156 lag_lower_info = info->lower_state_info;
1157 if (!lag_lower_info)
1158 return 0;
1159
1160 tracker->netdev_state[idx] = *lag_lower_info;
1161
1162 return 1;
1163 }
1164
1165 static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
1166 struct lag_tracker *tracker,
1167 struct net_device *ndev)
1168 {
1169 struct net_device *ndev_tmp;
1170 struct slave *slave;
1171 bool has_inactive = 0;
1172 int idx;
1173
1174 if (!netif_is_lag_master(ndev))
1175 return 0;
1176
1177 rcu_read_lock();
1178 for_each_netdev_in_bond_rcu(ndev, ndev_tmp) {
1179 idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
1180 if (idx < 0)
1181 continue;
1182
1183 slave = bond_slave_get_rcu(ndev_tmp);
1184 if (slave)
1185 has_inactive |= bond_is_slave_inactive(slave);
1186 }
1187 rcu_read_unlock();
1188
1189 if (tracker->has_inactive == has_inactive)
1190 return 0;
1191
1192 tracker->has_inactive = has_inactive;
1193
1194 return 1;
1195 }
1196
1197 /* this handler is always registered to netdev events */
1198 static int mlx5_lag_netdev_event(struct notifier_block *this,
1199 unsigned long event, void *ptr)
1200 {
1201 struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
1202 struct lag_tracker tracker;
1203 struct mlx5_lag *ldev;
1204 int changed = 0;
1205
1206 if (event != NETDEV_CHANGEUPPER &&
1207 event != NETDEV_CHANGELOWERSTATE &&
1208 event != NETDEV_CHANGEINFODATA)
1209 return NOTIFY_DONE;
1210
1211 ldev = container_of(this, struct mlx5_lag, nb);
1212
1213 tracker = ldev->tracker;
1214
1215 switch (event) {
1216 case NETDEV_CHANGEUPPER:
1217 changed = mlx5_handle_changeupper_event(ldev, &tracker, ptr);
1218 break;
1219 case NETDEV_CHANGELOWERSTATE:
1220 changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
1221 ndev, ptr);
1222 break;
1223 case NETDEV_CHANGEINFODATA:
1224 changed = mlx5_handle_changeinfodata_event(ldev, &tracker, ndev);
1225 break;
1226 }
1227
1228 ldev->tracker = tracker;
1229
1230 if (changed)
1231 mlx5_queue_bond_work(ldev, 0);
1232
1233 return NOTIFY_DONE;
1234 }
1235
1236 static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev,
1237 struct mlx5_core_dev *dev,
1238 struct net_device *netdev)
1239 {
1240 unsigned int fn = mlx5_get_dev_index(dev);
1241 unsigned long flags;
1242
1243 if (fn >= ldev->ports)
1244 return;
1245
1246 spin_lock_irqsave(&lag_lock, flags);
1247 ldev->pf[fn].netdev = netdev;
1248 ldev->tracker.netdev_state[fn].link_up = 0;
1249 ldev->tracker.netdev_state[fn].tx_enabled = 0;
1250 spin_unlock_irqrestore(&lag_lock, flags);
1251 }
1252
1253 static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
1254 struct net_device *netdev)
1255 {
1256 unsigned long flags;
1257 int i;
1258
1259 spin_lock_irqsave(&lag_lock, flags);
1260 for (i = 0; i < ldev->ports; i++) {
1261 if (ldev->pf[i].netdev == netdev) {
1262 ldev->pf[i].netdev = NULL;
1263 break;
1264 }
1265 }
1266 spin_unlock_irqrestore(&lag_lock, flags);
1267 }
1268
1269 static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
1270 struct mlx5_core_dev *dev)
1271 {
1272 unsigned int fn = mlx5_get_dev_index(dev);
1273
1274 if (fn >= ldev->ports)
1275 return;
1276
1277 ldev->pf[fn].dev = dev;
1278 dev->priv.lag = ldev;
1279 }
1280
1281 static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
1282 struct mlx5_core_dev *dev)
1283 {
1284 int i;
1285
1286 for (i = 0; i < ldev->ports; i++)
1287 if (ldev->pf[i].dev == dev)
1288 break;
1289
1290 if (i == ldev->ports)
1291 return;
1292
1293 ldev->pf[i].dev = NULL;
1294 dev->priv.lag = NULL;
1295 }
1296
1297 /* Must be called with HCA devcom component lock held */
1298 static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
1299 {
1300 struct mlx5_devcom_comp_dev *pos = NULL;
1301 struct mlx5_lag *ldev = NULL;
1302 struct mlx5_core_dev *tmp_dev;
1303
1304 tmp_dev = mlx5_devcom_get_next_peer_data(dev->priv.hca_devcom_comp, &pos);
1305 if (tmp_dev)
1306 ldev = mlx5_lag_dev(tmp_dev);
1307
1308 if (!ldev) {
1309 ldev = mlx5_lag_dev_alloc(dev);
1310 if (!ldev) {
1311 mlx5_core_err(dev, "Failed to alloc lag dev\n");
1312 return 0;
1313 }
1314 mlx5_ldev_add_mdev(ldev, dev);
1315 return 0;
1316 }
1317
1318 mutex_lock(&ldev->lock);
1319 if (ldev->mode_changes_in_progress) {
1320 mutex_unlock(&ldev->lock);
1321 return -EAGAIN;
1322 }
1323 mlx5_ldev_get(ldev);
1324 mlx5_ldev_add_mdev(ldev, dev);
1325 mutex_unlock(&ldev->lock);
1326
1327 return 0;
1328 }
1329
1330 void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
1331 {
1332 struct mlx5_lag *ldev;
1333
1334 ldev = mlx5_lag_dev(dev);
1335 if (!ldev)
1336 return;
1337
1338 /* mdev is being removed, might as well remove debugfs
1339 * as early as possible.
1340 */
1341 mlx5_ldev_remove_debugfs(dev->priv.dbg.lag_debugfs);
1342 recheck:
1343 mutex_lock(&ldev->lock);
1344 if (ldev->mode_changes_in_progress) {
1345 mutex_unlock(&ldev->lock);
1346 msleep(100);
1347 goto recheck;
1348 }
1349 mlx5_ldev_remove_mdev(ldev, dev);
1350 mutex_unlock(&ldev->lock);
1351 mlx5_ldev_put(ldev);
1352 }
1353
1354 void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
1355 {
1356 int err;
1357
1358 if (!mlx5_lag_is_supported(dev))
1359 return;
1360
1361 if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp))
1362 return;
1363
1364 recheck:
1365 mlx5_devcom_comp_lock(dev->priv.hca_devcom_comp);
1366 err = __mlx5_lag_dev_add_mdev(dev);
1367 mlx5_devcom_comp_unlock(dev->priv.hca_devcom_comp);
1368
1369 if (err) {
1370 msleep(100);
1371 goto recheck;
1372 }
1373 mlx5_ldev_add_debugfs(dev);
1374 }
1375
1376 void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev,
1377 struct net_device *netdev)
1378 {
1379 struct mlx5_lag *ldev;
1380 bool lag_is_active;
1381
1382 ldev = mlx5_lag_dev(dev);
1383 if (!ldev)
1384 return;
1385
1386 mutex_lock(&ldev->lock);
1387 mlx5_ldev_remove_netdev(ldev, netdev);
1388 clear_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
1389
1390 lag_is_active = __mlx5_lag_is_active(ldev);
1391 mutex_unlock(&ldev->lock);
1392
1393 if (lag_is_active)
1394 mlx5_queue_bond_work(ldev, 0);
1395 }
1396
1397 void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
1398 struct net_device *netdev)
1399 {
1400 struct mlx5_lag *ldev;
1401 int i;
1402
1403 ldev = mlx5_lag_dev(dev);
1404 if (!ldev)
1405 return;
1406
1407 mutex_lock(&ldev->lock);
1408 mlx5_ldev_add_netdev(ldev, dev, netdev);
1409
1410 for (i = 0; i < ldev->ports; i++)
1411 if (!ldev->pf[i].netdev)
1412 break;
1413
1414 if (i >= ldev->ports)
1415 set_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
1416 mutex_unlock(&ldev->lock);
1417 mlx5_queue_bond_work(ldev, 0);
1418 }
1419
1420 bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
1421 {
1422 struct mlx5_lag *ldev;
1423 unsigned long flags;
1424 bool res;
1425
1426 spin_lock_irqsave(&lag_lock, flags);
1427 ldev = mlx5_lag_dev(dev);
1428 res = ldev && __mlx5_lag_is_roce(ldev);
1429 spin_unlock_irqrestore(&lag_lock, flags);
1430
1431 return res;
1432 }
1433 EXPORT_SYMBOL(mlx5_lag_is_roce);
1434
1435 bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
1436 {
1437 struct mlx5_lag *ldev;
1438 unsigned long flags;
1439 bool res;
1440
1441 spin_lock_irqsave(&lag_lock, flags);
1442 ldev = mlx5_lag_dev(dev);
1443 res = ldev && __mlx5_lag_is_active(ldev);
1444 spin_unlock_irqrestore(&lag_lock, flags);
1445
1446 return res;
1447 }
1448 EXPORT_SYMBOL(mlx5_lag_is_active);
1449
1450 bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev)
1451 {
1452 struct mlx5_lag *ldev;
1453 unsigned long flags;
1454 bool res = 0;
1455
1456 spin_lock_irqsave(&lag_lock, flags);
1457 ldev = mlx5_lag_dev(dev);
1458 if (ldev)
1459 res = test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags);
1460 spin_unlock_irqrestore(&lag_lock, flags);
1461
1462 return res;
1463 }
1464 EXPORT_SYMBOL(mlx5_lag_mode_is_hash);
1465
1466 bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
1467 {
1468 struct mlx5_lag *ldev;
1469 unsigned long flags;
1470 bool res;
1471
1472 spin_lock_irqsave(&lag_lock, flags);
1473 ldev = mlx5_lag_dev(dev);
1474 res = ldev && __mlx5_lag_is_active(ldev) &&
1475 dev == ldev->pf[MLX5_LAG_P1].dev;
1476 spin_unlock_irqrestore(&lag_lock, flags);
1477
1478 return res;
1479 }
1480 EXPORT_SYMBOL(mlx5_lag_is_master);
1481
1482 bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
1483 {
1484 struct mlx5_lag *ldev;
1485 unsigned long flags;
1486 bool res;
1487
1488 spin_lock_irqsave(&lag_lock, flags);
1489 ldev = mlx5_lag_dev(dev);
1490 res = ldev && __mlx5_lag_is_sriov(ldev);
1491 spin_unlock_irqrestore(&lag_lock, flags);
1492
1493 return res;
1494 }
1495 EXPORT_SYMBOL(mlx5_lag_is_sriov);
1496
1497 bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
1498 {
1499 struct mlx5_lag *ldev;
1500 unsigned long flags;
1501 bool res;
1502
1503 spin_lock_irqsave(&lag_lock, flags);
1504 ldev = mlx5_lag_dev(dev);
1505 res = ldev && test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
1506 spin_unlock_irqrestore(&lag_lock, flags);
1507
1508 return res;
1509 }
1510 EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);
1511
1512 void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
1513 {
1514 struct mlx5_lag *ldev;
1515
1516 ldev = mlx5_lag_dev(dev);
1517 if (!ldev)
1518 return;
1519
1520 mlx5_devcom_comp_lock(dev->priv.hca_devcom_comp);
1521 mutex_lock(&ldev->lock);
1522
1523 ldev->mode_changes_in_progress++;
1524 if (__mlx5_lag_is_active(ldev))
1525 mlx5_disable_lag(ldev);
1526
1527 mutex_unlock(&ldev->lock);
1528 mlx5_devcom_comp_unlock(dev->priv.hca_devcom_comp);
1529 }
1530
1531 void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
1532 {
1533 struct mlx5_lag *ldev;
1534
1535 ldev = mlx5_lag_dev(dev);
1536 if (!ldev)
1537 return;
1538
1539 mutex_lock(&ldev->lock);
1540 ldev->mode_changes_in_progress--;
1541 mutex_unlock(&ldev->lock);
1542 mlx5_queue_bond_work(ldev, 0);
1543 }
1544
1545 u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
1546 struct net_device *slave)
1547 {
1548 struct mlx5_lag *ldev;
1549 unsigned long flags;
1550 u8 port = 0;
1551 int i;
1552
1553 spin_lock_irqsave(&lag_lock, flags);
1554 ldev = mlx5_lag_dev(dev);
1555 if (!(ldev && __mlx5_lag_is_roce(ldev)))
1556 goto unlock;
1557
1558 for (i = 0; i < ldev->ports; i++) {
1559 if (ldev->pf[i].netdev == slave) {
1560 port = i;
1561 break;
1562 }
1563 }
1564
1565 port = ldev->v2p_map[port * ldev->buckets];
1566
1567 unlock:
1568 spin_unlock_irqrestore(&lag_lock, flags);
1569 return port;
1570 }
1571 EXPORT_SYMBOL(mlx5_lag_get_slave_port);
1572
1573 u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev)
1574 {
1575 struct mlx5_lag *ldev;
1576
1577 ldev = mlx5_lag_dev(dev);
1578 if (!ldev)
1579 return 0;
1580
1581 return ldev->ports;
1582 }
1583 EXPORT_SYMBOL(mlx5_lag_get_num_ports);
1584
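/* Iterator over the other devices in the same LAG. *i is the caller's
 * cursor and should start at 0; the function returns the next peer mdev, or
 * NULL when the ports are exhausted. Illustrative usage (not taken from
 * this file; do_something() is a placeholder):
 *
 *	int i = 0;
 *	struct mlx5_core_dev *peer;
 *
 *	while ((peer = mlx5_lag_get_next_peer_mdev(dev, &i)))
 *		do_something(peer);
 */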
1585 struct mlx5_core_dev *mlx5_lag_get_next_peer_mdev(struct mlx5_core_dev *dev, int *i)
1586 {
1587 struct mlx5_core_dev *peer_dev = NULL;
1588 struct mlx5_lag *ldev;
1589 unsigned long flags;
1590 int idx;
1591
1592 spin_lock_irqsave(&lag_lock, flags);
1593 ldev = mlx5_lag_dev(dev);
1594 if (!ldev)
1595 goto unlock;
1596
1597 if (*i == ldev->ports)
1598 goto unlock;
1599 for (idx = *i; idx < ldev->ports; idx++)
1600 if (ldev->pf[idx].dev != dev)
1601 break;
1602
1603 if (idx == ldev->ports) {
1604 *i = idx;
1605 goto unlock;
1606 }
1607 *i = idx + 1;
1608
1609 peer_dev = ldev->pf[idx].dev;
1610
1611 unlock:
1612 spin_unlock_irqrestore(&lag_lock, flags);
1613 return peer_dev;
1614 }
1615 EXPORT_SYMBOL(mlx5_lag_get_next_peer_mdev);
1616
1617 int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
1618 u64 *values,
1619 int num_counters,
1620 size_t *offsets)
1621 {
1622 int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
1623 struct mlx5_core_dev **mdev;
1624 struct mlx5_lag *ldev;
1625 unsigned long flags;
1626 int num_ports;
1627 int ret, i, j;
1628 void *out;
1629
1630 out = kvzalloc(outlen, GFP_KERNEL);
1631 if (!out)
1632 return -ENOMEM;
1633
1634 mdev = kvzalloc(sizeof(mdev[0]) * MLX5_MAX_PORTS, GFP_KERNEL);
1635 if (!mdev) {
1636 ret = -ENOMEM;
1637 goto free_out;
1638 }
1639
1640 memset(values, 0, sizeof(*values) * num_counters);
1641
1642 spin_lock_irqsave(&lag_lock, flags);
1643 ldev = mlx5_lag_dev(dev);
1644 if (ldev && __mlx5_lag_is_active(ldev)) {
1645 num_ports = ldev->ports;
1646 for (i = 0; i < ldev->ports; i++)
1647 mdev[i] = ldev->pf[i].dev;
1648 } else {
1649 num_ports = 1;
1650 mdev[MLX5_LAG_P1] = dev;
1651 }
1652 spin_unlock_irqrestore(&lag_lock, flags);
1653
1654 for (i = 0; i < num_ports; ++i) {
1655 u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {};
1656
1657 MLX5_SET(query_cong_statistics_in, in, opcode,
1658 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
1659 ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in,
1660 out);
1661 if (ret)
1662 goto free_mdev;
1663
1664 for (j = 0; j < num_counters; ++j)
1665 values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
1666 }
1667
1668 free_mdev:
1669 kvfree(mdev);
1670 free_out:
1671 kvfree(out);
1672 return ret;
1673 }
1674 EXPORT_SYMBOL(mlx5_lag_query_cong_counters);
1675