1 /* SPDX-License-Identifier: GPL-2.0 */
2 /* Copyright (c) 2019 Mellanox Technologies. */
3
4 #include "health.h"
5
mlx5e_wait_for_sq_flush(struct mlx5e_txqsq * sq)6 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
7 {
8 unsigned long exp_time = jiffies +
9 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
10
11 while (time_before(jiffies, exp_time)) {
12 if (sq->cc == sq->pc)
13 return 0;
14
15 msleep(20);
16 }
17
18 netdev_err(sq->channel->netdev,
19 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
20 sq->sqn, sq->cc, sq->pc);
21
22 return -ETIMEDOUT;
23 }
24
mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq * sq)25 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
26 {
27 WARN_ONCE(sq->cc != sq->pc,
28 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
29 sq->sqn, sq->cc, sq->pc);
30 sq->cc = 0;
31 sq->dma_fifo_cc = 0;
32 sq->pc = 0;
33 }
34
/* Recover an SQ that raised an error CQE: confirm the HW SQ is in the
 * error state, drain it, move it back to ready and re-activate it.
 * Used as a devlink health recover callback (ctx is the SQ).
 *
 * Return: 0 on success (or nothing to do), negative errno otherwise.
 */
static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
{
	struct mlx5_core_dev *mdev;
	struct net_device *dev;
	struct mlx5e_txqsq *sq;
	u8 state;
	int err;

	sq = ctx;
	mdev = sq->channel->mdev;
	dev = sq->channel->netdev;

	/* Only act if the error-CQE flow flagged this SQ for recovery. */
	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
		return 0;

	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
	if (err) {
		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
			   sq->sqn, err);
		goto out;
	}

	/* The reset sequence below is only needed for an SQ in error state. */
	if (state != MLX5_SQC_STATE_ERR)
		goto out;

	/* Stop the stack from queueing new packets on this TXQ. */
	mlx5e_tx_disable_queue(sq->txq);

	/* Wait until all outstanding WQEs complete (cc == pc). */
	err = mlx5e_wait_for_sq_flush(sq);
	if (err)
		goto out;

	/* At this point, no new packets will arrive from the stack as TXQ is
	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
	 * pending WQEs. SQ can safely reset the SQ.
	 */

	err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn);
	if (err)
		goto out;

	/* SW counters must match the freshly reset HW SQ (all zero). */
	mlx5e_reset_txqsq_cc_pc(sq);
	sq->stats->recover++;
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	mlx5e_activate_txqsq(sq);

	return 0;
out:
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	return err;
}
85
/* Context passed from mlx5e_reporter_tx_timeout() to the timeout
 * recover/dump callbacks; carries the recovery outcome back.
 */
struct mlx5e_tx_timeout_ctx {
	struct mlx5e_txqsq *sq;	/* the SQ whose TXQ timed out */
	/* 0: this sq recovered, 1: all channels reopened, <0: errno */
	signed int status;
};
90
/* Recover from a TX timeout: first try the lightweight recovery of the
 * SQ's completion EQ; if that fails, fall back to reopening all
 * channels. The outcome is recorded in the mlx5e_tx_timeout_ctx status
 * (0: this sq recovered, 1: all channels recovered, <0: errno).
 */
static int mlx5e_tx_reporter_timeout_recover(void *ctx)
{
	struct mlx5e_tx_timeout_ctx *to_ctx;
	struct mlx5e_priv *priv;
	struct mlx5_eq_comp *eq;
	struct mlx5e_txqsq *sq;
	int err;

	to_ctx = ctx;
	sq = to_ctx->sq;
	eq = sq->cq.mcq.eq;
	priv = sq->channel->priv;
	/* Cheapest attempt first: recover only the completion EQ. */
	err = mlx5e_health_channel_eq_recover(eq, sq->channel);
	if (!err) {
		to_ctx->status = 0; /* this sq recovered */
		return err;
	}

	/* Heavy hammer: tear down and reopen every channel. */
	err = mlx5e_safe_reopen_channels(priv);
	if (!err) {
		to_ctx->status = 1; /* all channels recovered */
		return err;
	}

	/* Both attempts failed; leave the SQ disabled and report. */
	to_ctx->status = err;
	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
	netdev_err(priv->netdev,
		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
		   err);

	return err;
}
123
/* The state lock must not be grabbed within this function:
 * doing so can cause a deadlock or a read-after-free.
 */
static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
{
	/* Dispatch to the recover handler captured when the error was reported. */
	return err_ctx->recover(err_ctx->ctx);
}
131
/* Devlink health recover callback for the TX reporter. With a specific
 * error context (set when the error was reported), run its recover
 * handler; otherwise recover all channels.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (err_ctx)
		return mlx5e_tx_reporter_recover_from_ctx(err_ctx);

	return mlx5e_health_recover_channels(priv);
}
142
/* Append one per-SQ diagnose object to the devlink fmsg: channel index,
 * tc, txq index, sqn, HW state, stopped flag, cc/pc, plus CQ and EQ
 * diagnostics. Returns 0 on success or the first error encountered.
 */
static int
mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
					struct mlx5e_txqsq *sq, int tc)
{
	struct mlx5e_priv *priv = sq->channel->priv;
	bool stopped = netif_xmit_stopped(sq->txq);
	u8 state;
	int err;

	/* Query HW SQ state before emitting anything into the fmsg. */
	err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_start(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
	if (err)
		return err;

	err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
	if (err)
		return err;

	err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
	if (err)
		return err;

	/* Completion queue and its event queue diagnostics. */
	err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
	if (err)
		return err;

	err = mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_end(fmsg);
	if (err)
		return err;

	return 0;
}
206
/* Devlink health diagnose callback for the TX reporter: emit common SQ
 * configuration followed by a per-SQ diagnose object for each TC of
 * each channel. Holds the state lock so the channels stay stable while
 * iterating.
 */
static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
				      struct devlink_fmsg *fmsg,
				      struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	/* All SQs share the common config; use the first one as reference. */
	struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
	u32 sq_stride, sq_sz;

	int i, tc, err = 0;

	mutex_lock(&priv->state_lock);

	/* Nothing to diagnose while the netdev is closed. */
	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		goto unlock;

	sq_sz = mlx5_wq_cyc_get_size(&generic_sq->wq);
	sq_stride = MLX5_SEND_WQE_BB;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
	if (err)
		goto unlock;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	if (err)
		goto unlock;

	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
	if (err)
		goto unlock;

	err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
	if (err)
		goto unlock;

	err = mlx5e_health_cq_common_diag_fmsg(&generic_sq->cq, fmsg);
	if (err)
		goto unlock;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		goto unlock;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		goto unlock;

	/* Per-SQ diagnostics: one array entry per (channel, tc) pair. */
	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
	if (err)
		goto unlock;

	for (i = 0; i < priv->channels.num; i++) {
		struct mlx5e_channel *c = priv->channels.c[i];

		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
			struct mlx5e_txqsq *sq = &c->sq[tc];

			err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
			if (err)
				goto unlock;
		}
	}
	err = devlink_fmsg_arr_pair_nest_end(fmsg);
	if (err)
		goto unlock;

unlock:
	mutex_unlock(&priv->state_lock);
	return err;
}
276
/* Dump one SQ via the FW resource dump API into the devlink fmsg:
 * the SX slice, then the SQ's full QPC, then its send buffer.
 * Returns 0 (also when the netdev is closed) or a negative errno.
 */
static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
				     void *ctx)
{
	struct mlx5_rsc_key key = {};
	struct mlx5e_txqsq *sq = ctx;
	int err;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	if (err)
		return err;

	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
	if (err)
		return err;

	/* key.size (PAGE_SIZE) is reused from the slice dump above. */
	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
	key.index1 = sq->sqn;
	key.num_of_obj1 = 1;

	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
	if (err)
		return err;

	key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
	key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	/* Close the outer "SQ" object. */
	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
}
337
mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg,void * ctx)338 static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
339 void *ctx)
340 {
341 struct mlx5e_tx_timeout_ctx *to_ctx = ctx;
342
343 return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq);
344 }
345
/* Dump the SX slice once, then every SQ of every (channel, tc) pair,
 * into the devlink fmsg. Returns 0 (also when the netdev is closed)
 * or a negative errno.
 */
static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
					  struct devlink_fmsg *fmsg)
{
	struct mlx5_rsc_key key = {};
	int i, tc, err;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	if (err)
		return err;

	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
	if (err)
		return err;

	for (i = 0; i < priv->channels.num; i++) {
		struct mlx5e_channel *c = priv->channels.c[i];

		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
			struct mlx5e_txqsq *sq = &c->sq[tc];

			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
			if (err)
				return err;
		}
	}
	return devlink_fmsg_arr_pair_nest_end(fmsg);
}
386
static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
					   struct mlx5e_err_ctx *err_ctx,
					   struct devlink_fmsg *fmsg)
{
	/* Dispatch to the dump handler captured when the error was reported. */
	return err_ctx->dump(priv, fmsg, err_ctx->ctx);
}
393
/* Devlink health dump callback for the TX reporter. With a specific
 * error context, run its dump handler; otherwise dump all SQs.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (err_ctx)
		return mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg);

	return mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);
}
404
mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq * sq)405 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
406 {
407 struct mlx5e_priv *priv = sq->channel->priv;
408 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
409 struct mlx5e_err_ctx err_ctx = {};
410
411 err_ctx.ctx = sq;
412 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
413 err_ctx.dump = mlx5e_tx_reporter_dump_sq;
414 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
415
416 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
417 }
418
mlx5e_reporter_tx_timeout(struct mlx5e_txqsq * sq)419 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
420 {
421 struct mlx5e_priv *priv = sq->channel->priv;
422 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
423 struct mlx5e_tx_timeout_ctx to_ctx = {};
424 struct mlx5e_err_ctx err_ctx = {};
425
426 to_ctx.sq = sq;
427 err_ctx.ctx = &to_ctx;
428 err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
429 err_ctx.dump = mlx5e_tx_reporter_timeout_dump;
430 snprintf(err_str, sizeof(err_str),
431 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
432 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
433 jiffies_to_usecs(jiffies - sq->txq->trans_start));
434
435 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
436 return to_ctx.status;
437 }
438
/* Devlink health reporter operations for the "tx" reporter. */
static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
	.name = "tx",
	.recover = mlx5e_tx_reporter_recover,
	.diagnose = mlx5e_tx_reporter_diagnose,
	.dump = mlx5e_tx_reporter_dump,
};
445
446 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
447
mlx5e_reporter_tx_create(struct mlx5e_priv * priv)448 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
449 {
450 struct devlink_health_reporter *reporter;
451
452 reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops,
453 MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
454 if (IS_ERR(reporter)) {
455 netdev_warn(priv->netdev,
456 "Failed to create tx reporter, err = %ld\n",
457 PTR_ERR(reporter));
458 return;
459 }
460 priv->tx_reporter = reporter;
461 }
462
mlx5e_reporter_tx_destroy(struct mlx5e_priv * priv)463 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
464 {
465 if (!priv->tx_reporter)
466 return;
467
468 devlink_port_health_reporter_destroy(priv->tx_reporter);
469 }
470