• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /* Copyright (c) 2019 Mellanox Technologies. */
3 
4 #include "health.h"
5 
mlx5e_wait_for_sq_flush(struct mlx5e_txqsq * sq)6 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
7 {
8 	unsigned long exp_time = jiffies +
9 				 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
10 
11 	while (time_before(jiffies, exp_time)) {
12 		if (sq->cc == sq->pc)
13 			return 0;
14 
15 		msleep(20);
16 	}
17 
18 	netdev_err(sq->channel->netdev,
19 		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
20 		   sq->sqn, sq->cc, sq->pc);
21 
22 	return -ETIMEDOUT;
23 }
24 
mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq * sq)25 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
26 {
27 	WARN_ONCE(sq->cc != sq->pc,
28 		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
29 		  sq->sqn, sq->cc, sq->pc);
30 	sq->cc = 0;
31 	sq->dma_fifo_cc = 0;
32 	sq->pc = 0;
33 }
34 
/* Recover an SQ that reported an error CQE.
 *
 * Runs as the err_ctx->recover callback of the "ERR CQE" health report
 * (see mlx5e_reporter_tx_err_cqe). ctx is the affected struct mlx5e_txqsq.
 *
 * Return: 0 on success or when there is nothing to do, negative errno on
 * failure. The RECOVERING bit is cleared on every exit path.
 */
static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
{
	struct mlx5_core_dev *mdev;
	struct net_device *dev;
	struct mlx5e_txqsq *sq;
	u8 state;
	int err;

	sq = ctx;
	mdev = sq->channel->mdev;
	dev = sq->channel->netdev;

	/* If the RECOVERING flag is already gone, another path handled (or
	 * aborted) this recovery - nothing left to do.
	 */
	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
		return 0;

	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
	if (err) {
		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
			   sq->sqn, err);
		goto out;
	}

	/* Only an SQ that HW moved to error state needs the reset below. */
	if (state != MLX5_SQC_STATE_ERR)
		goto out;

	/* Stop the stack from queueing new packets on this TXQ. */
	mlx5e_tx_disable_queue(sq->txq);

	/* Wait for NAPI to complete all in-flight WQEs (cc == pc). */
	err = mlx5e_wait_for_sq_flush(sq);
	if (err)
		goto out;

	/* At this point, no new packets will arrive from the stack as TXQ is
	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
	 * pending WQEs, so it is safe to reset the SQ.
	 */

	err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn);
	if (err)
		goto out;

	mlx5e_reset_txqsq_cc_pc(sq);
	sq->stats->recover++;
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	mlx5e_activate_txqsq(sq);

	return 0;
out:
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	return err;
}
85 
/* Context passed (via mlx5e_err_ctx.ctx) to the TX-timeout recover/dump
 * callbacks, and used to report the recovery outcome back to the caller.
 */
struct mlx5e_tx_timeout_ctx {
	/* The SQ that hit the TX timeout. */
	struct mlx5e_txqsq *sq;
	/* Outcome of mlx5e_tx_reporter_timeout_recover():
	 * 0 - this SQ recovered (EQ recovery), 1 - all channels were
	 * reopened, negative errno - recovery failed.
	 */
	signed int status;
};
90 
/* Recover from a TX timeout, escalating in two steps.
 *
 * Runs as the err_ctx->recover callback of the TX-timeout health report
 * (see mlx5e_reporter_tx_timeout). ctx is a struct mlx5e_tx_timeout_ctx;
 * the outcome is also recorded in ctx->status (see struct definition).
 *
 * Return: 0 when either recovery step succeeded, negative errno otherwise.
 */
static int mlx5e_tx_reporter_timeout_recover(void *ctx)
{
	struct mlx5e_tx_timeout_ctx *to_ctx;
	struct mlx5e_priv *priv;
	struct mlx5_eq_comp *eq;
	struct mlx5e_txqsq *sq;
	int err;

	to_ctx = ctx;
	sq = to_ctx->sq;
	eq = sq->cq.mcq.eq;
	priv = sq->channel->priv;

	/* Step 1: try the lightweight fix - recover the channel's EQ. */
	err = mlx5e_health_channel_eq_recover(eq, sq->channel);
	if (!err) {
		to_ctx->status = 0; /* this sq recovered */
		return err;
	}

	/* Step 2: EQ recovery failed - reopen all channels. */
	err = mlx5e_safe_reopen_channels(priv);
	if (!err) {
		to_ctx->status = 1; /* all channels recovered */
		return err;
	}

	/* Both steps failed: disable the SQ and report the error. */
	to_ctx->status = err;
	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
	netdev_err(priv->netdev,
		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
		   err);

	return err;
}
123 
/* Dispatch to the recover callback stored in the error context.
 *
 * The state lock cannot be grabbed within this function: doing so could
 * cause a deadlock or a read-after-free.
 */
static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
{
	return err_ctx->recover(err_ctx->ctx);
}
131 
/* devlink health ->recover callback for the TX reporter.
 *
 * When triggered by a specific error report, context carries the
 * mlx5e_err_ctx of that report; when triggered without context (e.g. a
 * user-initiated recovery), fall back to recovering all channels.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_health_recover_channels(priv);

	return mlx5e_tx_reporter_recover_from_ctx(err_ctx);
}
142 
/* Emit one SQ's diagnose object into the devlink formatted message:
 * channel/tc/txq indices, SQN, HW state, stopped flag, cc/pc counters,
 * plus the associated CQ and EQ diagnostics.
 *
 * Return: 0 on success, or the first error from the HW query or any
 * devlink_fmsg call.
 */
static int
mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
					struct mlx5e_txqsq *sq, int tc)
{
	struct mlx5e_priv *priv = sq->channel->priv;
	bool stopped = netif_xmit_stopped(sq->txq);
	u8 state;
	int err;

	/* Query the SQ's HW state first; skip the object entirely on failure. */
	err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_start(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
	if (err)
		return err;

	err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
	if (err)
		return err;

	err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
	if (err)
		return err;

	err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
	if (err)
		return err;

	err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
	if (err)
		return err;

	err = mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_end(fmsg);
	if (err)
		return err;

	return 0;
}
206 
/* devlink health ->diagnose callback for the TX reporter.
 *
 * Emits a "Common Config" section (SQ stride/size and common CQ info,
 * sampled from the first SQ) followed by a per-SQ "SQs" array covering
 * every TC of every open channel. Holds the state lock so the channels
 * cannot change underneath; returns 0 without output if the netdev is
 * not opened.
 */
static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
				      struct devlink_fmsg *fmsg,
				      struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	/* All SQs share the same stride/size config; sample the first one. */
	struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
	u32 sq_stride, sq_sz;

	int i, tc, err = 0;

	mutex_lock(&priv->state_lock);

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		goto unlock;

	sq_sz = mlx5_wq_cyc_get_size(&generic_sq->wq);
	sq_stride = MLX5_SEND_WQE_BB;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
	if (err)
		goto unlock;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	if (err)
		goto unlock;

	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
	if (err)
		goto unlock;

	err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
	if (err)
		goto unlock;

	err = mlx5e_health_cq_common_diag_fmsg(&generic_sq->cq, fmsg);
	if (err)
		goto unlock;

	/* Close the inner "SQ" object, then the "Common Config" object. */
	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		goto unlock;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		goto unlock;

	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
	if (err)
		goto unlock;

	/* One diagnose object per (channel, tc) SQ. */
	for (i = 0; i < priv->channels.num; i++) {
		struct mlx5e_channel *c = priv->channels.c[i];

		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
			struct mlx5e_txqsq *sq = &c->sq[tc];

			err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
			if (err)
				goto unlock;
		}
	}
	err = devlink_fmsg_arr_pair_nest_end(fmsg);
	if (err)
		goto unlock;

unlock:
	mutex_unlock(&priv->state_lock);
	return err;
}
276 
/* Dump HW resources for a single SQ into the devlink fmsg: the full SX
 * slice, the SQ's QPC, and its send buffer.
 *
 * Serves both as the err_ctx->dump callback for ERR CQE reports (ctx is
 * the struct mlx5e_txqsq) and as the worker for the TX-timeout dump.
 * Returns 0 without output if the netdev is not opened.
 *
 * Note: the same rsc key is reused across the three dumps; fields set for
 * an earlier dump (size, index1) intentionally remain set for later ones.
 */
static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
				     void *ctx)
{
	struct mlx5_rsc_key key = {};
	struct mlx5e_txqsq *sq = ctx;
	int err;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	if (err)
		return err;

	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
	if (err)
		return err;

	/* Dump the full QPC of this SQ. */
	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
	key.index1 = sq->sqn;
	key.num_of_obj1 = 1;

	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
	if (err)
		return err;

	/* Dump the SQ's entire send buffer (index1 still holds the SQN). */
	key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
	key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	/* Close the outer "SQ" object. */
	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
}
337 
/* err_ctx->dump callback for TX-timeout reports: unwrap the timeout
 * context and dump the affected SQ.
 */
static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
					  void *ctx)
{
	struct mlx5e_tx_timeout_ctx *to_ctx = ctx;

	return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq);
}
345 
/* Dump HW resources for every SQ: the full SX slice once, then a queue
 * dump per (channel, tc) SQ under an "SQs" array.
 *
 * Used when the devlink dump is requested without a specific error
 * context. Returns 0 without output if the netdev is not opened.
 */
static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
					  struct devlink_fmsg *fmsg)
{
	struct mlx5_rsc_key key = {};
	int i, tc, err;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	if (err)
		return err;

	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	if (err)
		return err;

	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
	if (err)
		return err;

	for (i = 0; i < priv->channels.num; i++) {
		struct mlx5e_channel *c = priv->channels.c[i];

		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
			struct mlx5e_txqsq *sq = &c->sq[tc];

			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
			if (err)
				return err;
		}
	}
	return devlink_fmsg_arr_pair_nest_end(fmsg);
}
386 
/* Dispatch to the dump callback stored in the error context. */
static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
					   struct mlx5e_err_ctx *err_ctx,
					   struct devlink_fmsg *fmsg)
{
	return err_ctx->dump(priv, fmsg, err_ctx->ctx);
}
393 
/* devlink health ->dump callback for the TX reporter.
 *
 * With an error context, dump only what that report concerns; without one
 * (e.g. a user-initiated dump), dump all SQs.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);

	return mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg);
}
404 
mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq * sq)405 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
406 {
407 	struct mlx5e_priv *priv = sq->channel->priv;
408 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
409 	struct mlx5e_err_ctx err_ctx = {};
410 
411 	err_ctx.ctx = sq;
412 	err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
413 	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
414 	snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
415 
416 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
417 }
418 
mlx5e_reporter_tx_timeout(struct mlx5e_txqsq * sq)419 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
420 {
421 	struct mlx5e_priv *priv = sq->channel->priv;
422 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
423 	struct mlx5e_tx_timeout_ctx to_ctx = {};
424 	struct mlx5e_err_ctx err_ctx = {};
425 
426 	to_ctx.sq = sq;
427 	err_ctx.ctx = &to_ctx;
428 	err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
429 	err_ctx.dump = mlx5e_tx_reporter_timeout_dump;
430 	snprintf(err_str, sizeof(err_str),
431 		 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
432 		 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
433 		 jiffies_to_usecs(jiffies - sq->txq->trans_start));
434 
435 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
436 	return to_ctx.status;
437 }
438 
/* devlink health reporter ops for the "tx" reporter. */
static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
	.name = "tx",
	.recover = mlx5e_tx_reporter_recover,
	.diagnose = mlx5e_tx_reporter_diagnose,
	.dump = mlx5e_tx_reporter_dump,
};
445 
446 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
447 
mlx5e_reporter_tx_create(struct mlx5e_priv * priv)448 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
449 {
450 	struct devlink_health_reporter *reporter;
451 
452 	reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops,
453 						       MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
454 	if (IS_ERR(reporter)) {
455 		netdev_warn(priv->netdev,
456 			    "Failed to create tx reporter, err = %ld\n",
457 			    PTR_ERR(reporter));
458 		return;
459 	}
460 	priv->tx_reporter = reporter;
461 }
462 
mlx5e_reporter_tx_destroy(struct mlx5e_priv * priv)463 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
464 {
465 	if (!priv->tx_reporter)
466 		return;
467 
468 	devlink_port_health_reporter_destroy(priv->tx_reporter);
469 }
470