1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright(c) 2018 Intel Corporation. */
3
4 #include <linux/bpf_trace.h>
5 #include <net/xdp_sock_drv.h>
6 #include <net/xdp.h>
7
8 #include "i40e.h"
9 #include "i40e_txrx_common.h"
10 #include "i40e_xsk.h"
11
i40e_alloc_rx_bi_zc(struct i40e_ring * rx_ring)12 int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring)
13 {
14 unsigned long sz = sizeof(*rx_ring->rx_bi_zc) * rx_ring->count;
15
16 rx_ring->rx_bi_zc = kzalloc(sz, GFP_KERNEL);
17 return rx_ring->rx_bi_zc ? 0 : -ENOMEM;
18 }
19
i40e_clear_rx_bi_zc(struct i40e_ring * rx_ring)20 void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring)
21 {
22 memset(rx_ring->rx_bi_zc, 0,
23 sizeof(*rx_ring->rx_bi_zc) * rx_ring->count);
24 }
25
/**
 * i40e_rx_bi - Get the zero-copy buffer slot for a ring index
 * @rx_ring: Rx ring
 * @idx: index into the rx_bi_zc array (not range checked here)
 *
 * Returns the address of the slot, so callers can both read and update it
 **/
static struct xdp_buff **i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
{
	return rx_ring->rx_bi_zc + idx;
}
30
31 /**
32 * i40e_xsk_pool_enable - Enable/associate an AF_XDP buffer pool to a
33 * certain ring/qid
34 * @vsi: Current VSI
35 * @pool: buffer pool
36 * @qid: Rx ring to associate buffer pool with
37 *
38 * Returns 0 on success, <0 on failure
39 **/
i40e_xsk_pool_enable(struct i40e_vsi * vsi,struct xsk_buff_pool * pool,u16 qid)40 static int i40e_xsk_pool_enable(struct i40e_vsi *vsi,
41 struct xsk_buff_pool *pool,
42 u16 qid)
43 {
44 struct net_device *netdev = vsi->netdev;
45 bool if_running;
46 int err;
47
48 if (vsi->type != I40E_VSI_MAIN)
49 return -EINVAL;
50
51 if (qid >= vsi->num_queue_pairs)
52 return -EINVAL;
53
54 if (qid >= netdev->real_num_rx_queues ||
55 qid >= netdev->real_num_tx_queues)
56 return -EINVAL;
57
58 err = xsk_pool_dma_map(pool, &vsi->back->pdev->dev, I40E_RX_DMA_ATTR);
59 if (err)
60 return err;
61
62 set_bit(qid, vsi->af_xdp_zc_qps);
63
64 if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi);
65
66 if (if_running) {
67 err = i40e_queue_pair_disable(vsi, qid);
68 if (err)
69 return err;
70
71 err = i40e_queue_pair_enable(vsi, qid);
72 if (err)
73 return err;
74
75 /* Kick start the NAPI context so that receiving will start */
76 err = i40e_xsk_wakeup(vsi->netdev, qid, XDP_WAKEUP_RX);
77 if (err)
78 return err;
79 }
80
81 return 0;
82 }
83
84 /**
85 * i40e_xsk_pool_disable - Disassociate an AF_XDP buffer pool from a
86 * certain ring/qid
87 * @vsi: Current VSI
88 * @qid: Rx ring to associate buffer pool with
89 *
90 * Returns 0 on success, <0 on failure
91 **/
i40e_xsk_pool_disable(struct i40e_vsi * vsi,u16 qid)92 static int i40e_xsk_pool_disable(struct i40e_vsi *vsi, u16 qid)
93 {
94 struct net_device *netdev = vsi->netdev;
95 struct xsk_buff_pool *pool;
96 bool if_running;
97 int err;
98
99 pool = xsk_get_pool_from_qid(netdev, qid);
100 if (!pool)
101 return -EINVAL;
102
103 if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi);
104
105 if (if_running) {
106 err = i40e_queue_pair_disable(vsi, qid);
107 if (err)
108 return err;
109 }
110
111 clear_bit(qid, vsi->af_xdp_zc_qps);
112 xsk_pool_dma_unmap(pool, I40E_RX_DMA_ATTR);
113
114 if (if_running) {
115 err = i40e_queue_pair_enable(vsi, qid);
116 if (err)
117 return err;
118 }
119
120 return 0;
121 }
122
123 /**
124 * i40e_xsk_pool_setup - Enable/disassociate an AF_XDP buffer pool to/from
125 * a ring/qid
126 * @vsi: Current VSI
127 * @pool: Buffer pool to enable/associate to a ring, or NULL to disable
128 * @qid: Rx ring to (dis)associate buffer pool (from)to
129 *
130 * This function enables or disables a buffer pool to a certain ring.
131 *
132 * Returns 0 on success, <0 on failure
133 **/
i40e_xsk_pool_setup(struct i40e_vsi * vsi,struct xsk_buff_pool * pool,u16 qid)134 int i40e_xsk_pool_setup(struct i40e_vsi *vsi, struct xsk_buff_pool *pool,
135 u16 qid)
136 {
137 return pool ? i40e_xsk_pool_enable(vsi, pool, qid) :
138 i40e_xsk_pool_disable(vsi, qid);
139 }
140
141 /**
142 * i40e_run_xdp_zc - Executes an XDP program on an xdp_buff
143 * @rx_ring: Rx ring
144 * @xdp: xdp_buff used as input to the XDP program
145 *
146 * Returns any of I40E_XDP_{PASS, CONSUMED, TX, REDIR}
147 **/
i40e_run_xdp_zc(struct i40e_ring * rx_ring,struct xdp_buff * xdp)148 static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
149 {
150 int err, result = I40E_XDP_PASS;
151 struct i40e_ring *xdp_ring;
152 struct bpf_prog *xdp_prog;
153 u32 act;
154
155 rcu_read_lock();
156 /* NB! xdp_prog will always be !NULL, due to the fact that
157 * this path is enabled by setting an XDP program.
158 */
159 xdp_prog = READ_ONCE(rx_ring->xdp_prog);
160 act = bpf_prog_run_xdp(xdp_prog, xdp);
161
162 if (likely(act == XDP_REDIRECT)) {
163 err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
164 if (err)
165 goto out_failure;
166 rcu_read_unlock();
167 return I40E_XDP_REDIR;
168 }
169
170 switch (act) {
171 case XDP_PASS:
172 break;
173 case XDP_TX:
174 xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
175 result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
176 if (result == I40E_XDP_CONSUMED)
177 goto out_failure;
178 break;
179 default:
180 bpf_warn_invalid_xdp_action(act);
181 fallthrough;
182 case XDP_ABORTED:
183 out_failure:
184 trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
185 fallthrough; /* handle aborts by dropping packet */
186 case XDP_DROP:
187 result = I40E_XDP_CONSUMED;
188 break;
189 }
190 rcu_read_unlock();
191 return result;
192 }
193
i40e_alloc_rx_buffers_zc(struct i40e_ring * rx_ring,u16 count)194 bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
195 {
196 u16 ntu = rx_ring->next_to_use;
197 union i40e_rx_desc *rx_desc;
198 struct xdp_buff **bi, *xdp;
199 dma_addr_t dma;
200 bool ok = true;
201
202 rx_desc = I40E_RX_DESC(rx_ring, ntu);
203 bi = i40e_rx_bi(rx_ring, ntu);
204 do {
205 xdp = xsk_buff_alloc(rx_ring->xsk_pool);
206 if (!xdp) {
207 ok = false;
208 goto no_buffers;
209 }
210 *bi = xdp;
211 dma = xsk_buff_xdp_get_dma(xdp);
212 rx_desc->read.pkt_addr = cpu_to_le64(dma);
213 rx_desc->read.hdr_addr = 0;
214
215 rx_desc++;
216 bi++;
217 ntu++;
218
219 if (unlikely(ntu == rx_ring->count)) {
220 rx_desc = I40E_RX_DESC(rx_ring, 0);
221 bi = i40e_rx_bi(rx_ring, 0);
222 ntu = 0;
223 }
224
225 count--;
226 } while (count);
227
228 no_buffers:
229 if (rx_ring->next_to_use != ntu) {
230 /* clear the status bits for the next_to_use descriptor */
231 rx_desc->wb.qword1.status_error_len = 0;
232 i40e_release_rx_desc(rx_ring, ntu);
233 }
234
235 return ok;
236 }
237
238 /**
239 * i40e_construct_skb_zc - Create skbuff from zero-copy Rx buffer
240 * @rx_ring: Rx ring
241 * @xdp: xdp_buff
242 *
243 * This functions allocates a new skb from a zero-copy Rx buffer.
244 *
245 * Returns the skb, or NULL on failure.
246 **/
i40e_construct_skb_zc(struct i40e_ring * rx_ring,struct xdp_buff * xdp)247 static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
248 struct xdp_buff *xdp)
249 {
250 unsigned int metasize = xdp->data - xdp->data_meta;
251 unsigned int datasize = xdp->data_end - xdp->data;
252 struct sk_buff *skb;
253
254 /* allocate a skb to store the frags */
255 skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
256 xdp->data_end - xdp->data_hard_start,
257 GFP_ATOMIC | __GFP_NOWARN);
258 if (unlikely(!skb))
259 return NULL;
260
261 skb_reserve(skb, xdp->data - xdp->data_hard_start);
262 memcpy(__skb_put(skb, datasize), xdp->data, datasize);
263 if (metasize)
264 skb_metadata_set(skb, metasize);
265
266 xsk_buff_free(xdp);
267 return skb;
268 }
269
270 /**
271 * i40e_inc_ntc: Advance the next_to_clean index
272 * @rx_ring: Rx ring
273 **/
i40e_inc_ntc(struct i40e_ring * rx_ring)274 static void i40e_inc_ntc(struct i40e_ring *rx_ring)
275 {
276 u32 ntc = rx_ring->next_to_clean + 1;
277
278 ntc = (ntc < rx_ring->count) ? ntc : 0;
279 rx_ring->next_to_clean = ntc;
280 }
281
282 /**
283 * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring
284 * @rx_ring: Rx ring
285 * @budget: NAPI budget
286 *
287 * Returns amount of work completed
288 **/
i40e_clean_rx_irq_zc(struct i40e_ring * rx_ring,int budget)289 int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
290 {
291 unsigned int total_rx_bytes = 0, total_rx_packets = 0;
292 u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
293 unsigned int xdp_res, xdp_xmit = 0;
294 bool failure = false;
295 struct sk_buff *skb;
296
297 while (likely(total_rx_packets < (unsigned int)budget)) {
298 union i40e_rx_desc *rx_desc;
299 struct xdp_buff **bi;
300 unsigned int size;
301 u64 qword;
302
303 rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
304 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
305
306 /* This memory barrier is needed to keep us from reading
307 * any other fields out of the rx_desc until we have
308 * verified the descriptor has been written back.
309 */
310 dma_rmb();
311
312 if (i40e_rx_is_programming_status(qword)) {
313 i40e_clean_programming_status(rx_ring,
314 rx_desc->raw.qword[0],
315 qword);
316 bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
317 xsk_buff_free(*bi);
318 *bi = NULL;
319 cleaned_count++;
320 i40e_inc_ntc(rx_ring);
321 continue;
322 }
323
324 bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
325 size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
326 I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
327 if (!size)
328 break;
329
330 bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
331 (*bi)->data_end = (*bi)->data + size;
332 xsk_buff_dma_sync_for_cpu(*bi, rx_ring->xsk_pool);
333
334 xdp_res = i40e_run_xdp_zc(rx_ring, *bi);
335 if (xdp_res) {
336 if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR))
337 xdp_xmit |= xdp_res;
338 else
339 xsk_buff_free(*bi);
340
341 *bi = NULL;
342 total_rx_bytes += size;
343 total_rx_packets++;
344
345 cleaned_count++;
346 i40e_inc_ntc(rx_ring);
347 continue;
348 }
349
350 /* XDP_PASS path */
351
352 /* NB! We are not checking for errors using
353 * i40e_test_staterr with
354 * BIT(I40E_RXD_QW1_ERROR_SHIFT). This is due to that
355 * SBP is *not* set in PRT_SBPVSI (default not set).
356 */
357 skb = i40e_construct_skb_zc(rx_ring, *bi);
358 if (!skb) {
359 rx_ring->rx_stats.alloc_buff_failed++;
360 break;
361 }
362
363 *bi = NULL;
364 cleaned_count++;
365 i40e_inc_ntc(rx_ring);
366
367 if (eth_skb_pad(skb))
368 continue;
369
370 total_rx_bytes += skb->len;
371 total_rx_packets++;
372
373 i40e_process_skb_fields(rx_ring, rx_desc, skb);
374 napi_gro_receive(&rx_ring->q_vector->napi, skb);
375 }
376
377 if (cleaned_count >= I40E_RX_BUFFER_WRITE)
378 failure = !i40e_alloc_rx_buffers_zc(rx_ring, cleaned_count);
379
380 i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
381 i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
382
383 if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) {
384 if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
385 xsk_set_rx_need_wakeup(rx_ring->xsk_pool);
386 else
387 xsk_clear_rx_need_wakeup(rx_ring->xsk_pool);
388
389 return (int)total_rx_packets;
390 }
391 return failure ? budget : (int)total_rx_packets;
392 }
393
394 /**
395 * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
396 * @xdp_ring: XDP Tx ring
397 * @budget: NAPI budget
398 *
399 * Returns true if the work is finished.
400 **/
i40e_xmit_zc(struct i40e_ring * xdp_ring,unsigned int budget)401 static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
402 {
403 unsigned int sent_frames = 0, total_bytes = 0;
404 struct i40e_tx_desc *tx_desc = NULL;
405 struct i40e_tx_buffer *tx_bi;
406 struct xdp_desc desc;
407 dma_addr_t dma;
408
409 while (budget-- > 0) {
410 if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
411 break;
412
413 dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
414 xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
415 desc.len);
416
417 tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
418 tx_bi->bytecount = desc.len;
419
420 tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
421 tx_desc->buffer_addr = cpu_to_le64(dma);
422 tx_desc->cmd_type_offset_bsz =
423 build_ctob(I40E_TX_DESC_CMD_ICRC
424 | I40E_TX_DESC_CMD_EOP,
425 0, desc.len, 0);
426
427 sent_frames++;
428 total_bytes += tx_bi->bytecount;
429
430 xdp_ring->next_to_use++;
431 if (xdp_ring->next_to_use == xdp_ring->count)
432 xdp_ring->next_to_use = 0;
433 }
434
435 if (tx_desc) {
436 /* Request an interrupt for the last frame and bump tail ptr. */
437 tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
438 I40E_TXD_QW1_CMD_SHIFT);
439 i40e_xdp_ring_update_tail(xdp_ring);
440
441 xsk_tx_release(xdp_ring->xsk_pool);
442 i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes);
443 }
444
445 return !!budget;
446 }
447
448 /**
449 * i40e_clean_xdp_tx_buffer - Frees and unmaps an XDP Tx entry
450 * @tx_ring: XDP Tx ring
451 * @tx_bi: Tx buffer info to clean
452 **/
i40e_clean_xdp_tx_buffer(struct i40e_ring * tx_ring,struct i40e_tx_buffer * tx_bi)453 static void i40e_clean_xdp_tx_buffer(struct i40e_ring *tx_ring,
454 struct i40e_tx_buffer *tx_bi)
455 {
456 xdp_return_frame(tx_bi->xdpf);
457 tx_ring->xdp_tx_active--;
458 dma_unmap_single(tx_ring->dev,
459 dma_unmap_addr(tx_bi, dma),
460 dma_unmap_len(tx_bi, len), DMA_TO_DEVICE);
461 dma_unmap_len_set(tx_bi, len, 0);
462 }
463
464 /**
465 * i40e_clean_xdp_tx_irq - Completes AF_XDP entries, and cleans XDP entries
466 * @vsi: Current VSI
467 * @tx_ring: XDP Tx ring
468 *
469 * Returns true if cleanup/tranmission is done.
470 **/
i40e_clean_xdp_tx_irq(struct i40e_vsi * vsi,struct i40e_ring * tx_ring)471 bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi, struct i40e_ring *tx_ring)
472 {
473 struct xsk_buff_pool *bp = tx_ring->xsk_pool;
474 u32 i, completed_frames, xsk_frames = 0;
475 u32 head_idx = i40e_get_head(tx_ring);
476 struct i40e_tx_buffer *tx_bi;
477 unsigned int ntc;
478
479 if (head_idx < tx_ring->next_to_clean)
480 head_idx += tx_ring->count;
481 completed_frames = head_idx - tx_ring->next_to_clean;
482
483 if (completed_frames == 0)
484 goto out_xmit;
485
486 if (likely(!tx_ring->xdp_tx_active)) {
487 xsk_frames = completed_frames;
488 goto skip;
489 }
490
491 ntc = tx_ring->next_to_clean;
492
493 for (i = 0; i < completed_frames; i++) {
494 tx_bi = &tx_ring->tx_bi[ntc];
495
496 if (tx_bi->xdpf) {
497 i40e_clean_xdp_tx_buffer(tx_ring, tx_bi);
498 tx_bi->xdpf = NULL;
499 } else {
500 xsk_frames++;
501 }
502
503 if (++ntc >= tx_ring->count)
504 ntc = 0;
505 }
506
507 skip:
508 tx_ring->next_to_clean += completed_frames;
509 if (unlikely(tx_ring->next_to_clean >= tx_ring->count))
510 tx_ring->next_to_clean -= tx_ring->count;
511
512 if (xsk_frames)
513 xsk_tx_completed(bp, xsk_frames);
514
515 i40e_arm_wb(tx_ring, vsi, completed_frames);
516
517 out_xmit:
518 if (xsk_uses_need_wakeup(tx_ring->xsk_pool))
519 xsk_set_tx_need_wakeup(tx_ring->xsk_pool);
520
521 return i40e_xmit_zc(tx_ring, I40E_DESC_UNUSED(tx_ring));
522 }
523
524 /**
525 * i40e_xsk_wakeup - Implements the ndo_xsk_wakeup
526 * @dev: the netdevice
527 * @queue_id: queue id to wake up
528 * @flags: ignored in our case since we have Rx and Tx in the same NAPI.
529 *
530 * Returns <0 for errors, 0 otherwise.
531 **/
i40e_xsk_wakeup(struct net_device * dev,u32 queue_id,u32 flags)532 int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
533 {
534 struct i40e_netdev_priv *np = netdev_priv(dev);
535 struct i40e_vsi *vsi = np->vsi;
536 struct i40e_pf *pf = vsi->back;
537 struct i40e_ring *ring;
538
539 if (test_bit(__I40E_CONFIG_BUSY, pf->state))
540 return -EAGAIN;
541
542 if (test_bit(__I40E_VSI_DOWN, vsi->state))
543 return -ENETDOWN;
544
545 if (!i40e_enabled_xdp_vsi(vsi))
546 return -ENXIO;
547
548 if (queue_id >= vsi->num_queue_pairs)
549 return -ENXIO;
550
551 if (!vsi->xdp_rings[queue_id]->xsk_pool)
552 return -ENXIO;
553
554 ring = vsi->xdp_rings[queue_id];
555
556 /* The idea here is that if NAPI is running, mark a miss, so
557 * it will run again. If not, trigger an interrupt and
558 * schedule the NAPI from interrupt context. If NAPI would be
559 * scheduled here, the interrupt affinity would not be
560 * honored.
561 */
562 if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi))
563 i40e_force_wb(vsi, ring->q_vector);
564
565 return 0;
566 }
567
i40e_xsk_clean_rx_ring(struct i40e_ring * rx_ring)568 void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring)
569 {
570 u16 i;
571
572 for (i = 0; i < rx_ring->count; i++) {
573 struct xdp_buff *rx_bi = *i40e_rx_bi(rx_ring, i);
574
575 if (!rx_bi)
576 continue;
577
578 xsk_buff_free(rx_bi);
579 rx_bi = NULL;
580 }
581 }
582
583 /**
584 * i40e_xsk_clean_xdp_ring - Clean the XDP Tx ring on shutdown
585 * @tx_ring: XDP Tx ring
586 **/
i40e_xsk_clean_tx_ring(struct i40e_ring * tx_ring)587 void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring)
588 {
589 u16 ntc = tx_ring->next_to_clean, ntu = tx_ring->next_to_use;
590 struct xsk_buff_pool *bp = tx_ring->xsk_pool;
591 struct i40e_tx_buffer *tx_bi;
592 u32 xsk_frames = 0;
593
594 while (ntc != ntu) {
595 tx_bi = &tx_ring->tx_bi[ntc];
596
597 if (tx_bi->xdpf)
598 i40e_clean_xdp_tx_buffer(tx_ring, tx_bi);
599 else
600 xsk_frames++;
601
602 tx_bi->xdpf = NULL;
603
604 ntc++;
605 if (ntc >= tx_ring->count)
606 ntc = 0;
607 }
608
609 if (xsk_frames)
610 xsk_tx_completed(bp, xsk_frames);
611 }
612
613 /**
614 * i40e_xsk_any_rx_ring_enabled - Checks if Rx rings have an AF_XDP
615 * buffer pool attached
616 * @vsi: vsi
617 *
618 * Returns true if any of the Rx rings has an AF_XDP buffer pool attached
619 **/
i40e_xsk_any_rx_ring_enabled(struct i40e_vsi * vsi)620 bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi)
621 {
622 struct net_device *netdev = vsi->netdev;
623 int i;
624
625 for (i = 0; i < vsi->num_queue_pairs; i++) {
626 if (xsk_get_pool_from_qid(netdev, i))
627 return true;
628 }
629
630 return false;
631 }
632