1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets provide a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock_drv.h>
26 #include <net/busy_poll.h>
27 #include <net/xdp.h>
28 
29 #include "xsk_queue.h"
30 #include "xdp_umem.h"
31 #include "xsk.h"
32 
33 #define TX_BATCH_SIZE 32
34 
35 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
36 
37 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
38 {
39 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
40 		return;
41 
42 	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
43 	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
44 }
45 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
46 
47 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
48 {
49 	struct xdp_sock *xs;
50 
51 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
52 		return;
53 
54 	rcu_read_lock();
55 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
56 		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
57 	}
58 	rcu_read_unlock();
59 
60 	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
61 }
62 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
63 
64 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
65 {
66 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
67 		return;
68 
69 	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
70 	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
71 }
72 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
73 
74 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
75 {
76 	struct xdp_sock *xs;
77 
78 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
79 		return;
80 
81 	rcu_read_lock();
82 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
83 		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
84 	}
85 	rcu_read_unlock();
86 
87 	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
88 }
89 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
90 
91 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
92 {
93 	return pool->uses_need_wakeup;
94 }
95 EXPORT_SYMBOL(xsk_uses_need_wakeup);
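/* Illustrative sketch of how a zero-copy driver is expected to drive the
 * need_wakeup helpers above from its napi Rx poll; my_pool, fill_ring_ran_dry
 * and budget_left are hypothetical driver state, not names from this file:
 *
 *	if (xsk_uses_need_wakeup(my_pool)) {
 *		if (fill_ring_ran_dry || !budget_left)
 *			xsk_set_rx_need_wakeup(my_pool);
 *		else
 *			xsk_clear_rx_need_wakeup(my_pool);
 *	}
 *
 * With the flag set, user space sees XDP_RING_NEED_WAKEUP on the fill ring
 * and kicks the kernel with poll()/recvfrom() instead of busy-waiting.
 */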
96 
97 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
98 					    u16 queue_id)
99 {
100 	if (queue_id < dev->real_num_rx_queues)
101 		return dev->_rx[queue_id].pool;
102 	if (queue_id < dev->real_num_tx_queues)
103 		return dev->_tx[queue_id].pool;
104 
105 	return NULL;
106 }
107 EXPORT_SYMBOL(xsk_get_pool_from_qid);
108 
109 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
110 {
111 	if (queue_id < dev->num_rx_queues)
112 		dev->_rx[queue_id].pool = NULL;
113 	if (queue_id < dev->num_tx_queues)
114 		dev->_tx[queue_id].pool = NULL;
115 }
116 
117 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
118  * not know if the device has more tx queues than rx, or the opposite.
119  * This might also change during run time.
120  */
121 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
122 			u16 queue_id)
123 {
124 	if (queue_id >= max_t(unsigned int,
125 			      dev->real_num_rx_queues,
126 			      dev->real_num_tx_queues))
127 		return -EINVAL;
128 
129 	if (queue_id < dev->real_num_rx_queues)
130 		dev->_rx[queue_id].pool = pool;
131 	if (queue_id < dev->real_num_tx_queues)
132 		dev->_tx[queue_id].pool = pool;
133 
134 	return 0;
135 }
136 
137 void xp_release(struct xdp_buff_xsk *xskb)
138 {
139 	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
140 }
141 
142 static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
143 {
144 	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
145 
146 	offset += xskb->pool->headroom;
147 	if (!xskb->pool->unaligned)
148 		return xskb->orig_addr + offset;
149 	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
150 }
151 
152 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
153 {
154 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
155 	u64 addr;
156 	int err;
157 
158 	addr = xp_get_handle(xskb);
159 	err = xskq_prod_reserve_desc(xs->rx, addr, len);
160 	if (err) {
161 		xs->rx_queue_full++;
162 		return err;
163 	}
164 
165 	xp_release(xskb);
166 	return 0;
167 }
168 
169 static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
170 {
171 	void *from_buf, *to_buf;
172 	u32 metalen;
173 
174 	if (unlikely(xdp_data_meta_unsupported(from))) {
175 		from_buf = from->data;
176 		to_buf = to->data;
177 		metalen = 0;
178 	} else {
179 		from_buf = from->data_meta;
180 		metalen = from->data - from->data_meta;
181 		to_buf = to->data - metalen;
182 	}
183 
184 	memcpy(to_buf, from_buf, len + metalen);
185 }
186 
187 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
188 {
189 	struct xdp_buff *xsk_xdp;
190 	int err;
191 	u32 len;
192 
193 	len = xdp->data_end - xdp->data;
194 	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
195 		xs->rx_dropped++;
196 		return -ENOSPC;
197 	}
198 
199 	xsk_xdp = xsk_buff_alloc(xs->pool);
200 	if (!xsk_xdp) {
201 		xs->rx_dropped++;
202 		return -ENOSPC;
203 	}
204 
205 	xsk_copy_xdp(xsk_xdp, xdp, len);
206 	err = __xsk_rcv_zc(xs, xsk_xdp, len);
207 	if (err) {
208 		xsk_buff_free(xsk_xdp);
209 		return err;
210 	}
211 	return 0;
212 }
213 
214 static bool xsk_tx_writeable(struct xdp_sock *xs)
215 {
216 	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
217 		return false;
218 
219 	return true;
220 }
221 
222 static bool xsk_is_bound(struct xdp_sock *xs)
223 {
224 	if (READ_ONCE(xs->state) == XSK_BOUND) {
225 		/* Matches smp_wmb() in bind(). */
226 		smp_rmb();
227 		return true;
228 	}
229 	return false;
230 }
231 
232 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
233 {
234 	if (!xsk_is_bound(xs))
235 		return -EINVAL;
236 
237 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
238 		return -EINVAL;
239 
240 	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
241 	return 0;
242 }
243 
244 static void xsk_flush(struct xdp_sock *xs)
245 {
246 	xskq_prod_submit(xs->rx);
247 	__xskq_cons_release(xs->pool->fq);
248 	sock_def_readable(&xs->sk);
249 }
250 
251 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
252 {
253 	int err;
254 
255 	spin_lock_bh(&xs->rx_lock);
256 	err = xsk_rcv_check(xs, xdp);
257 	if (!err) {
258 		err = __xsk_rcv(xs, xdp);
259 		xsk_flush(xs);
260 	}
261 	spin_unlock_bh(&xs->rx_lock);
262 	return err;
263 }
264 
265 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
266 {
267 	int err;
268 	u32 len;
269 
270 	err = xsk_rcv_check(xs, xdp);
271 	if (err)
272 		return err;
273 
274 	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
275 		len = xdp->data_end - xdp->data;
276 		return __xsk_rcv_zc(xs, xdp, len);
277 	}
278 
279 	err = __xsk_rcv(xs, xdp);
280 	if (!err)
281 		xdp_return_buff(xdp);
282 	return err;
283 }
284 
285 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
286 {
287 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
288 	int err;
289 
290 	err = xsk_rcv(xs, xdp);
291 	if (err)
292 		return err;
293 
294 	if (!xs->flush_node.prev)
295 		list_add(&xs->flush_node, flush_list);
296 
297 	return 0;
298 }
299 
300 void __xsk_map_flush(void)
301 {
302 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
303 	struct xdp_sock *xs, *tmp;
304 
305 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
306 		xsk_flush(xs);
307 		__list_del_clearprev(&xs->flush_node);
308 	}
309 }
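/* The per-cpu flush list above is fed by bpf_redirect_map() into an XSKMAP.
 * A minimal, libbpf-style XDP program that steers packets to the socket bound
 * to the receiving queue might look roughly like this (sketch only; the map
 * size and section names are assumptions):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_XSKMAP);
 *		__uint(max_entries, 64);
 *		__type(key, __u32);
 *		__type(value, __u32);
 *	} xsks_map SEC(".maps");
 *
 *	SEC("xdp")
 *	int xsk_redirect(struct xdp_md *ctx)
 *	{
 *		return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
 *	}
 */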
310 
311 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
312 {
313 	xskq_prod_submit_n(pool->cq, nb_entries);
314 }
315 EXPORT_SYMBOL(xsk_tx_completed);
316 
317 void xsk_tx_release(struct xsk_buff_pool *pool)
318 {
319 	struct xdp_sock *xs;
320 
321 	rcu_read_lock();
322 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
323 		__xskq_cons_release(xs->tx);
324 		if (xsk_tx_writeable(xs))
325 			xs->sk.sk_write_space(&xs->sk);
326 	}
327 	rcu_read_unlock();
328 }
329 EXPORT_SYMBOL(xsk_tx_release);
330 
331 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
332 {
333 	struct xdp_sock *xs;
334 
335 	rcu_read_lock();
336 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
337 		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
338 			xs->tx->queue_empty_descs++;
339 			continue;
340 		}
341 
342 		/* This is the backpressure mechanism for the Tx path.
343 		 * Reserve space in the completion queue and only proceed
344 		 * if there is space in it. This avoids having to implement
345 		 * any buffering in the Tx path.
346 		 */
347 		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
348 			goto out;
349 
350 		xskq_cons_release(xs->tx);
351 		rcu_read_unlock();
352 		return true;
353 	}
354 
355 out:
356 	rcu_read_unlock();
357 	return false;
358 }
359 EXPORT_SYMBOL(xsk_tx_peek_desc);
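/* Rough sketch of a driver's zero-copy Tx path built on the helper above;
 * budget, completed and the hardware posting step are hypothetical:
 *
 *	struct xdp_desc desc;
 *	u32 sent = 0;
 *
 *	while (budget-- && xsk_tx_peek_desc(pool, &desc)) {
 *		dma_addr_t dma = xsk_buff_raw_get_dma(pool, desc.addr);
 *
 *		xsk_buff_raw_dma_sync_for_device(pool, dma, desc.len);
 *		// post dma/desc.len to the hardware Tx ring (driver specific)
 *		sent++;
 *	}
 *	if (sent)
 *		xsk_tx_release(pool);
 *
 * Later, from Tx completion handling, xsk_tx_completed(pool, completed)
 * publishes the finished addresses on the completion ring.
 */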
360 
361 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
362 {
363 	struct xdp_desc *descs = pool->tx_descs;
364 	u32 nb_pkts = 0;
365 
366 	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
367 		nb_pkts++;
368 
369 	xsk_tx_release(pool);
370 	return nb_pkts;
371 }
372 
373 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
374 {
375 	struct xdp_sock *xs;
376 
377 	rcu_read_lock();
378 	if (!list_is_singular(&pool->xsk_tx_list)) {
379 		/* Fallback to the non-batched version */
380 		rcu_read_unlock();
381 		return xsk_tx_peek_release_fallback(pool, nb_pkts);
382 	}
383 
384 	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
385 	if (!xs) {
386 		nb_pkts = 0;
387 		goto out;
388 	}
389 
390 	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
391 
392 	/* This is the backpressure mechanism for the Tx path. Try to
393 	 * reserve space in the completion queue for all packets, but
394 	 * if there are fewer slots available, just process that many
395 	 * packets. This avoids having to implement any buffering in
396 	 * the Tx path.
397 	 */
398 	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
399 	if (!nb_pkts)
400 		goto out;
401 
402 	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
403 	if (!nb_pkts) {
404 		xs->tx->queue_empty_descs++;
405 		goto out;
406 	}
407 
408 	__xskq_cons_release(xs->tx);
409 	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
410 	xs->sk.sk_write_space(&xs->sk);
411 
412 out:
413 	rcu_read_unlock();
414 	return nb_pkts;
415 }
416 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
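/* The batched variant is meant for drivers that can post several descriptors
 * at once; a hedged sketch (budget is a hypothetical napi budget):
 *
 *	u32 i, nb = xsk_tx_peek_release_desc_batch(pool, budget);
 *
 *	for (i = 0; i < nb; i++) {
 *		struct xdp_desc *d = &pool->tx_descs[i];
 *		dma_addr_t dma = xsk_buff_raw_get_dma(pool, d->addr);
 *
 *		// queue dma/d->len on the hardware Tx ring (driver specific)
 *	}
 */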
417 
418 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
419 {
420 	struct net_device *dev = xs->dev;
421 
422 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
423 }
424 
425 static void xsk_destruct_skb(struct sk_buff *skb)
426 {
427 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
428 	struct xdp_sock *xs = xdp_sk(skb->sk);
429 	unsigned long flags;
430 
431 	spin_lock_irqsave(&xs->pool->cq_lock, flags);
432 	xskq_prod_submit_addr(xs->pool->cq, addr);
433 	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
434 
435 	sock_wfree(skb);
436 }
437 
438 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
439 					      struct xdp_desc *desc)
440 {
441 	struct xsk_buff_pool *pool = xs->pool;
442 	u32 hr, len, ts, offset, copy, copied;
443 	struct sk_buff *skb;
444 	struct page *page;
445 	void *buffer;
446 	int err, i;
447 	u64 addr;
448 
449 	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
450 
451 	skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
452 	if (unlikely(!skb))
453 		return ERR_PTR(err);
454 
455 	skb_reserve(skb, hr);
456 
457 	addr = desc->addr;
458 	len = desc->len;
459 	ts = pool->unaligned ? len : pool->chunk_size;
460 
461 	buffer = xsk_buff_raw_get_data(pool, addr);
462 	offset = offset_in_page(buffer);
463 	addr = buffer - pool->addrs;
464 
465 	for (copied = 0, i = 0; copied < len; i++) {
466 		page = pool->umem->pgs[addr >> PAGE_SHIFT];
467 		get_page(page);
468 
469 		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
470 		skb_fill_page_desc(skb, i, page, offset, copy);
471 
472 		copied += copy;
473 		addr += copy;
474 		offset = 0;
475 	}
476 
477 	skb->len += len;
478 	skb->data_len += len;
479 	skb->truesize += ts;
480 
481 	refcount_add(ts, &xs->sk.sk_wmem_alloc);
482 
483 	return skb;
484 }
485 
486 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
487 				     struct xdp_desc *desc)
488 {
489 	struct net_device *dev = xs->dev;
490 	struct sk_buff *skb;
491 
492 	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
493 		skb = xsk_build_skb_zerocopy(xs, desc);
494 		if (IS_ERR(skb))
495 			return skb;
496 	} else {
497 		u32 hr, tr, len;
498 		void *buffer;
499 		int err;
500 
501 		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
502 		tr = dev->needed_tailroom;
503 		len = desc->len;
504 
505 		skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
506 		if (unlikely(!skb))
507 			return ERR_PTR(err);
508 
509 		skb_reserve(skb, hr);
510 		skb_put(skb, len);
511 
512 		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
513 		err = skb_store_bits(skb, 0, buffer, len);
514 		if (unlikely(err)) {
515 			kfree_skb(skb);
516 			return ERR_PTR(err);
517 		}
518 	}
519 
520 	skb->dev = dev;
521 	skb->priority = xs->sk.sk_priority;
522 	skb->mark = xs->sk.sk_mark;
523 	skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
524 	skb->destructor = xsk_destruct_skb;
525 
526 	return skb;
527 }
528 
529 static int xsk_generic_xmit(struct sock *sk)
530 {
531 	struct xdp_sock *xs = xdp_sk(sk);
532 	u32 max_batch = TX_BATCH_SIZE;
533 	bool sent_frame = false;
534 	struct xdp_desc desc;
535 	struct sk_buff *skb;
536 	unsigned long flags;
537 	int err = 0;
538 
539 	mutex_lock(&xs->mutex);
540 
541 	/* Since we dropped the RCU read lock, the socket state might have changed. */
542 	if (unlikely(!xsk_is_bound(xs))) {
543 		err = -ENXIO;
544 		goto out;
545 	}
546 
547 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
548 		goto out;
549 
550 	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
551 		if (max_batch-- == 0) {
552 			err = -EAGAIN;
553 			goto out;
554 		}
555 
556 		/* This is the backpressure mechanism for the Tx path.
557 		 * Reserve space in the completion queue and only proceed
558 		 * if there is space in it. This avoids having to implement
559 		 * any buffering in the Tx path.
560 		 */
561 		spin_lock_irqsave(&xs->pool->cq_lock, flags);
562 		if (xskq_prod_reserve(xs->pool->cq)) {
563 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
564 			goto out;
565 		}
566 		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
567 
568 		skb = xsk_build_skb(xs, &desc);
569 		if (IS_ERR(skb)) {
570 			err = PTR_ERR(skb);
571 			spin_lock_irqsave(&xs->pool->cq_lock, flags);
572 			xskq_prod_cancel(xs->pool->cq);
573 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
574 			goto out;
575 		}
576 
577 		err = __dev_direct_xmit(skb, xs->queue_id);
578 		if (err == NETDEV_TX_BUSY) {
579 			/* Tell user-space to retry the send */
580 			skb->destructor = sock_wfree;
581 			spin_lock_irqsave(&xs->pool->cq_lock, flags);
582 			xskq_prod_cancel(xs->pool->cq);
583 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
584 			/* Free skb without triggering the perf drop trace */
585 			consume_skb(skb);
586 			err = -EAGAIN;
587 			goto out;
588 		}
589 
590 		xskq_cons_release(xs->tx);
591 		/* Ignore NET_XMIT_CN as packet might have been sent */
592 		if (err == NET_XMIT_DROP) {
593 			/* SKB completed but not sent */
594 			err = -EBUSY;
595 			goto out;
596 		}
597 
598 		sent_frame = true;
599 	}
600 
601 	xs->tx->queue_empty_descs++;
602 
603 out:
604 	if (sent_frame)
605 		if (xsk_tx_writeable(xs))
606 			sk->sk_write_space(sk);
607 
608 	mutex_unlock(&xs->mutex);
609 	return err;
610 }
611 
612 static int xsk_xmit(struct sock *sk)
613 {
614 	struct xdp_sock *xs = xdp_sk(sk);
615 	int ret;
616 
617 	if (unlikely(!(xs->dev->flags & IFF_UP)))
618 		return -ENETDOWN;
619 	if (unlikely(!xs->tx))
620 		return -ENOBUFS;
621 
622 	if (xs->zc)
623 		return xsk_wakeup(xs, XDP_WAKEUP_TX);
624 
625 	/* Drop the RCU lock since the SKB path might sleep. */
626 	rcu_read_unlock();
627 	ret = xsk_generic_xmit(sk);
628 	/* Reacquire RCU lock before going into common code. */
629 	rcu_read_lock();
630 
631 	return ret;
632 }
633 
634 static bool xsk_no_wakeup(struct sock *sk)
635 {
636 #ifdef CONFIG_NET_RX_BUSY_POLL
637 	/* Prefer busy-polling, skip the wakeup. */
638 	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
639 		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
640 #else
641 	return false;
642 #endif
643 }
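/* The busy-poll preference checked above is configured from user space with
 * ordinary socket options, e.g. (xsk_fd and the values are illustrative):
 *
 *	int one = 1, usecs = 20, budget = 64;
 *
 *	setsockopt(xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &one, sizeof(one));
 *	setsockopt(xsk_fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
 *	setsockopt(xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &budget, sizeof(budget));
 */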
644 
645 static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
646 {
647 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
648 	struct sock *sk = sock->sk;
649 	struct xdp_sock *xs = xdp_sk(sk);
650 	struct xsk_buff_pool *pool;
651 
652 	if (unlikely(!xsk_is_bound(xs)))
653 		return -ENXIO;
654 	if (unlikely(need_wait))
655 		return -EOPNOTSUPP;
656 
657 	if (sk_can_busy_loop(sk))
658 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
659 
660 	if (xs->zc && xsk_no_wakeup(sk))
661 		return 0;
662 
663 	pool = xs->pool;
664 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
665 		return xsk_xmit(sk);
666 	return 0;
667 }
668 
669 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
670 {
671 	int ret;
672 
673 	rcu_read_lock();
674 	ret = __xsk_sendmsg(sock, m, total_len);
675 	rcu_read_unlock();
676 
677 	return ret;
678 }
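/* From user space, a Tx "kick" is just an empty sendmsg()/sendto(); with
 * XDP_USE_NEED_WAKEUP it only has to be issued when the ring asks for it.
 * Sketch (xsk_fd and tx_ring_flags, the mmap'ed flags word, are assumed):
 *
 *	if (*tx_ring_flags & XDP_RING_NEED_WAKEUP)
 *		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 */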
679 
680 static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
681 {
682 	bool need_wait = !(flags & MSG_DONTWAIT);
683 	struct sock *sk = sock->sk;
684 	struct xdp_sock *xs = xdp_sk(sk);
685 
686 	if (unlikely(!xsk_is_bound(xs)))
687 		return -ENXIO;
688 	if (unlikely(!(xs->dev->flags & IFF_UP)))
689 		return -ENETDOWN;
690 	if (unlikely(!xs->rx))
691 		return -ENOBUFS;
692 	if (unlikely(need_wait))
693 		return -EOPNOTSUPP;
694 
695 	if (sk_can_busy_loop(sk))
696 		sk_busy_loop(sk, 1); /* only support non-blocking sockets */
697 
698 	if (xsk_no_wakeup(sk))
699 		return 0;
700 
701 	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
702 		return xsk_wakeup(xs, XDP_WAKEUP_RX);
703 	return 0;
704 }
705 
706 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
707 {
708 	int ret;
709 
710 	rcu_read_lock();
711 	ret = __xsk_recvmsg(sock, m, len, flags);
712 	rcu_read_unlock();
713 
714 	return ret;
715 }
716 
717 static __poll_t xsk_poll(struct file *file, struct socket *sock,
718 			     struct poll_table_struct *wait)
719 {
720 	__poll_t mask = 0;
721 	struct sock *sk = sock->sk;
722 	struct xdp_sock *xs = xdp_sk(sk);
723 	struct xsk_buff_pool *pool;
724 
725 	sock_poll_wait(file, sock, wait);
726 
727 	rcu_read_lock();
728 	if (unlikely(!xsk_is_bound(xs))) {
729 		rcu_read_unlock();
730 		return mask;
731 	}
732 
733 	pool = xs->pool;
734 
735 	if (pool->cached_need_wakeup) {
736 		if (xs->zc)
737 			xsk_wakeup(xs, pool->cached_need_wakeup);
738 		else
739 			/* Poll needs to drive Tx also in copy mode */
740 			xsk_xmit(sk);
741 	}
742 
743 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
744 		mask |= EPOLLIN | EPOLLRDNORM;
745 	if (xs->tx && xsk_tx_writeable(xs))
746 		mask |= EPOLLOUT | EPOLLWRNORM;
747 
748 	rcu_read_unlock();
749 	return mask;
750 }
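/* User-space side of the poll handler above, as a sketch (xsk_fd and
 * timeout_ms are assumed):
 *
 *	struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN | POLLOUT };
 *
 *	if (poll(&pfd, 1, timeout_ms) > 0) {
 *		if (pfd.revents & POLLIN)
 *			;	// descriptors are waiting on the Rx ring
 *		if (pfd.revents & POLLOUT)
 *			;	// Tx ring is writeable (less than half full)
 *	}
 */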
751 
752 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
753 			  bool umem_queue)
754 {
755 	struct xsk_queue *q;
756 
757 	if (entries == 0 || *queue || !is_power_of_2(entries))
758 		return -EINVAL;
759 
760 	q = xskq_create(entries, umem_queue);
761 	if (!q)
762 		return -ENOMEM;
763 
764 	/* Make sure queue is ready before it can be seen by others */
765 	smp_wmb();
766 	WRITE_ONCE(*queue, q);
767 	return 0;
768 }
769 
770 static void xsk_unbind_dev(struct xdp_sock *xs)
771 {
772 	struct net_device *dev = xs->dev;
773 
774 	if (xs->state != XSK_BOUND)
775 		return;
776 	WRITE_ONCE(xs->state, XSK_UNBOUND);
777 
778 	/* Wait for driver to stop using the xdp socket. */
779 	xp_del_xsk(xs->pool, xs);
780 	synchronize_net();
781 	dev_put(dev);
782 }
783 
784 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
785 					      struct xdp_sock __rcu ***map_entry)
786 {
787 	struct xsk_map *map = NULL;
788 	struct xsk_map_node *node;
789 
790 	*map_entry = NULL;
791 
792 	spin_lock_bh(&xs->map_list_lock);
793 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
794 					node);
795 	if (node) {
796 		bpf_map_inc(&node->map->map);
797 		map = node->map;
798 		*map_entry = node->map_entry;
799 	}
800 	spin_unlock_bh(&xs->map_list_lock);
801 	return map;
802 }
803 
804 static void xsk_delete_from_maps(struct xdp_sock *xs)
805 {
806 	/* This function removes the current XDP socket from all the
807 	 * maps it resides in. We need to take extra care here, due to
808 	 * the two locks involved. Each map has a lock synchronizing
809 	 * updates to the entries, and each socket has a lock that
810 	 * synchronizes access to the list of maps (map_list). For
811 	 * deadlock avoidance the locks need to be taken in the order
812 	 * "map lock"->"socket map list lock". We start off by
813 	 * accessing the socket map list, and take a reference to the
814 	 * map to guarantee existence between the
815 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
816 	 * calls. Then we ask the map to remove the socket, which
817 	 * tries to remove the socket from the map. Note that there
818 	 * might be updates to the map between
819 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
820 	 */
821 	struct xdp_sock __rcu **map_entry = NULL;
822 	struct xsk_map *map;
823 
824 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
825 		xsk_map_try_sock_delete(map, xs, map_entry);
826 		bpf_map_put(&map->map);
827 	}
828 }
829 
830 static int xsk_release(struct socket *sock)
831 {
832 	struct sock *sk = sock->sk;
833 	struct xdp_sock *xs = xdp_sk(sk);
834 	struct net *net;
835 
836 	if (!sk)
837 		return 0;
838 
839 	net = sock_net(sk);
840 
841 	mutex_lock(&net->xdp.lock);
842 	sk_del_node_init_rcu(sk);
843 	mutex_unlock(&net->xdp.lock);
844 
845 	local_bh_disable();
846 	sock_prot_inuse_add(net, sk->sk_prot, -1);
847 	local_bh_enable();
848 
849 	xsk_delete_from_maps(xs);
850 	mutex_lock(&xs->mutex);
851 	xsk_unbind_dev(xs);
852 	mutex_unlock(&xs->mutex);
853 
854 	xskq_destroy(xs->rx);
855 	xskq_destroy(xs->tx);
856 	xskq_destroy(xs->fq_tmp);
857 	xskq_destroy(xs->cq_tmp);
858 
859 	sock_orphan(sk);
860 	sock->sk = NULL;
861 
862 	sk_refcnt_debug_release(sk);
863 	sock_put(sk);
864 
865 	return 0;
866 }
867 
868 static struct socket *xsk_lookup_xsk_from_fd(int fd)
869 {
870 	struct socket *sock;
871 	int err;
872 
873 	sock = sockfd_lookup(fd, &err);
874 	if (!sock)
875 		return ERR_PTR(-ENOTSOCK);
876 
877 	if (sock->sk->sk_family != PF_XDP) {
878 		sockfd_put(sock);
879 		return ERR_PTR(-ENOPROTOOPT);
880 	}
881 
882 	return sock;
883 }
884 
885 static bool xsk_validate_queues(struct xdp_sock *xs)
886 {
887 	return xs->fq_tmp && xs->cq_tmp;
888 }
889 
890 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
891 {
892 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
893 	struct sock *sk = sock->sk;
894 	struct xdp_sock *xs = xdp_sk(sk);
895 	struct net_device *dev;
896 	int bound_dev_if;
897 	u32 flags, qid;
898 	int err = 0;
899 
900 	if (addr_len < sizeof(struct sockaddr_xdp))
901 		return -EINVAL;
902 	if (sxdp->sxdp_family != AF_XDP)
903 		return -EINVAL;
904 
905 	flags = sxdp->sxdp_flags;
906 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
907 		      XDP_USE_NEED_WAKEUP))
908 		return -EINVAL;
909 
910 	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
911 	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
912 		return -EINVAL;
913 
914 	rtnl_lock();
915 	mutex_lock(&xs->mutex);
916 	if (xs->state != XSK_READY) {
917 		err = -EBUSY;
918 		goto out_release;
919 	}
920 
921 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
922 	if (!dev) {
923 		err = -ENODEV;
924 		goto out_release;
925 	}
926 
927 	if (!xs->rx && !xs->tx) {
928 		err = -EINVAL;
929 		goto out_unlock;
930 	}
931 
932 	qid = sxdp->sxdp_queue_id;
933 
934 	if (flags & XDP_SHARED_UMEM) {
935 		struct xdp_sock *umem_xs;
936 		struct socket *sock;
937 
938 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
939 		    (flags & XDP_USE_NEED_WAKEUP)) {
940 			/* Cannot specify flags for shared sockets. */
941 			err = -EINVAL;
942 			goto out_unlock;
943 		}
944 
945 		if (xs->umem) {
946 			/* We already have our own umem. */
947 			err = -EINVAL;
948 			goto out_unlock;
949 		}
950 
951 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
952 		if (IS_ERR(sock)) {
953 			err = PTR_ERR(sock);
954 			goto out_unlock;
955 		}
956 
957 		umem_xs = xdp_sk(sock->sk);
958 		if (!xsk_is_bound(umem_xs)) {
959 			err = -EBADF;
960 			sockfd_put(sock);
961 			goto out_unlock;
962 		}
963 
964 		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
965 			/* Share the umem with another socket on another qid
966 			 * and/or device.
967 			 */
968 			xs->pool = xp_create_and_assign_umem(xs,
969 							     umem_xs->umem);
970 			if (!xs->pool) {
971 				err = -ENOMEM;
972 				sockfd_put(sock);
973 				goto out_unlock;
974 			}
975 
976 			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
977 						   qid);
978 			if (err) {
979 				xp_destroy(xs->pool);
980 				xs->pool = NULL;
981 				sockfd_put(sock);
982 				goto out_unlock;
983 			}
984 		} else {
985 			/* Share the buffer pool with the other socket. */
986 			if (xs->fq_tmp || xs->cq_tmp) {
987 				/* Do not allow setting your own fq or cq. */
988 				err = -EINVAL;
989 				sockfd_put(sock);
990 				goto out_unlock;
991 			}
992 
993 			xp_get_pool(umem_xs->pool);
994 			xs->pool = umem_xs->pool;
995 
996 		/* If the underlying shared umem was created without a Tx
997 			 * ring, allocate the Tx descs array that the Tx batching
998 			 * API utilizes.
999 			 */
1000 			if (xs->tx && !xs->pool->tx_descs) {
1001 				err = xp_alloc_tx_descs(xs->pool, xs);
1002 				if (err) {
1003 					xp_put_pool(xs->pool);
1004 					xs->pool = NULL;
1005 					sockfd_put(sock);
1006 					goto out_unlock;
1007 				}
1008 			}
1009 		}
1010 
1011 		xdp_get_umem(umem_xs->umem);
1012 		WRITE_ONCE(xs->umem, umem_xs->umem);
1013 		sockfd_put(sock);
1014 	} else if (!xs->umem || !xsk_validate_queues(xs)) {
1015 		err = -EINVAL;
1016 		goto out_unlock;
1017 	} else {
1018 		/* This xsk has its own umem. */
1019 		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
1020 		if (!xs->pool) {
1021 			err = -ENOMEM;
1022 			goto out_unlock;
1023 		}
1024 
1025 		err = xp_assign_dev(xs->pool, dev, qid, flags);
1026 		if (err) {
1027 			xp_destroy(xs->pool);
1028 			xs->pool = NULL;
1029 			goto out_unlock;
1030 		}
1031 	}
1032 
1033 	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
1034 	xs->fq_tmp = NULL;
1035 	xs->cq_tmp = NULL;
1036 
1037 	xs->dev = dev;
1038 	xs->zc = xs->umem->zc;
1039 	xs->queue_id = qid;
1040 	xp_add_xsk(xs->pool, xs);
1041 
1042 out_unlock:
1043 	if (err) {
1044 		dev_put(dev);
1045 	} else {
1046 		/* Matches smp_rmb() in bind() for shared umem
1047 		 * sockets, and xsk_is_bound().
1048 		 */
1049 		smp_wmb();
1050 		WRITE_ONCE(xs->state, XSK_BOUND);
1051 	}
1052 out_release:
1053 	mutex_unlock(&xs->mutex);
1054 	rtnl_unlock();
1055 	return err;
1056 }
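/* What bind() expects from user space, sketched with sockaddr_xdp (the
 * interface name, queue id and file descriptors are illustrative):
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family = AF_XDP,
 *		.sxdp_ifindex = if_nametoindex("eth0"),
 *		.sxdp_queue_id = 0,
 *		.sxdp_flags = XDP_USE_NEED_WAKEUP,	// or XDP_COPY/XDP_ZEROCOPY
 *	};
 *
 *	bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 *
 * To share an already bound socket's umem or buffer pool, a second socket
 * instead sets .sxdp_flags = XDP_SHARED_UMEM and .sxdp_shared_umem_fd to the
 * first socket's fd, with no other flags, as enforced above.
 */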
1057 
1058 struct xdp_umem_reg_v1 {
1059 	__u64 addr; /* Start of packet data area */
1060 	__u64 len; /* Length of packet data area */
1061 	__u32 chunk_size;
1062 	__u32 headroom;
1063 };
1064 
1065 static int xsk_setsockopt(struct socket *sock, int level, int optname,
1066 			  sockptr_t optval, unsigned int optlen)
1067 {
1068 	struct sock *sk = sock->sk;
1069 	struct xdp_sock *xs = xdp_sk(sk);
1070 	int err;
1071 
1072 	if (level != SOL_XDP)
1073 		return -ENOPROTOOPT;
1074 
1075 	switch (optname) {
1076 	case XDP_RX_RING:
1077 	case XDP_TX_RING:
1078 	{
1079 		struct xsk_queue **q;
1080 		int entries;
1081 
1082 		if (optlen < sizeof(entries))
1083 			return -EINVAL;
1084 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1085 			return -EFAULT;
1086 
1087 		mutex_lock(&xs->mutex);
1088 		if (xs->state != XSK_READY) {
1089 			mutex_unlock(&xs->mutex);
1090 			return -EBUSY;
1091 		}
1092 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1093 		err = xsk_init_queue(entries, q, false);
1094 		if (!err && optname == XDP_TX_RING)
1095 			/* Tx needs to be explicitly woken up the first time */
1096 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1097 		mutex_unlock(&xs->mutex);
1098 		return err;
1099 	}
1100 	case XDP_UMEM_REG:
1101 	{
1102 		size_t mr_size = sizeof(struct xdp_umem_reg);
1103 		struct xdp_umem_reg mr = {};
1104 		struct xdp_umem *umem;
1105 
1106 		if (optlen < sizeof(struct xdp_umem_reg_v1))
1107 			return -EINVAL;
1108 		else if (optlen < sizeof(mr))
1109 			mr_size = sizeof(struct xdp_umem_reg_v1);
1110 
1111 		if (copy_from_sockptr(&mr, optval, mr_size))
1112 			return -EFAULT;
1113 
1114 		mutex_lock(&xs->mutex);
1115 		if (xs->state != XSK_READY || xs->umem) {
1116 			mutex_unlock(&xs->mutex);
1117 			return -EBUSY;
1118 		}
1119 
1120 		umem = xdp_umem_create(&mr);
1121 		if (IS_ERR(umem)) {
1122 			mutex_unlock(&xs->mutex);
1123 			return PTR_ERR(umem);
1124 		}
1125 
1126 		/* Make sure umem is ready before it can be seen by others */
1127 		smp_wmb();
1128 		WRITE_ONCE(xs->umem, umem);
1129 		mutex_unlock(&xs->mutex);
1130 		return 0;
1131 	}
1132 	case XDP_UMEM_FILL_RING:
1133 	case XDP_UMEM_COMPLETION_RING:
1134 	{
1135 		struct xsk_queue **q;
1136 		int entries;
1137 
1138 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1139 			return -EFAULT;
1140 
1141 		mutex_lock(&xs->mutex);
1142 		if (xs->state != XSK_READY) {
1143 			mutex_unlock(&xs->mutex);
1144 			return -EBUSY;
1145 		}
1146 
1147 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1148 			&xs->cq_tmp;
1149 		err = xsk_init_queue(entries, q, true);
1150 		mutex_unlock(&xs->mutex);
1151 		return err;
1152 	}
1153 	default:
1154 		break;
1155 	}
1156 
1157 	return -ENOPROTOOPT;
1158 }
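/* Typical user-space setup sequence for the options handled above; the
 * buffer, sizes and xsk_fd are illustrative, and ring sizes must be powers
 * of two (see xsk_init_queue()):
 *
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(uintptr_t)umem_area,	// page-aligned area
 *		.len = umem_size,
 *		.chunk_size = 2048,
 *		.headroom = 0,
 *	};
 *	int ndescs = 2048;
 *
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_FILL_RING, &ndescs, sizeof(ndescs));
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ndescs, sizeof(ndescs));
 *	setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING, &ndescs, sizeof(ndescs));
 *	setsockopt(xsk_fd, SOL_XDP, XDP_TX_RING, &ndescs, sizeof(ndescs));
 */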
1159 
1160 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1161 {
1162 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1163 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1164 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1165 }
1166 
1167 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1168 {
1169 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1170 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1171 	ring->desc = offsetof(struct xdp_umem_ring, desc);
1172 }
1173 
1174 struct xdp_statistics_v1 {
1175 	__u64 rx_dropped;
1176 	__u64 rx_invalid_descs;
1177 	__u64 tx_invalid_descs;
1178 };
1179 
1180 static int xsk_getsockopt(struct socket *sock, int level, int optname,
1181 			  char __user *optval, int __user *optlen)
1182 {
1183 	struct sock *sk = sock->sk;
1184 	struct xdp_sock *xs = xdp_sk(sk);
1185 	int len;
1186 
1187 	if (level != SOL_XDP)
1188 		return -ENOPROTOOPT;
1189 
1190 	if (get_user(len, optlen))
1191 		return -EFAULT;
1192 	if (len < 0)
1193 		return -EINVAL;
1194 
1195 	switch (optname) {
1196 	case XDP_STATISTICS:
1197 	{
1198 		struct xdp_statistics stats = {};
1199 		bool extra_stats = true;
1200 		size_t stats_size;
1201 
1202 		if (len < sizeof(struct xdp_statistics_v1)) {
1203 			return -EINVAL;
1204 		} else if (len < sizeof(stats)) {
1205 			extra_stats = false;
1206 			stats_size = sizeof(struct xdp_statistics_v1);
1207 		} else {
1208 			stats_size = sizeof(stats);
1209 		}
1210 
1211 		mutex_lock(&xs->mutex);
1212 		stats.rx_dropped = xs->rx_dropped;
1213 		if (extra_stats) {
1214 			stats.rx_ring_full = xs->rx_queue_full;
1215 			stats.rx_fill_ring_empty_descs =
1216 				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1217 			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1218 		} else {
1219 			stats.rx_dropped += xs->rx_queue_full;
1220 		}
1221 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1222 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1223 		mutex_unlock(&xs->mutex);
1224 
1225 		if (copy_to_user(optval, &stats, stats_size))
1226 			return -EFAULT;
1227 		if (put_user(stats_size, optlen))
1228 			return -EFAULT;
1229 
1230 		return 0;
1231 	}
1232 	case XDP_MMAP_OFFSETS:
1233 	{
1234 		struct xdp_mmap_offsets off;
1235 		struct xdp_mmap_offsets_v1 off_v1;
1236 		bool flags_supported = true;
1237 		void *to_copy;
1238 
1239 		if (len < sizeof(off_v1))
1240 			return -EINVAL;
1241 		else if (len < sizeof(off))
1242 			flags_supported = false;
1243 
1244 		if (flags_supported) {
1245 			/* xdp_ring_offset is identical to xdp_ring_offset_v1
1246 			 * except for the flags field added to the end.
1247 			 */
1248 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1249 					       &off.rx);
1250 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1251 					       &off.tx);
1252 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1253 					       &off.fr);
1254 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1255 					       &off.cr);
1256 			off.rx.flags = offsetof(struct xdp_rxtx_ring,
1257 						ptrs.flags);
1258 			off.tx.flags = offsetof(struct xdp_rxtx_ring,
1259 						ptrs.flags);
1260 			off.fr.flags = offsetof(struct xdp_umem_ring,
1261 						ptrs.flags);
1262 			off.cr.flags = offsetof(struct xdp_umem_ring,
1263 						ptrs.flags);
1264 
1265 			len = sizeof(off);
1266 			to_copy = &off;
1267 		} else {
1268 			xsk_enter_rxtx_offsets(&off_v1.rx);
1269 			xsk_enter_rxtx_offsets(&off_v1.tx);
1270 			xsk_enter_umem_offsets(&off_v1.fr);
1271 			xsk_enter_umem_offsets(&off_v1.cr);
1272 
1273 			len = sizeof(off_v1);
1274 			to_copy = &off_v1;
1275 		}
1276 
1277 		if (copy_to_user(optval, to_copy, len))
1278 			return -EFAULT;
1279 		if (put_user(len, optlen))
1280 			return -EFAULT;
1281 
1282 		return 0;
1283 	}
1284 	case XDP_OPTIONS:
1285 	{
1286 		struct xdp_options opts = {};
1287 
1288 		if (len < sizeof(opts))
1289 			return -EINVAL;
1290 
1291 		mutex_lock(&xs->mutex);
1292 		if (xs->zc)
1293 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1294 		mutex_unlock(&xs->mutex);
1295 
1296 		len = sizeof(opts);
1297 		if (copy_to_user(optval, &opts, len))
1298 			return -EFAULT;
1299 		if (put_user(len, optlen))
1300 			return -EFAULT;
1301 
1302 		return 0;
1303 	}
1304 	default:
1305 		break;
1306 	}
1307 
1308 	return -EOPNOTSUPP;
1309 }
1310 
1311 static int xsk_mmap(struct file *file, struct socket *sock,
1312 		    struct vm_area_struct *vma)
1313 {
1314 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1315 	unsigned long size = vma->vm_end - vma->vm_start;
1316 	struct xdp_sock *xs = xdp_sk(sock->sk);
1317 	struct xsk_queue *q = NULL;
1318 	unsigned long pfn;
1319 	struct page *qpg;
1320 
1321 	if (READ_ONCE(xs->state) != XSK_READY)
1322 		return -EBUSY;
1323 
1324 	if (offset == XDP_PGOFF_RX_RING) {
1325 		q = READ_ONCE(xs->rx);
1326 	} else if (offset == XDP_PGOFF_TX_RING) {
1327 		q = READ_ONCE(xs->tx);
1328 	} else {
1329 		/* Matches the smp_wmb() in XDP_UMEM_REG */
1330 		smp_rmb();
1331 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1332 			q = READ_ONCE(xs->fq_tmp);
1333 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1334 			q = READ_ONCE(xs->cq_tmp);
1335 	}
1336 
1337 	if (!q)
1338 		return -EINVAL;
1339 
1340 	/* Matches the smp_wmb() in xsk_init_queue */
1341 	smp_rmb();
1342 	qpg = virt_to_head_page(q->ring);
1343 	if (size > page_size(qpg))
1344 		return -EINVAL;
1345 
1346 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1347 	return remap_pfn_range(vma, vma->vm_start, pfn,
1348 			       size, vma->vm_page_prot);
1349 }
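/* User space discovers the ring layout with XDP_MMAP_OFFSETS and then maps
 * each ring at the page offsets handled above, roughly (xsk_fd and ndescs
 * are illustrative):
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *	void *rx;
 *
 *	getsockopt(xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *	rx = mmap(NULL, off.rx.desc + ndescs * sizeof(struct xdp_desc),
 *		  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		  xsk_fd, XDP_PGOFF_RX_RING);
 *
 * The producer, consumer and flags words then live at rx + off.rx.producer,
 * rx + off.rx.consumer and rx + off.rx.flags respectively.
 */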
1350 
1351 static int xsk_notifier(struct notifier_block *this,
1352 			unsigned long msg, void *ptr)
1353 {
1354 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1355 	struct net *net = dev_net(dev);
1356 	struct sock *sk;
1357 
1358 	switch (msg) {
1359 	case NETDEV_UNREGISTER:
1360 		mutex_lock(&net->xdp.lock);
1361 		sk_for_each(sk, &net->xdp.list) {
1362 			struct xdp_sock *xs = xdp_sk(sk);
1363 
1364 			mutex_lock(&xs->mutex);
1365 			if (xs->dev == dev) {
1366 				sk->sk_err = ENETDOWN;
1367 				if (!sock_flag(sk, SOCK_DEAD))
1368 					sk_error_report(sk);
1369 
1370 				xsk_unbind_dev(xs);
1371 
1372 				/* Clear device references. */
1373 				xp_clear_dev(xs->pool);
1374 			}
1375 			mutex_unlock(&xs->mutex);
1376 		}
1377 		mutex_unlock(&net->xdp.lock);
1378 		break;
1379 	}
1380 	return NOTIFY_DONE;
1381 }
1382 
1383 static struct proto xsk_proto = {
1384 	.name =		"XDP",
1385 	.owner =	THIS_MODULE,
1386 	.obj_size =	sizeof(struct xdp_sock),
1387 };
1388 
1389 static const struct proto_ops xsk_proto_ops = {
1390 	.family		= PF_XDP,
1391 	.owner		= THIS_MODULE,
1392 	.release	= xsk_release,
1393 	.bind		= xsk_bind,
1394 	.connect	= sock_no_connect,
1395 	.socketpair	= sock_no_socketpair,
1396 	.accept		= sock_no_accept,
1397 	.getname	= sock_no_getname,
1398 	.poll		= xsk_poll,
1399 	.ioctl		= sock_no_ioctl,
1400 	.listen		= sock_no_listen,
1401 	.shutdown	= sock_no_shutdown,
1402 	.setsockopt	= xsk_setsockopt,
1403 	.getsockopt	= xsk_getsockopt,
1404 	.sendmsg	= xsk_sendmsg,
1405 	.recvmsg	= xsk_recvmsg,
1406 	.mmap		= xsk_mmap,
1407 	.sendpage	= sock_no_sendpage,
1408 };
1409 
1410 static void xsk_destruct(struct sock *sk)
1411 {
1412 	struct xdp_sock *xs = xdp_sk(sk);
1413 
1414 	if (!sock_flag(sk, SOCK_DEAD))
1415 		return;
1416 
1417 	if (!xp_put_pool(xs->pool))
1418 		xdp_put_umem(xs->umem, !xs->pool);
1419 
1420 	sk_refcnt_debug_dec(sk);
1421 }
1422 
1423 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1424 		      int kern)
1425 {
1426 	struct xdp_sock *xs;
1427 	struct sock *sk;
1428 
1429 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1430 		return -EPERM;
1431 	if (sock->type != SOCK_RAW)
1432 		return -ESOCKTNOSUPPORT;
1433 
1434 	if (protocol)
1435 		return -EPROTONOSUPPORT;
1436 
1437 	sock->state = SS_UNCONNECTED;
1438 
1439 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1440 	if (!sk)
1441 		return -ENOBUFS;
1442 
1443 	sock->ops = &xsk_proto_ops;
1444 
1445 	sock_init_data(sock, sk);
1446 
1447 	sk->sk_family = PF_XDP;
1448 
1449 	sk->sk_destruct = xsk_destruct;
1450 	sk_refcnt_debug_inc(sk);
1451 
1452 	sock_set_flag(sk, SOCK_RCU_FREE);
1453 
1454 	xs = xdp_sk(sk);
1455 	xs->state = XSK_READY;
1456 	mutex_init(&xs->mutex);
1457 	spin_lock_init(&xs->rx_lock);
1458 
1459 	INIT_LIST_HEAD(&xs->map_list);
1460 	spin_lock_init(&xs->map_list_lock);
1461 
1462 	mutex_lock(&net->xdp.lock);
1463 	sk_add_node_rcu(sk, &net->xdp.list);
1464 	mutex_unlock(&net->xdp.lock);
1465 
1466 	local_bh_disable();
1467 	sock_prot_inuse_add(net, &xsk_proto, 1);
1468 	local_bh_enable();
1469 
1470 	return 0;
1471 }
1472 
1473 static const struct net_proto_family xsk_family_ops = {
1474 	.family = PF_XDP,
1475 	.create = xsk_create,
1476 	.owner	= THIS_MODULE,
1477 };
1478 
1479 static struct notifier_block xsk_netdev_notifier = {
1480 	.notifier_call	= xsk_notifier,
1481 };
1482 
1483 static int __net_init xsk_net_init(struct net *net)
1484 {
1485 	mutex_init(&net->xdp.lock);
1486 	INIT_HLIST_HEAD(&net->xdp.list);
1487 	return 0;
1488 }
1489 
1490 static void __net_exit xsk_net_exit(struct net *net)
1491 {
1492 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1493 }
1494 
1495 static struct pernet_operations xsk_net_ops = {
1496 	.init = xsk_net_init,
1497 	.exit = xsk_net_exit,
1498 };
1499 
1500 static int __init xsk_init(void)
1501 {
1502 	int err, cpu;
1503 
1504 	err = proto_register(&xsk_proto, 0 /* no slab */);
1505 	if (err)
1506 		goto out;
1507 
1508 	err = sock_register(&xsk_family_ops);
1509 	if (err)
1510 		goto out_proto;
1511 
1512 	err = register_pernet_subsys(&xsk_net_ops);
1513 	if (err)
1514 		goto out_sk;
1515 
1516 	err = register_netdevice_notifier(&xsk_netdev_notifier);
1517 	if (err)
1518 		goto out_pernet;
1519 
1520 	for_each_possible_cpu(cpu)
1521 		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1522 	return 0;
1523 
1524 out_pernet:
1525 	unregister_pernet_subsys(&xsk_net_ops);
1526 out_sk:
1527 	sock_unregister(PF_XDP);
1528 out_proto:
1529 	proto_unregister(&xsk_proto);
1530 out:
1531 	return err;
1532 }
1533 
1534 fs_initcall(xsk_init);
1535