1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets provide a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *	      Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11 
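/* A minimal userspace setup, for orientation (a sketch, not a complete
 * program): create the socket with socket(AF_XDP, SOCK_RAW, 0), register a
 * UMEM via setsockopt(XDP_UMEM_REG), size the four rings with the
 * XDP_RX_RING, XDP_TX_RING, XDP_UMEM_FILL_RING and XDP_UMEM_COMPLETION_RING
 * options, mmap() each ring at its XDP_PGOFF_ / XDP_UMEM_PGOFF_ offset, and
 * finally bind() a struct sockaddr_xdp naming the interface and queue id.
 */
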
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13 
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock_drv.h>
26 #include <net/xdp.h>
27 
28 #include "xsk_queue.h"
29 #include "xdp_umem.h"
30 #include "xsk.h"
31 
32 #define TX_BATCH_SIZE 16
33 
34 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
35 
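/* The need_wakeup flags below tell userspace that it must explicitly kick
 * the kernel (via poll() or sendto()) before the driver will process the
 * fill or Tx rings again. cached_need_wakeup mirrors the state so the ring
 * flags only have to be written when the state actually changes.
 */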
36 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
37 {
38 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
39 		return;
40 
41 	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
42 	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
43 }
44 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
45 
46 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
47 {
48 	struct xdp_sock *xs;
49 
50 	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
51 		return;
52 
53 	rcu_read_lock();
54 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
55 		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
56 	}
57 	rcu_read_unlock();
58 
59 	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
60 }
61 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
62 
63 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
64 {
65 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
66 		return;
67 
68 	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
69 	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
70 }
71 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
72 
73 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
74 {
75 	struct xdp_sock *xs;
76 
77 	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
78 		return;
79 
80 	rcu_read_lock();
81 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
82 		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
83 	}
84 	rcu_read_unlock();
85 
86 	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
87 }
88 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
89 
90 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
91 {
92 	return pool->uses_need_wakeup;
93 }
94 EXPORT_SYMBOL(xsk_uses_need_wakeup);
95 
96 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
97 					    u16 queue_id)
98 {
99 	if (queue_id < dev->real_num_rx_queues)
100 		return dev->_rx[queue_id].pool;
101 	if (queue_id < dev->real_num_tx_queues)
102 		return dev->_tx[queue_id].pool;
103 
104 	return NULL;
105 }
106 EXPORT_SYMBOL(xsk_get_pool_from_qid);
107 
108 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
109 {
110 	if (queue_id < dev->num_rx_queues)
111 		dev->_rx[queue_id].pool = NULL;
112 	if (queue_id < dev->num_tx_queues)
113 		dev->_tx[queue_id].pool = NULL;
114 }
115 
116 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
117  * not know if the device has more tx queues than rx, or the opposite.
118  * This might also change during run time.
119  */
120 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
121 			u16 queue_id)
122 {
123 	if (queue_id >= max_t(unsigned int,
124 			      dev->real_num_rx_queues,
125 			      dev->real_num_tx_queues))
126 		return -EINVAL;
127 
128 	if (queue_id < dev->real_num_rx_queues)
129 		dev->_rx[queue_id].pool = pool;
130 	if (queue_id < dev->real_num_tx_queues)
131 		dev->_tx[queue_id].pool = pool;
132 
133 	return 0;
134 }
135 
136 void xp_release(struct xdp_buff_xsk *xskb)
137 {
138 	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
139 }
140 
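/* Reconstruct the 64-bit UMEM address that userspace will see in the Rx
 * descriptor. In aligned mode the current offset (headroom plus any XDP
 * adjustment) is simply added to the base address; in unaligned mode the
 * base stays in the lower bits and the offset is carried in the upper bits.
 */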
141 static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
142 {
143 	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
144 
145 	offset += xskb->pool->headroom;
146 	if (!xskb->pool->unaligned)
147 		return xskb->orig_addr + offset;
148 	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
149 }
150 
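/* Zero-copy receive: publish an (addr, len) descriptor on the socket's Rx
 * ring and recycle the xdp_buff_xsk handle back to the pool. A full ring is
 * accounted in rx_queue_full and reported via XDP_STATISTICS.
 */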
151 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
152 {
153 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
154 	u64 addr;
155 	int err;
156 
157 	addr = xp_get_handle(xskb);
158 	err = xskq_prod_reserve_desc(xs->rx, addr, len);
159 	if (err) {
160 		xs->rx_queue_full++;
161 		return err;
162 	}
163 
164 	xp_release(xskb);
165 	return 0;
166 }
167 
168 static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
169 {
170 	void *from_buf, *to_buf;
171 	u32 metalen;
172 
173 	if (unlikely(xdp_data_meta_unsupported(from))) {
174 		from_buf = from->data;
175 		to_buf = to->data;
176 		metalen = 0;
177 	} else {
178 		from_buf = from->data_meta;
179 		metalen = from->data - from->data_meta;
180 		to_buf = to->data - metalen;
181 	}
182 
183 	memcpy(to_buf, from_buf, len + metalen);
184 }
185 
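/* Copy-mode receive: allocate a buffer from the pool, copy the packet data
 * (including any metadata) into it and publish it through the zero-copy
 * path above. When explicit_free is set the caller has handed over
 * ownership of the original buffer, so it is returned here.
 */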
186 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
187 		     bool explicit_free)
188 {
189 	struct xdp_buff *xsk_xdp;
190 	int err;
191 
192 	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
193 		xs->rx_dropped++;
194 		return -ENOSPC;
195 	}
196 
197 	xsk_xdp = xsk_buff_alloc(xs->pool);
198 	if (!xsk_xdp) {
199 		xs->rx_dropped++;
200 		return -ENOSPC;
201 	}
202 
203 	xsk_copy_xdp(xsk_xdp, xdp, len);
204 	err = __xsk_rcv_zc(xs, xsk_xdp, len);
205 	if (err) {
206 		xsk_buff_free(xsk_xdp);
207 		return err;
208 	}
209 	if (explicit_free)
210 		xdp_return_buff(xdp);
211 	return 0;
212 }
213 
214 static bool xsk_tx_writeable(struct xdp_sock *xs)
215 {
216 	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
217 		return false;
218 
219 	return true;
220 }
221 
222 static bool xsk_is_bound(struct xdp_sock *xs)
223 {
224 	if (READ_ONCE(xs->state) == XSK_BOUND) {
225 		/* Matches smp_wmb() in bind(). */
226 		smp_rmb();
227 		return true;
228 	}
229 	return false;
230 }
231 
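/* Common receive entry: reject anything that did not arrive on the device
 * and queue this socket is bound to, then pick the zero-copy or copy path
 * based on the memory type of the Rx queue the packet came from.
 */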
232 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
233 		   bool explicit_free)
234 {
235 	u32 len;
236 
237 	if (!xsk_is_bound(xs))
238 		return -EINVAL;
239 
240 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
241 		return -EINVAL;
242 
243 	len = xdp->data_end - xdp->data;
244 
245 	return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
246 		__xsk_rcv_zc(xs, xdp, len) :
247 		__xsk_rcv(xs, xdp, len, explicit_free);
248 }
249 
250 static void xsk_flush(struct xdp_sock *xs)
251 {
252 	xskq_prod_submit(xs->rx);
253 	__xskq_cons_release(xs->pool->fq);
254 	sock_def_readable(&xs->sk);
255 }
256 
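/* Receive entry point for generic (skb) XDP mode. The Rx spinlock is needed
 * here because, unlike the driver path, generic receive for the same socket
 * may run concurrently on several CPUs.
 */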
257 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
258 {
259 	int err;
260 
261 	spin_lock_bh(&xs->rx_lock);
262 	err = xsk_rcv(xs, xdp, false);
263 	xsk_flush(xs);
264 	spin_unlock_bh(&xs->rx_lock);
265 	return err;
266 }
267 
268 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
269 {
270 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
271 	int err;
272 
273 	err = xsk_rcv(xs, xdp, true);
274 	if (err)
275 		return err;
276 
277 	if (!xs->flush_node.prev)
278 		list_add(&xs->flush_node, flush_list);
279 
280 	return 0;
281 }
282 
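/* Called at the XDP flush point at the end of a NAPI poll: submit the
 * reserved Rx descriptors and wake up readers for every socket that was
 * redirected to on this CPU.
 */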
283 void __xsk_map_flush(void)
284 {
285 	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
286 	struct xdp_sock *xs, *tmp;
287 
288 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
289 		xsk_flush(xs);
290 		__list_del_clearprev(&xs->flush_node);
291 	}
292 }
293 
294 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
295 {
296 	xskq_prod_submit_n(pool->cq, nb_entries);
297 }
298 EXPORT_SYMBOL(xsk_tx_completed);
299 
300 void xsk_tx_release(struct xsk_buff_pool *pool)
301 {
302 	struct xdp_sock *xs;
303 
304 	rcu_read_lock();
305 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
306 		__xskq_cons_release(xs->tx);
307 		if (xsk_tx_writeable(xs))
308 			xs->sk.sk_write_space(&xs->sk);
309 	}
310 	rcu_read_unlock();
311 }
312 EXPORT_SYMBOL(xsk_tx_release);
313 
314 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
315 {
316 	struct xdp_sock *xs;
317 
318 	rcu_read_lock();
319 	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
320 		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
321 			xs->tx->queue_empty_descs++;
322 			continue;
323 		}
324 
325 		/* This is the backpressure mechanism for the Tx path.
326 		 * Reserve space in the completion queue and only proceed
327 		 * if there is space in it. This avoids having to implement
328 		 * any buffering in the Tx path.
329 		 */
330 		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
331 			goto out;
332 
333 		xskq_cons_release(xs->tx);
334 		rcu_read_unlock();
335 		return true;
336 	}
337 
338 out:
339 	rcu_read_unlock();
340 	return false;
341 }
342 EXPORT_SYMBOL(xsk_tx_peek_desc);
343 
344 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
345 {
346 	struct net_device *dev = xs->dev;
347 	int err;
348 
349 	rcu_read_lock();
350 	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
351 	rcu_read_unlock();
352 
353 	return err;
354 }
355 
356 static int xsk_zc_xmit(struct xdp_sock *xs)
357 {
358 	return xsk_wakeup(xs, XDP_WAKEUP_TX);
359 }
360 
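/* skb destructor for copy-mode Tx: once the skb has been consumed, complete
 * the original descriptor by putting its address on the completion ring so
 * userspace can reuse that part of the UMEM.
 */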
361 static void xsk_destruct_skb(struct sk_buff *skb)
362 {
363 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
364 	struct xdp_sock *xs = xdp_sk(skb->sk);
365 	unsigned long flags;
366 
367 	spin_lock_irqsave(&xs->pool->cq_lock, flags);
368 	xskq_prod_submit_addr(xs->pool->cq, addr);
369 	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
370 
371 	sock_wfree(skb);
372 }
373 
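/* Copy-mode Tx: build one skb per Tx descriptor, copy the payload out of
 * the UMEM, reserve a completion ring slot up front as backpressure, and
 * transmit directly on the bound queue. Completions are reported from the
 * skb destructor above.
 */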
374 static int xsk_generic_xmit(struct sock *sk)
375 {
376 	struct xdp_sock *xs = xdp_sk(sk);
377 	u32 max_batch = TX_BATCH_SIZE;
378 	bool sent_frame = false;
379 	struct xdp_desc desc;
380 	struct sk_buff *skb;
381 	unsigned long flags;
382 	int err = 0;
383 	u32 hr, tr;
384 
385 	mutex_lock(&xs->mutex);
386 
387 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
388 		goto out;
389 
390 	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
391 	tr = xs->dev->needed_tailroom;
392 
393 	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
394 		char *buffer;
395 		u64 addr;
396 		u32 len;
397 
398 		if (max_batch-- == 0) {
399 			err = -EAGAIN;
400 			goto out;
401 		}
402 
403 		len = desc.len;
404 		skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err);
405 		if (unlikely(!skb))
406 			goto out;
407 
408 		skb_reserve(skb, hr);
409 		skb_put(skb, len);
410 
411 		addr = desc.addr;
412 		buffer = xsk_buff_raw_get_data(xs->pool, addr);
413 		err = skb_store_bits(skb, 0, buffer, len);
414 		/* This is the backpressure mechanism for the Tx path.
415 		 * Reserve space in the completion queue and only proceed
416 		 * if there is space in it. This avoids having to implement
417 		 * any buffering in the Tx path.
418 		 */
419 		spin_lock_irqsave(&xs->pool->cq_lock, flags);
420 		if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
421 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
422 			kfree_skb(skb);
423 			goto out;
424 		}
425 		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
426 
427 		skb->dev = xs->dev;
428 		skb->priority = sk->sk_priority;
429 		skb->mark = sk->sk_mark;
430 		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
431 		skb->destructor = xsk_destruct_skb;
432 
433 		err = __dev_direct_xmit(skb, xs->queue_id);
434 		if (err == NETDEV_TX_BUSY) {
435 			/* Tell user-space to retry the send */
436 			skb->destructor = sock_wfree;
437 			spin_lock_irqsave(&xs->pool->cq_lock, flags);
438 			xskq_prod_cancel(xs->pool->cq);
439 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
440 			/* Free skb without triggering the perf drop trace */
441 			consume_skb(skb);
442 			err = -EAGAIN;
443 			goto out;
444 		}
445 
446 		xskq_cons_release(xs->tx);
447 		/* Ignore NET_XMIT_CN as packet might have been sent */
448 		if (err == NET_XMIT_DROP) {
449 			/* SKB completed but not sent */
450 			err = -EBUSY;
451 			goto out;
452 		}
453 
454 		sent_frame = true;
455 	}
456 
457 	xs->tx->queue_empty_descs++;
458 
459 out:
460 	if (sent_frame)
461 		if (xsk_tx_writeable(xs))
462 			sk->sk_write_space(sk);
463 
464 	mutex_unlock(&xs->mutex);
465 	return err;
466 }
467 
468 static int __xsk_sendmsg(struct sock *sk)
469 {
470 	struct xdp_sock *xs = xdp_sk(sk);
471 
472 	if (unlikely(!(xs->dev->flags & IFF_UP)))
473 		return -ENETDOWN;
474 	if (unlikely(!xs->tx))
475 		return -ENOBUFS;
476 
477 	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
478 }
479 
480 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
481 {
482 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
483 	struct sock *sk = sock->sk;
484 	struct xdp_sock *xs = xdp_sk(sk);
485 
486 	if (unlikely(!xsk_is_bound(xs)))
487 		return -ENXIO;
488 	if (unlikely(need_wait))
489 		return -EOPNOTSUPP;
490 
491 	return __xsk_sendmsg(sk);
492 }
493 
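/* poll() doubles as the wakeup mechanism when need_wakeup is enabled: in
 * zero-copy mode it kicks the driver, and in copy mode it drives the Tx
 * path directly before reporting ring state.
 */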
494 static __poll_t xsk_poll(struct file *file, struct socket *sock,
495 			     struct poll_table_struct *wait)
496 {
497 	__poll_t mask = 0;
498 	struct sock *sk = sock->sk;
499 	struct xdp_sock *xs = xdp_sk(sk);
500 	struct xsk_buff_pool *pool;
501 
502 	sock_poll_wait(file, sock, wait);
503 
504 	if (unlikely(!xsk_is_bound(xs)))
505 		return mask;
506 
507 	pool = xs->pool;
508 
509 	if (pool->cached_need_wakeup) {
510 		if (xs->zc)
511 			xsk_wakeup(xs, pool->cached_need_wakeup);
512 		else
513 			/* Poll needs to drive Tx also in copy mode */
514 			__xsk_sendmsg(sk);
515 	}
516 
517 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
518 		mask |= EPOLLIN | EPOLLRDNORM;
519 	if (xs->tx && xsk_tx_writeable(xs))
520 		mask |= EPOLLOUT | EPOLLWRNORM;
521 
522 	return mask;
523 }
524 
525 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
526 			  bool umem_queue)
527 {
528 	struct xsk_queue *q;
529 
530 	if (entries == 0 || *queue || !is_power_of_2(entries))
531 		return -EINVAL;
532 
533 	q = xskq_create(entries, umem_queue);
534 	if (!q)
535 		return -ENOMEM;
536 
537 	/* Make sure queue is ready before it can be seen by others */
538 	smp_wmb();
539 	WRITE_ONCE(*queue, q);
540 	return 0;
541 }
542 
543 static void xsk_unbind_dev(struct xdp_sock *xs)
544 {
545 	struct net_device *dev = xs->dev;
546 
547 	if (xs->state != XSK_BOUND)
548 		return;
549 	WRITE_ONCE(xs->state, XSK_UNBOUND);
550 
551 	/* Wait for driver to stop using the xdp socket. */
552 	xp_del_xsk(xs->pool, xs);
553 	xs->dev = NULL;
554 	synchronize_net();
555 	dev_put(dev);
556 }
557 
558 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
559 					      struct xdp_sock ***map_entry)
560 {
561 	struct xsk_map *map = NULL;
562 	struct xsk_map_node *node;
563 
564 	*map_entry = NULL;
565 
566 	spin_lock_bh(&xs->map_list_lock);
567 	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
568 					node);
569 	if (node) {
570 		WARN_ON(xsk_map_inc(node->map));
571 		map = node->map;
572 		*map_entry = node->map_entry;
573 	}
574 	spin_unlock_bh(&xs->map_list_lock);
575 	return map;
576 }
577 
578 static void xsk_delete_from_maps(struct xdp_sock *xs)
579 {
580 	/* This function removes the current XDP socket from all the
581 	 * maps it resides in. We need to take extra care here, due to
582 	 * the two locks involved. Each map has a lock synchronizing
583 	 * updates to the entries, and each socket has a lock that
584 	 * synchronizes access to the list of maps (map_list). For
585 	 * deadlock avoidance the locks need to be taken in the order
586 	 * "map lock"->"socket map list lock". We start off by
587 	 * accessing the socket map list, and take a reference to the
588 	 * map to guarantee existence between the
589 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
590 	 * calls. Then we ask the map to remove the socket, which
591 	 * tries to remove the socket from the map. Note that there
592 	 * might be updates to the map between
593 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
594 	 */
595 	struct xdp_sock **map_entry = NULL;
596 	struct xsk_map *map;
597 
598 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
599 		xsk_map_try_sock_delete(map, xs, map_entry);
600 		xsk_map_put(map);
601 	}
602 }
603 
604 static int xsk_release(struct socket *sock)
605 {
606 	struct sock *sk = sock->sk;
607 	struct xdp_sock *xs = xdp_sk(sk);
608 	struct net *net;
609 
610 	if (!sk)
611 		return 0;
612 
613 	net = sock_net(sk);
614 
615 	mutex_lock(&net->xdp.lock);
616 	sk_del_node_init_rcu(sk);
617 	mutex_unlock(&net->xdp.lock);
618 
619 	local_bh_disable();
620 	sock_prot_inuse_add(net, sk->sk_prot, -1);
621 	local_bh_enable();
622 
623 	xsk_delete_from_maps(xs);
624 	mutex_lock(&xs->mutex);
625 	xsk_unbind_dev(xs);
626 	mutex_unlock(&xs->mutex);
627 
628 	xskq_destroy(xs->rx);
629 	xskq_destroy(xs->tx);
630 	xskq_destroy(xs->fq_tmp);
631 	xskq_destroy(xs->cq_tmp);
632 
633 	sock_orphan(sk);
634 	sock->sk = NULL;
635 
636 	sk_refcnt_debug_release(sk);
637 	sock_put(sk);
638 
639 	return 0;
640 }
641 
642 static struct socket *xsk_lookup_xsk_from_fd(int fd)
643 {
644 	struct socket *sock;
645 	int err;
646 
647 	sock = sockfd_lookup(fd, &err);
648 	if (!sock)
649 		return ERR_PTR(-ENOTSOCK);
650 
651 	if (sock->sk->sk_family != PF_XDP) {
652 		sockfd_put(sock);
653 		return ERR_PTR(-ENOPROTOOPT);
654 	}
655 
656 	return sock;
657 }
658 
659 static bool xsk_validate_queues(struct xdp_sock *xs)
660 {
661 	return xs->fq_tmp && xs->cq_tmp;
662 }
663 
664 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
665 {
666 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
667 	struct sock *sk = sock->sk;
668 	struct xdp_sock *xs = xdp_sk(sk);
669 	struct net_device *dev;
670 	int bound_dev_if;
671 	u32 flags, qid;
672 	int err = 0;
673 
674 	if (addr_len < sizeof(struct sockaddr_xdp))
675 		return -EINVAL;
676 	if (sxdp->sxdp_family != AF_XDP)
677 		return -EINVAL;
678 
679 	flags = sxdp->sxdp_flags;
680 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
681 		      XDP_USE_NEED_WAKEUP))
682 		return -EINVAL;
683 
684 	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
685 	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
686 		return -EINVAL;
687 
688 	rtnl_lock();
689 	mutex_lock(&xs->mutex);
690 	if (xs->state != XSK_READY) {
691 		err = -EBUSY;
692 		goto out_release;
693 	}
694 
695 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
696 	if (!dev) {
697 		err = -ENODEV;
698 		goto out_release;
699 	}
700 
701 	if (!xs->rx && !xs->tx) {
702 		err = -EINVAL;
703 		goto out_unlock;
704 	}
705 
706 	qid = sxdp->sxdp_queue_id;
707 
708 	if (flags & XDP_SHARED_UMEM) {
709 		struct xdp_sock *umem_xs;
710 		struct socket *sock;
711 
712 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
713 		    (flags & XDP_USE_NEED_WAKEUP)) {
714 			/* Cannot specify flags for shared sockets. */
715 			err = -EINVAL;
716 			goto out_unlock;
717 		}
718 
719 		if (xs->umem) {
720 			/* We already have our own. */
721 			err = -EINVAL;
722 			goto out_unlock;
723 		}
724 
725 		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
726 		if (IS_ERR(sock)) {
727 			err = PTR_ERR(sock);
728 			goto out_unlock;
729 		}
730 
731 		umem_xs = xdp_sk(sock->sk);
732 		if (!xsk_is_bound(umem_xs)) {
733 			err = -EBADF;
734 			sockfd_put(sock);
735 			goto out_unlock;
736 		}
737 
738 		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
739 			/* Share the umem with another socket on another qid
740 			 * and/or device.
741 			 */
742 			xs->pool = xp_create_and_assign_umem(xs,
743 							     umem_xs->umem);
744 			if (!xs->pool) {
745 				err = -ENOMEM;
746 				sockfd_put(sock);
747 				goto out_unlock;
748 			}
749 
750 			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
751 						   qid);
752 			if (err) {
753 				xp_destroy(xs->pool);
754 				xs->pool = NULL;
755 				sockfd_put(sock);
756 				goto out_unlock;
757 			}
758 		} else {
759 			/* Share the buffer pool with the other socket. */
760 			if (xs->fq_tmp || xs->cq_tmp) {
761 				/* Do not allow setting your own fq or cq. */
762 				err = -EINVAL;
763 				sockfd_put(sock);
764 				goto out_unlock;
765 			}
766 
767 			xp_get_pool(umem_xs->pool);
768 			xs->pool = umem_xs->pool;
769 		}
770 
771 		xdp_get_umem(umem_xs->umem);
772 		WRITE_ONCE(xs->umem, umem_xs->umem);
773 		sockfd_put(sock);
774 	} else if (!xs->umem || !xsk_validate_queues(xs)) {
775 		err = -EINVAL;
776 		goto out_unlock;
777 	} else {
778 		/* This xsk has its own umem. */
779 		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
780 		if (!xs->pool) {
781 			err = -ENOMEM;
782 			goto out_unlock;
783 		}
784 
785 		err = xp_assign_dev(xs->pool, dev, qid, flags);
786 		if (err) {
787 			xp_destroy(xs->pool);
788 			xs->pool = NULL;
789 			goto out_unlock;
790 		}
791 	}
792 
793 	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
794 	xs->fq_tmp = NULL;
795 	xs->cq_tmp = NULL;
796 
797 	xs->dev = dev;
798 	xs->zc = xs->umem->zc;
799 	xs->queue_id = qid;
800 	xp_add_xsk(xs->pool, xs);
801 
802 out_unlock:
803 	if (err) {
804 		dev_put(dev);
805 	} else {
806 		/* Matches smp_rmb() in bind() for shared umem
807 		 * sockets, and xsk_is_bound().
808 		 */
809 		smp_wmb();
810 		WRITE_ONCE(xs->state, XSK_BOUND);
811 	}
812 out_release:
813 	mutex_unlock(&xs->mutex);
814 	rtnl_unlock();
815 	return err;
816 }
817 
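/* Layout of XDP_UMEM_REG before the flags field was added. It is kept so
 * that setsockopt() continues to accept the shorter structure from older
 * applications.
 */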
818 struct xdp_umem_reg_v1 {
819 	__u64 addr; /* Start of packet data area */
820 	__u64 len; /* Length of packet data area */
821 	__u32 chunk_size;
822 	__u32 headroom;
823 };
824 
825 static int xsk_setsockopt(struct socket *sock, int level, int optname,
826 			  sockptr_t optval, unsigned int optlen)
827 {
828 	struct sock *sk = sock->sk;
829 	struct xdp_sock *xs = xdp_sk(sk);
830 	int err;
831 
832 	if (level != SOL_XDP)
833 		return -ENOPROTOOPT;
834 
835 	switch (optname) {
836 	case XDP_RX_RING:
837 	case XDP_TX_RING:
838 	{
839 		struct xsk_queue **q;
840 		int entries;
841 
842 		if (optlen < sizeof(entries))
843 			return -EINVAL;
844 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
845 			return -EFAULT;
846 
847 		mutex_lock(&xs->mutex);
848 		if (xs->state != XSK_READY) {
849 			mutex_unlock(&xs->mutex);
850 			return -EBUSY;
851 		}
852 		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
853 		err = xsk_init_queue(entries, q, false);
854 		if (!err && optname == XDP_TX_RING)
855 			/* Tx needs to be explicitly woken up the first time */
856 			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
857 		mutex_unlock(&xs->mutex);
858 		return err;
859 	}
860 	case XDP_UMEM_REG:
861 	{
862 		size_t mr_size = sizeof(struct xdp_umem_reg);
863 		struct xdp_umem_reg mr = {};
864 		struct xdp_umem *umem;
865 
866 		if (optlen < sizeof(struct xdp_umem_reg_v1))
867 			return -EINVAL;
868 		else if (optlen < sizeof(mr))
869 			mr_size = sizeof(struct xdp_umem_reg_v1);
870 
871 		if (copy_from_sockptr(&mr, optval, mr_size))
872 			return -EFAULT;
873 
874 		mutex_lock(&xs->mutex);
875 		if (xs->state != XSK_READY || xs->umem) {
876 			mutex_unlock(&xs->mutex);
877 			return -EBUSY;
878 		}
879 
880 		umem = xdp_umem_create(&mr);
881 		if (IS_ERR(umem)) {
882 			mutex_unlock(&xs->mutex);
883 			return PTR_ERR(umem);
884 		}
885 
886 		/* Make sure umem is ready before it can be seen by others */
887 		smp_wmb();
888 		WRITE_ONCE(xs->umem, umem);
889 		mutex_unlock(&xs->mutex);
890 		return 0;
891 	}
892 	case XDP_UMEM_FILL_RING:
893 	case XDP_UMEM_COMPLETION_RING:
894 	{
895 		struct xsk_queue **q;
896 		int entries;
897 
898 		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
899 			return -EFAULT;
900 
901 		mutex_lock(&xs->mutex);
902 		if (xs->state != XSK_READY) {
903 			mutex_unlock(&xs->mutex);
904 			return -EBUSY;
905 		}
906 
907 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
908 			&xs->cq_tmp;
909 		err = xsk_init_queue(entries, q, true);
910 		mutex_unlock(&xs->mutex);
911 		return err;
912 	}
913 	default:
914 		break;
915 	}
916 
917 	return -ENOPROTOOPT;
918 }
919 
920 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
921 {
922 	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
923 	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
924 	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
925 }
926 
927 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
928 {
929 	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
930 	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
931 	ring->desc = offsetof(struct xdp_umem_ring, desc);
932 }
933 
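/* Original XDP_STATISTICS layout. It is still used when the caller's buffer
 * is too small for the extended statistics; in that case rx_queue_full is
 * folded into rx_dropped below.
 */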
934 struct xdp_statistics_v1 {
935 	__u64 rx_dropped;
936 	__u64 rx_invalid_descs;
937 	__u64 tx_invalid_descs;
938 };
939 
940 static int xsk_getsockopt(struct socket *sock, int level, int optname,
941 			  char __user *optval, int __user *optlen)
942 {
943 	struct sock *sk = sock->sk;
944 	struct xdp_sock *xs = xdp_sk(sk);
945 	int len;
946 
947 	if (level != SOL_XDP)
948 		return -ENOPROTOOPT;
949 
950 	if (get_user(len, optlen))
951 		return -EFAULT;
952 	if (len < 0)
953 		return -EINVAL;
954 
955 	switch (optname) {
956 	case XDP_STATISTICS:
957 	{
958 		struct xdp_statistics stats = {};
959 		bool extra_stats = true;
960 		size_t stats_size;
961 
962 		if (len < sizeof(struct xdp_statistics_v1)) {
963 			return -EINVAL;
964 		} else if (len < sizeof(stats)) {
965 			extra_stats = false;
966 			stats_size = sizeof(struct xdp_statistics_v1);
967 		} else {
968 			stats_size = sizeof(stats);
969 		}
970 
971 		mutex_lock(&xs->mutex);
972 		stats.rx_dropped = xs->rx_dropped;
973 		if (extra_stats) {
974 			stats.rx_ring_full = xs->rx_queue_full;
975 			stats.rx_fill_ring_empty_descs =
976 				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
977 			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
978 		} else {
979 			stats.rx_dropped += xs->rx_queue_full;
980 		}
981 		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
982 		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
983 		mutex_unlock(&xs->mutex);
984 
985 		if (copy_to_user(optval, &stats, stats_size))
986 			return -EFAULT;
987 		if (put_user(stats_size, optlen))
988 			return -EFAULT;
989 
990 		return 0;
991 	}
992 	case XDP_MMAP_OFFSETS:
993 	{
994 		struct xdp_mmap_offsets off;
995 		struct xdp_mmap_offsets_v1 off_v1;
996 		bool flags_supported = true;
997 		void *to_copy;
998 
999 		if (len < sizeof(off_v1))
1000 			return -EINVAL;
1001 		else if (len < sizeof(off))
1002 			flags_supported = false;
1003 
1004 		if (flags_supported) {
1005 			/* xdp_ring_offset is identical to xdp_ring_offset_v1
1006 			 * except for the flags field added to the end.
1007 			 */
1008 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1009 					       &off.rx);
1010 			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1011 					       &off.tx);
1012 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1013 					       &off.fr);
1014 			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1015 					       &off.cr);
1016 			off.rx.flags = offsetof(struct xdp_rxtx_ring,
1017 						ptrs.flags);
1018 			off.tx.flags = offsetof(struct xdp_rxtx_ring,
1019 						ptrs.flags);
1020 			off.fr.flags = offsetof(struct xdp_umem_ring,
1021 						ptrs.flags);
1022 			off.cr.flags = offsetof(struct xdp_umem_ring,
1023 						ptrs.flags);
1024 
1025 			len = sizeof(off);
1026 			to_copy = &off;
1027 		} else {
1028 			xsk_enter_rxtx_offsets(&off_v1.rx);
1029 			xsk_enter_rxtx_offsets(&off_v1.tx);
1030 			xsk_enter_umem_offsets(&off_v1.fr);
1031 			xsk_enter_umem_offsets(&off_v1.cr);
1032 
1033 			len = sizeof(off_v1);
1034 			to_copy = &off_v1;
1035 		}
1036 
1037 		if (copy_to_user(optval, to_copy, len))
1038 			return -EFAULT;
1039 		if (put_user(len, optlen))
1040 			return -EFAULT;
1041 
1042 		return 0;
1043 	}
1044 	case XDP_OPTIONS:
1045 	{
1046 		struct xdp_options opts = {};
1047 
1048 		if (len < sizeof(opts))
1049 			return -EINVAL;
1050 
1051 		mutex_lock(&xs->mutex);
1052 		if (xs->zc)
1053 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
1054 		mutex_unlock(&xs->mutex);
1055 
1056 		len = sizeof(opts);
1057 		if (copy_to_user(optval, &opts, len))
1058 			return -EFAULT;
1059 		if (put_user(len, optlen))
1060 			return -EFAULT;
1061 
1062 		return 0;
1063 	}
1064 	default:
1065 		break;
1066 	}
1067 
1068 	return -EOPNOTSUPP;
1069 }
1070 
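/* Map one of the four rings into userspace, selected by the mmap offset
 * (XDP_PGOFF_RX_RING and friends). The ring must have been created with the
 * corresponding setsockopt() first.
 */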
1071 static int xsk_mmap(struct file *file, struct socket *sock,
1072 		    struct vm_area_struct *vma)
1073 {
1074 	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1075 	unsigned long size = vma->vm_end - vma->vm_start;
1076 	struct xdp_sock *xs = xdp_sk(sock->sk);
1077 	struct xsk_queue *q = NULL;
1078 	unsigned long pfn;
1079 	struct page *qpg;
1080 
1081 	if (READ_ONCE(xs->state) != XSK_READY)
1082 		return -EBUSY;
1083 
1084 	if (offset == XDP_PGOFF_RX_RING) {
1085 		q = READ_ONCE(xs->rx);
1086 	} else if (offset == XDP_PGOFF_TX_RING) {
1087 		q = READ_ONCE(xs->tx);
1088 	} else {
1089 		/* Matches the smp_wmb() in XDP_UMEM_REG */
1090 		smp_rmb();
1091 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
1092 			q = READ_ONCE(xs->fq_tmp);
1093 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1094 			q = READ_ONCE(xs->cq_tmp);
1095 	}
1096 
1097 	if (!q)
1098 		return -EINVAL;
1099 
1100 	/* Matches the smp_wmb() in xsk_init_queue */
1101 	smp_rmb();
1102 	qpg = virt_to_head_page(q->ring);
1103 	if (size > page_size(qpg))
1104 		return -EINVAL;
1105 
1106 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1107 	return remap_pfn_range(vma, vma->vm_start, pfn,
1108 			       size, vma->vm_page_prot);
1109 }
1110 
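/* Netdev notifier: when a device is unregistered, unbind every socket that
 * is bound to it and report ENETDOWN to its owner.
 */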
1111 static int xsk_notifier(struct notifier_block *this,
1112 			unsigned long msg, void *ptr)
1113 {
1114 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1115 	struct net *net = dev_net(dev);
1116 	struct sock *sk;
1117 
1118 	switch (msg) {
1119 	case NETDEV_UNREGISTER:
1120 		mutex_lock(&net->xdp.lock);
1121 		sk_for_each(sk, &net->xdp.list) {
1122 			struct xdp_sock *xs = xdp_sk(sk);
1123 
1124 			mutex_lock(&xs->mutex);
1125 			if (xs->dev == dev) {
1126 				sk->sk_err = ENETDOWN;
1127 				if (!sock_flag(sk, SOCK_DEAD))
1128 					sk->sk_error_report(sk);
1129 
1130 				xsk_unbind_dev(xs);
1131 
1132 				/* Clear device references. */
1133 				xp_clear_dev(xs->pool);
1134 			}
1135 			mutex_unlock(&xs->mutex);
1136 		}
1137 		mutex_unlock(&net->xdp.lock);
1138 		break;
1139 	}
1140 	return NOTIFY_DONE;
1141 }
1142 
1143 static struct proto xsk_proto = {
1144 	.name =		"XDP",
1145 	.owner =	THIS_MODULE,
1146 	.obj_size =	sizeof(struct xdp_sock),
1147 };
1148 
1149 static const struct proto_ops xsk_proto_ops = {
1150 	.family		= PF_XDP,
1151 	.owner		= THIS_MODULE,
1152 	.release	= xsk_release,
1153 	.bind		= xsk_bind,
1154 	.connect	= sock_no_connect,
1155 	.socketpair	= sock_no_socketpair,
1156 	.accept		= sock_no_accept,
1157 	.getname	= sock_no_getname,
1158 	.poll		= xsk_poll,
1159 	.ioctl		= sock_no_ioctl,
1160 	.listen		= sock_no_listen,
1161 	.shutdown	= sock_no_shutdown,
1162 	.setsockopt	= xsk_setsockopt,
1163 	.getsockopt	= xsk_getsockopt,
1164 	.sendmsg	= xsk_sendmsg,
1165 	.recvmsg	= sock_no_recvmsg,
1166 	.mmap		= xsk_mmap,
1167 	.sendpage	= sock_no_sendpage,
1168 };
1169 
1170 static void xsk_destruct(struct sock *sk)
1171 {
1172 	struct xdp_sock *xs = xdp_sk(sk);
1173 
1174 	if (!sock_flag(sk, SOCK_DEAD))
1175 		return;
1176 
1177 	if (!xp_put_pool(xs->pool))
1178 		xdp_put_umem(xs->umem, !xs->pool);
1179 
1180 	sk_refcnt_debug_dec(sk);
1181 }
1182 
1183 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1184 		      int kern)
1185 {
1186 	struct xdp_sock *xs;
1187 	struct sock *sk;
1188 
1189 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
1190 		return -EPERM;
1191 	if (sock->type != SOCK_RAW)
1192 		return -ESOCKTNOSUPPORT;
1193 
1194 	if (protocol)
1195 		return -EPROTONOSUPPORT;
1196 
1197 	sock->state = SS_UNCONNECTED;
1198 
1199 	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1200 	if (!sk)
1201 		return -ENOBUFS;
1202 
1203 	sock->ops = &xsk_proto_ops;
1204 
1205 	sock_init_data(sock, sk);
1206 
1207 	sk->sk_family = PF_XDP;
1208 
1209 	sk->sk_destruct = xsk_destruct;
1210 	sk_refcnt_debug_inc(sk);
1211 
1212 	sock_set_flag(sk, SOCK_RCU_FREE);
1213 
1214 	xs = xdp_sk(sk);
1215 	xs->state = XSK_READY;
1216 	mutex_init(&xs->mutex);
1217 	spin_lock_init(&xs->rx_lock);
1218 
1219 	INIT_LIST_HEAD(&xs->map_list);
1220 	spin_lock_init(&xs->map_list_lock);
1221 
1222 	mutex_lock(&net->xdp.lock);
1223 	sk_add_node_rcu(sk, &net->xdp.list);
1224 	mutex_unlock(&net->xdp.lock);
1225 
1226 	local_bh_disable();
1227 	sock_prot_inuse_add(net, &xsk_proto, 1);
1228 	local_bh_enable();
1229 
1230 	return 0;
1231 }
1232 
1233 static const struct net_proto_family xsk_family_ops = {
1234 	.family = PF_XDP,
1235 	.create = xsk_create,
1236 	.owner	= THIS_MODULE,
1237 };
1238 
1239 static struct notifier_block xsk_netdev_notifier = {
1240 	.notifier_call	= xsk_notifier,
1241 };
1242 
1243 static int __net_init xsk_net_init(struct net *net)
1244 {
1245 	mutex_init(&net->xdp.lock);
1246 	INIT_HLIST_HEAD(&net->xdp.list);
1247 	return 0;
1248 }
1249 
1250 static void __net_exit xsk_net_exit(struct net *net)
1251 {
1252 	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1253 }
1254 
1255 static struct pernet_operations xsk_net_ops = {
1256 	.init = xsk_net_init,
1257 	.exit = xsk_net_exit,
1258 };
1259 
1260 static int __init xsk_init(void)
1261 {
1262 	int err, cpu;
1263 
1264 	err = proto_register(&xsk_proto, 0 /* no slab */);
1265 	if (err)
1266 		goto out;
1267 
1268 	err = sock_register(&xsk_family_ops);
1269 	if (err)
1270 		goto out_proto;
1271 
1272 	err = register_pernet_subsys(&xsk_net_ops);
1273 	if (err)
1274 		goto out_sk;
1275 
1276 	err = register_netdevice_notifier(&xsk_netdev_notifier);
1277 	if (err)
1278 		goto out_pernet;
1279 
1280 	for_each_possible_cpu(cpu)
1281 		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1282 	return 0;
1283 
1284 out_pernet:
1285 	unregister_pernet_subsys(&xsk_net_ops);
1286 out_sk:
1287 	sock_unregister(PF_XDP);
1288 out_proto:
1289 	proto_unregister(&xsk_proto);
1290 out:
1291 	return err;
1292 }
1293 
1294 fs_initcall(xsk_init);
1295