// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}

bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
	return xskq_has_addrs(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
	return xskq_peek_addr(umem->fq, addr, umem);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
	xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);

void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
	if (umem->need_wakeup & XDP_WAKEUP_RX)
		return;

	umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	umem->need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (umem->need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
	if (!(umem->need_wakeup & XDP_WAKEUP_RX))
		return;

	umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	umem->need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (!(umem->need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
	return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
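
/* Editor's note: a minimal, hedged sketch of how a zero-copy driver might
 * use the need_wakeup helpers above from its NAPI poll path. The ring
 * structure and my_refill_rx_from_fq() are hypothetical; only the
 * xsk_*_need_wakeup() calls come from this file. The idea is to set the
 * flag when the fill queue runs dry, so userspace knows it has to kick
 * the kernel after producing new fill entries.
 *
 *	static void my_update_rx_wakeup(struct my_rx_ring *ring,
 *					struct xdp_umem *umem)
 *	{
 *		bool refilled = my_refill_rx_from_fq(ring);
 *
 *		if (!xsk_umem_uses_need_wakeup(umem))
 *			return;
 *
 *		if (!refilled)
 *			xsk_set_rx_need_wakeup(umem);
 *		else
 *			xsk_clear_rx_need_wakeup(umem);
 *	}
 */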

/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
 * each page. This is only required in copy mode.
 */
static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
			     u32 len, u32 metalen)
{
	void *to_buf = xdp_umem_get_data(umem, addr);

	addr = xsk_umem_add_offset_to_addr(addr);
	if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
		void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
		u64 page_start = addr & ~(PAGE_SIZE - 1);
		u64 first_len = PAGE_SIZE - (addr - page_start);

		memcpy(to_buf, from_buf, first_len + metalen);
		memcpy(next_pg_addr, from_buf + first_len, len - first_len);

		return;
	}

	memcpy(to_buf, from_buf, len + metalen);
}
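
/* Editor's note: a short worked example of the split above, assuming
 * PAGE_SIZE == 4096, metalen == 0, len == 300 and an addr that sits 96
 * bytes before the end of its page (addr - page_start == 4000):
 *
 *	first_len = 4096 - 4000 = 96
 *	memcpy(to_buf, from_buf, 96);                    first page
 *	memcpy(next_pg_addr, from_buf + 96, 300 - 96);   next umem page
 *
 * The two destination pages need not be virtually contiguous, which is
 * why next_pg_addr is looked up in umem->pages[] instead of being
 * computed from to_buf.
 */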

static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	u64 offset = xs->umem->headroom;
	u64 addr, memcpy_addr;
	void *from_buf;
	u32 metalen;
	int err;

	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	if (unlikely(xdp_data_meta_unsupported(xdp))) {
		from_buf = xdp->data;
		metalen = 0;
	} else {
		from_buf = xdp->data_meta;
		metalen = xdp->data - xdp->data_meta;
	}

	memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
	__xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);

	offset += metalen;
	addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xdp_return_buff(xdp);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}

static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

	if (err)
		xs->rx_dropped++;

	return err;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len;

	if (!xsk_is_bound(xs))
		return -EINVAL;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 metalen = xdp->data - xdp->data_meta;
	u32 len = xdp->data_end - xdp->data;
	u64 offset = xs->umem->headroom;
	void *buffer;
	u64 addr;
	int err;

	spin_lock_bh(&xs->rx_lock);

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		err = -ENOSPC;
		goto out_drop;
	}

	addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data_meta, len + metalen);

	addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (err)
		goto out_drop;

	xskq_discard_addr(xs->umem->fq);
	xskq_produce_flush_desc(xs->rx);

	spin_unlock_bh(&xs->rx_lock);

	xs->sk.sk_data_ready(&xs->sk);
	return 0;

out_drop:
	xs->rx_dropped++;
out_unlock:
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		if (!xskq_peek_desc(xs->tx, desc, umem))
			continue;

		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
			goto out;

		xskq_discard_desc(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);
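
/* Editor's note: a hedged sketch of how a zero-copy driver's transmit
 * path might string together the three exports above. my_xmit_one(),
 * my_tx_ring and MY_TX_BUDGET are hypothetical; xsk_umem_consume_tx()
 * and xsk_umem_consume_tx_done() are the entry points defined in this
 * file, and xsk_umem_complete_tx() is what the completion path calls
 * once the hardware has finished with n frames.
 *
 *	static void my_xsk_tx(struct my_tx_ring *ring, struct xdp_umem *umem)
 *	{
 *		struct xdp_desc desc;
 *		u32 sent = 0;
 *
 *		while (sent < MY_TX_BUDGET &&
 *		       xsk_umem_consume_tx(umem, &desc)) {
 *			my_xmit_one(ring, desc.addr, desc.len);
 *			sent++;
 *		}
 *		xsk_umem_consume_tx_done(umem);
 *	}
 */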

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
	int err;

	rcu_read_lock();
	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
	rcu_read_unlock();

	return err;
}

static int xsk_zc_xmit(struct xdp_sock *xs)
{
	return xsk_wakeup(xs, XDP_WAKEUP_TX);
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xdp_umem_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_discard_desc(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	return __xsk_sendmsg(sk);
}

static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xdp_umem *umem;

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	umem = xs->umem;

	if (umem->need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, umem->need_wakeup);
		else
			/* Poll needs to drive Tx also in copy mode */
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}
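
/* Editor's note: the userspace half of the need_wakeup protocol, as a
 * minimal sketch. It assumes the socket was bound with
 * XDP_USE_NEED_WAKEUP and that tx_flags points at the TX ring's flags
 * word, mapped via the offsets returned by the XDP_MMAP_OFFSETS
 * getsockopt further down in this file. Userspace only needs a syscall
 * when the flag is set; note that xsk_sendmsg() above requires
 * MSG_DONTWAIT.
 *
 *	#include <poll.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_xdp.h>
 *
 *	static void kick_tx_if_needed(int xsk_fd, const __u32 *tx_flags)
 *	{
 *		if (*tx_flags & XDP_RING_NEED_WAKEUP)
 *			sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 *	}
 *
 *	static void wait_for_rx(int xsk_fd)
 *	{
 *		struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };
 *
 *		poll(&pfd, 1, -1);
 *	}
 */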

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xdp_del_sk_umem(xs->umem, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		WARN_ON(xsk_map_inc(node->map));
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock **map_entry = NULL;
	struct xsk_map *map;

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		xsk_map_put(map);
	}
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

/* Check if umem pages are contiguous.
 * If zero-copy mode, use the DMA address to do the page contiguity check.
 * For all other modes we use addr (kernel virtual address).
 * Store the result in the low bits of addr.
 */
static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
{
	struct xdp_umem_page *pgs = umem->pages;
	int i, is_contig;

	for (i = 0; i < umem->npgs - 1; i++) {
		is_contig = (flags & XDP_ZEROCOPY) ?
			(pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
			(pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
		pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
	}
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}
		if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, xs->umem->size,
			      xs->umem->chunk_mask);
		xskq_set_umem(xs->umem->cq, xs->umem->size,
			      xs->umem->chunk_mask);

		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;

		xsk_check_page_contiguity(xs->umem, flags);
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}
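
/* Editor's note: a minimal userspace sketch of the bind() call handled
 * above, binding an already configured socket to queue 0 of an interface.
 * The interface name and flag choice are illustrative; the structure and
 * flag names come from <linux/if_xdp.h>.
 *
 *	#include <net/if.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_xdp.h>
 *
 *	static int bind_xsk(int xsk_fd, const char *ifname)
 *	{
 *		struct sockaddr_xdp sxdp = {
 *			.sxdp_family = AF_XDP,
 *			.sxdp_ifindex = if_nametoindex(ifname),
 *			.sxdp_queue_id = 0,
 *			.sxdp_flags = XDP_USE_NEED_WAKEUP,
 *		};
 *
 *		return bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 *	}
 *
 * A second socket can reuse the first one's umem by setting
 * XDP_SHARED_UMEM (with no other flags) and passing the first socket's
 * file descriptor in sxdp_shared_umem_fd.
 */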

struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
			/* Tx needs to be explicitly woken up the first time */
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		if (copy_from_user(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}
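
/* Editor's note: a hedged userspace sketch of the setsockopt sequence the
 * handler above accepts: register a umem, then size the four rings. The
 * buffer geometry (64 chunks of 2048 bytes) and ring sizes are arbitrary
 * examples; ring sizes must be powers of two (see xsk_init_queue()).
 *
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_xdp.h>
 *
 *	static int setup_umem_and_rings(int xsk_fd)
 *	{
 *		int fq_len = 64, cq_len = 64, rx_len = 64, tx_len = 64;
 *		struct xdp_umem_reg mr = {};
 *		void *bufs;
 *
 *		if (posix_memalign(&bufs, getpagesize(), 64 * 2048))
 *			return -1;
 *
 *		mr.addr = (unsigned long)bufs;
 *		mr.len = 64 * 2048;
 *		mr.chunk_size = 2048;
 *		mr.headroom = 0;
 *
 *		if (setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG,
 *			       &mr, sizeof(mr)) ||
 *		    setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_FILL_RING,
 *			       &fq_len, sizeof(fq_len)) ||
 *		    setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
 *			       &cq_len, sizeof(cq_len)) ||
 *		    setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING,
 *			       &rx_len, sizeof(rx_len)) ||
 *		    setsockopt(xsk_fd, SOL_XDP, XDP_TX_RING,
 *			       &tx_len, sizeof(tx_len)))
 *			return -1;
 *		return 0;
 *	}
 */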

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}
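
/* Editor's note: a small userspace sketch of the XDP_STATISTICS and
 * XDP_OPTIONS getsockopts handled above, e.g. for checking drop counters
 * and whether zero-copy was actually negotiated. Error handling is
 * trimmed for brevity.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_xdp.h>
 *
 *	static void dump_xsk_state(int xsk_fd)
 *	{
 *		struct xdp_statistics stats;
 *		struct xdp_options opts;
 *		socklen_t len;
 *
 *		len = sizeof(stats);
 *		getsockopt(xsk_fd, SOL_XDP, XDP_STATISTICS, &stats, &len);
 *		printf("rx_dropped: %llu\n",
 *		       (unsigned long long)stats.rx_dropped);
 *
 *		len = sizeof(opts);
 *		getsockopt(xsk_fd, SOL_XDP, XDP_OPTIONS, &opts, &len);
 *		printf("zero-copy: %s\n",
 *		       (opts.flags & XDP_OPTIONS_ZEROCOPY) ? "yes" : "no");
 *	}
 */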

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}
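
/* Editor's note: a hedged userspace sketch of mapping the RX ring that
 * xsk_mmap() serves at XDP_PGOFF_RX_RING. The offsets come from the
 * XDP_MMAP_OFFSETS getsockopt; rx_ring_size is the entry count passed to
 * the XDP_RX_RING setsockopt.
 *
 *	#include <sys/mman.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_xdp.h>
 *
 *	static void *map_rx_ring(int xsk_fd, int rx_ring_size,
 *				 struct xdp_mmap_offsets *off)
 *	{
 *		socklen_t optlen = sizeof(*off);
 *		size_t len;
 *
 *		if (getsockopt(xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS,
 *			       off, &optlen))
 *			return MAP_FAILED;
 *
 *		len = off->rx.desc + rx_ring_size * sizeof(struct xdp_desc);
 *		return mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			    MAP_SHARED | MAP_POPULATE, xsk_fd,
 *			    XDP_PGOFF_RX_RING);
 *	}
 *
 * The producer, consumer and flags words then live at off->rx.producer,
 * off->rx.consumer and off->rx.flags within the returned mapping.
 */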

static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk->sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references in umem. */
				xdp_umem_clear_dev(xs->umem);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);
	spin_lock_init(&xs->tx_completion_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}
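
/* Editor's note: creating the socket that xsk_create() backs is a single
 * call from userspace; the process needs CAP_NET_RAW (see the ns_capable()
 * check above), the type must be SOCK_RAW and the protocol must be 0.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	int xsk_fd = socket(AF_XDP, SOCK_RAW, 0);
 *
 *	if (xsk_fd < 0)
 *		perror("socket(AF_XDP)");
 */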

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);