1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94 #include <asm/unaligned.h>
95 #include <linux/capability.h>
96 #include <linux/errno.h>
97 #include <linux/errqueue.h>
98 #include <linux/types.h>
99 #include <linux/socket.h>
100 #include <linux/in.h>
101 #include <linux/kernel.h>
102 #include <linux/module.h>
103 #include <linux/proc_fs.h>
104 #include <linux/seq_file.h>
105 #include <linux/sched.h>
106 #include <linux/sched/mm.h>
107 #include <linux/timer.h>
108 #include <linux/string.h>
109 #include <linux/sockios.h>
110 #include <linux/net.h>
111 #include <linux/mm.h>
112 #include <linux/slab.h>
113 #include <linux/interrupt.h>
114 #include <linux/poll.h>
115 #include <linux/tcp.h>
116 #include <linux/init.h>
117 #include <linux/highmem.h>
118 #include <linux/user_namespace.h>
119 #include <linux/static_key.h>
120 #include <linux/memcontrol.h>
121 #include <linux/prefetch.h>
122
123 #include <linux/uaccess.h>
124
125 #include <linux/netdevice.h>
126 #include <net/protocol.h>
127 #include <linux/skbuff.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
137
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
140
141 #include <trace/events/sock.h>
142
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
145
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148
149 static void sock_inuse_add(struct net *net, int val);
150
151 /**
152 * sk_ns_capable - General socket capability test
153 * @sk: Socket to use a capability on or through
154 * @user_ns: The user namespace of the capability to use
155 * @cap: The capability to use
156 *
157  * Test to see if the opener of the socket had the capability @cap when
158  * the socket was created and the current process has the capability @cap
159  * in the user namespace @user_ns.
160 */
161 bool sk_ns_capable(const struct sock *sk,
162 struct user_namespace *user_ns, int cap)
163 {
164 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 ns_capable(user_ns, cap);
166 }
167 EXPORT_SYMBOL(sk_ns_capable);
168
169 /**
170 * sk_capable - Socket global capability test
171 * @sk: Socket to use a capability on or through
172 * @cap: The global capability to use
173 *
174  * Test to see if the opener of the socket had the capability @cap when
175  * the socket was created and the current process has the capability @cap
176  * in all user namespaces.
177 */
178 bool sk_capable(const struct sock *sk, int cap)
179 {
180 return sk_ns_capable(sk, &init_user_ns, cap);
181 }
182 EXPORT_SYMBOL(sk_capable);
183
184 /**
185 * sk_net_capable - Network namespace socket capability test
186 * @sk: Socket to use a capability on or through
187 * @cap: The capability to use
188 *
189  * Test to see if the opener of the socket had the capability @cap when the
190  * socket was created and the current process has the capability @cap over
191  * the network namespace the socket is a member of.
192 */
193 bool sk_net_capable(const struct sock *sk, int cap)
194 {
195 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 }
197 EXPORT_SYMBOL(sk_net_capable);
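/*
 * Illustrative sketch, not taken from this file: a protocol handler might
 * gate a privileged operation with the helpers above. The handler name and
 * option are hypothetical.
 *
 *	static int foo_set_priv_option(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return 0;
 *	}
 */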
198
199 /*
200 * Each address family might have different locking rules, so we have
201 * one slock key per address family and separate keys for internal and
202 * userspace sockets.
203 */
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
208
209 /*
210 * Make lock validator output more readable. (we pre-construct these
211 * strings build-time, so that runtime initialization of socket
212 * locks is fast):
213 */
214
215 #define _sock_locks(x) \
216 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
217 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
218 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
219 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
220 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
221 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
222 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
223 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
224 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
225 x "27" , x "28" , x "AF_CAN" , \
226 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
227 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
228 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
229 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
230 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
231 x "AF_MAX"
232
233 static const char *const af_family_key_strings[AF_MAX+1] = {
234 _sock_locks("sk_lock-")
235 };
236 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
237 _sock_locks("slock-")
238 };
239 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
240 _sock_locks("clock-")
241 };
242
243 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
244 _sock_locks("k-sk_lock-")
245 };
246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-slock-")
248 };
249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
250 _sock_locks("k-clock-")
251 };
252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
253 _sock_locks("rlock-")
254 };
255 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
256 _sock_locks("wlock-")
257 };
258 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
259 _sock_locks("elock-")
260 };
261
262 /*
263 * sk_callback_lock and sk queues locking rules are per-address-family,
264 * so split the lock classes by using a per-AF key:
265 */
266 static struct lock_class_key af_callback_keys[AF_MAX];
267 static struct lock_class_key af_rlock_keys[AF_MAX];
268 static struct lock_class_key af_wlock_keys[AF_MAX];
269 static struct lock_class_key af_elock_keys[AF_MAX];
270 static struct lock_class_key af_kern_callback_keys[AF_MAX];
271
272 /* Run time adjustable parameters. */
273 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
274 EXPORT_SYMBOL(sysctl_wmem_max);
275 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
276 EXPORT_SYMBOL(sysctl_rmem_max);
277 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
278 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
279
280 /* Maximal space eaten by iovec or ancillary data plus some space */
281 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
282 EXPORT_SYMBOL(sysctl_optmem_max);
283
284 int sysctl_tstamp_allow_data __read_mostly = 1;
285
286 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
287 EXPORT_SYMBOL_GPL(memalloc_socks_key);
288
289 /**
290 * sk_set_memalloc - sets %SOCK_MEMALLOC
291 * @sk: socket to set it on
292 *
293 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
294 * It's the responsibility of the admin to adjust min_free_kbytes
295  * to meet the requirements.
296 */
297 void sk_set_memalloc(struct sock *sk)
298 {
299 sock_set_flag(sk, SOCK_MEMALLOC);
300 sk->sk_allocation |= __GFP_MEMALLOC;
301 static_branch_inc(&memalloc_socks_key);
302 }
303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
304
305 void sk_clear_memalloc(struct sock *sk)
306 {
307 sock_reset_flag(sk, SOCK_MEMALLOC);
308 sk->sk_allocation &= ~__GFP_MEMALLOC;
309 static_branch_dec(&memalloc_socks_key);
310
311 /*
312 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
313 * progress of swapping. SOCK_MEMALLOC may be cleared while
314 * it has rmem allocations due to the last swapfile being deactivated
315 * but there is a risk that the socket is unusable due to exceeding
316 * the rmem limits. Reclaim the reserves and obey rmem limits again.
317 */
318 sk_mem_reclaim(sk);
319 }
320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
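/*
 * Minimal sketch (an assumption about typical callers, e.g. swap over a
 * network socket): the two helpers are paired around the lifetime of the
 * backing connection.
 *
 *	sk_set_memalloc(sk);	// allow access to memory reserves
 *	...
 *	sk_clear_memalloc(sk);	// reclaim reserves, obey rmem limits again
 */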
321
322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
323 {
324 int ret;
325 unsigned int noreclaim_flag;
326
327 /* these should have been dropped before queueing */
328 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
329
330 noreclaim_flag = memalloc_noreclaim_save();
331 ret = sk->sk_backlog_rcv(sk, skb);
332 memalloc_noreclaim_restore(noreclaim_flag);
333
334 return ret;
335 }
336 EXPORT_SYMBOL(__sk_backlog_rcv);
337
338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
339 {
340 struct timeval tv;
341
342 if (optlen < sizeof(tv))
343 return -EINVAL;
344 if (copy_from_user(&tv, optval, sizeof(tv)))
345 return -EFAULT;
346 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
347 return -EDOM;
348
349 if (tv.tv_sec < 0) {
350 static int warned __read_mostly;
351
352 *timeo_p = 0;
353 if (warned < 10 && net_ratelimit()) {
354 warned++;
355 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
356 __func__, current->comm, task_pid_nr(current));
357 }
358 return 0;
359 }
360 *timeo_p = MAX_SCHEDULE_TIMEOUT;
361 if (tv.tv_sec == 0 && tv.tv_usec == 0)
362 return 0;
363 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
364 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
365 return 0;
366 }
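/*
 * Illustrative userspace sketch (not part of this file): SO_RCVTIMEO and
 * SO_SNDTIMEO pass a struct timeval, which the helper above converts to
 * jiffies. A zero timeval means "wait forever".
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */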
367
368 static void sock_warn_obsolete_bsdism(const char *name)
369 {
370 static int warned;
371 static char warncomm[TASK_COMM_LEN];
372 if (strcmp(warncomm, current->comm) && warned < 5) {
373 strcpy(warncomm, current->comm);
374 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
375 warncomm, name);
376 warned++;
377 }
378 }
379
380 static bool sock_needs_netstamp(const struct sock *sk)
381 {
382 switch (sk->sk_family) {
383 case AF_UNSPEC:
384 case AF_UNIX:
385 return false;
386 default:
387 return true;
388 }
389 }
390
391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
392 {
393 if (sk->sk_flags & flags) {
394 sk->sk_flags &= ~flags;
395 if (sock_needs_netstamp(sk) &&
396 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
397 net_disable_timestamp();
398 }
399 }
400
401
402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
403 {
404 unsigned long flags;
405 struct sk_buff_head *list = &sk->sk_receive_queue;
406
407 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
408 atomic_inc(&sk->sk_drops);
409 trace_sock_rcvqueue_full(sk, skb);
410 return -ENOMEM;
411 }
412
413 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
414 atomic_inc(&sk->sk_drops);
415 return -ENOBUFS;
416 }
417
418 skb->dev = NULL;
419 skb_set_owner_r(skb, sk);
420
421 	/* we escape from the rcu protected region, make sure we don't leak
422 * a norefcounted dst
423 */
424 skb_dst_force(skb);
425
426 spin_lock_irqsave(&list->lock, flags);
427 sock_skb_set_dropcount(sk, skb);
428 __skb_queue_tail(list, skb);
429 spin_unlock_irqrestore(&list->lock, flags);
430
431 if (!sock_flag(sk, SOCK_DEAD))
432 sk->sk_data_ready(sk);
433 return 0;
434 }
435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
436
437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
438 {
439 int err;
440
441 err = sk_filter(sk, skb);
442 if (err)
443 return err;
444
445 return __sock_queue_rcv_skb(sk, skb);
446 }
447 EXPORT_SYMBOL(sock_queue_rcv_skb);
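/*
 * Illustrative sketch of a protocol receive path (the handler name is
 * hypothetical): queue via sock_queue_rcv_skb() so the socket filter runs,
 * and free the skb yourself on failure.
 *
 *	static int foo_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */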
448
449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
450 const int nested, unsigned int trim_cap, bool refcounted)
451 {
452 int rc = NET_RX_SUCCESS;
453
454 if (sk_filter_trim_cap(sk, skb, trim_cap))
455 goto discard_and_relse;
456
457 skb->dev = NULL;
458
459 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
460 atomic_inc(&sk->sk_drops);
461 goto discard_and_relse;
462 }
463 if (nested)
464 bh_lock_sock_nested(sk);
465 else
466 bh_lock_sock(sk);
467 if (!sock_owned_by_user(sk)) {
468 /*
469 * trylock + unlock semantics:
470 */
471 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
472
473 rc = sk_backlog_rcv(sk, skb);
474
475 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
476 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
477 bh_unlock_sock(sk);
478 atomic_inc(&sk->sk_drops);
479 goto discard_and_relse;
480 }
481
482 bh_unlock_sock(sk);
483 out:
484 if (refcounted)
485 sock_put(sk);
486 return rc;
487 discard_and_relse:
488 kfree_skb(skb);
489 goto out;
490 }
491 EXPORT_SYMBOL(__sk_receive_skb);
492
493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
494 {
495 struct dst_entry *dst = __sk_dst_get(sk);
496
497 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
498 sk_tx_queue_clear(sk);
499 sk->sk_dst_pending_confirm = 0;
500 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
501 dst_release(dst);
502 return NULL;
503 }
504
505 return dst;
506 }
507 EXPORT_SYMBOL(__sk_dst_check);
508
509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
510 {
511 struct dst_entry *dst = sk_dst_get(sk);
512
513 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
514 sk_dst_reset(sk);
515 dst_release(dst);
516 return NULL;
517 }
518
519 return dst;
520 }
521 EXPORT_SYMBOL(sk_dst_check);
522
523 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
524 int optlen)
525 {
526 int ret = -ENOPROTOOPT;
527 #ifdef CONFIG_NETDEVICES
528 struct net *net = sock_net(sk);
529 char devname[IFNAMSIZ];
530 int index;
531
532 /* Sorry... */
533 ret = -EPERM;
534 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
535 goto out;
536
537 ret = -EINVAL;
538 if (optlen < 0)
539 goto out;
540
541 /* Bind this socket to a particular device like "eth0",
542 * as specified in the passed interface name. If the
543 * name is "" or the option length is zero the socket
544 * is not bound.
545 */
546 if (optlen > IFNAMSIZ - 1)
547 optlen = IFNAMSIZ - 1;
548 memset(devname, 0, sizeof(devname));
549
550 ret = -EFAULT;
551 if (copy_from_user(devname, optval, optlen))
552 goto out;
553
554 index = 0;
555 if (devname[0] != '\0') {
556 struct net_device *dev;
557
558 rcu_read_lock();
559 dev = dev_get_by_name_rcu(net, devname);
560 if (dev)
561 index = dev->ifindex;
562 rcu_read_unlock();
563 ret = -ENODEV;
564 if (!dev)
565 goto out;
566 }
567
568 lock_sock(sk);
569 sk->sk_bound_dev_if = index;
570 sk_dst_reset(sk);
571 release_sock(sk);
572
573 ret = 0;
574
575 out:
576 #endif
577
578 return ret;
579 }
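/*
 * Illustrative userspace sketch (an assumption, not part of this file):
 * binding a socket to a device by name; an empty name removes the binding.
 * Changing an existing binding requires CAP_NET_RAW, as enforced above.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */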
580
581 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
582 int __user *optlen, int len)
583 {
584 int ret = -ENOPROTOOPT;
585 #ifdef CONFIG_NETDEVICES
586 struct net *net = sock_net(sk);
587 char devname[IFNAMSIZ];
588
589 if (sk->sk_bound_dev_if == 0) {
590 len = 0;
591 goto zero;
592 }
593
594 ret = -EINVAL;
595 if (len < IFNAMSIZ)
596 goto out;
597
598 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
599 if (ret)
600 goto out;
601
602 len = strlen(devname) + 1;
603
604 ret = -EFAULT;
605 if (copy_to_user(optval, devname, len))
606 goto out;
607
608 zero:
609 ret = -EFAULT;
610 if (put_user(len, optlen))
611 goto out;
612
613 ret = 0;
614
615 out:
616 #endif
617
618 return ret;
619 }
620
621 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
622 {
623 if (valbool)
624 sock_set_flag(sk, bit);
625 else
626 sock_reset_flag(sk, bit);
627 }
628
629 bool sk_mc_loop(struct sock *sk)
630 {
631 if (dev_recursion_level())
632 return false;
633 if (!sk)
634 return true;
635 switch (sk->sk_family) {
636 case AF_INET:
637 return inet_sk(sk)->mc_loop;
638 #if IS_ENABLED(CONFIG_IPV6)
639 case AF_INET6:
640 return inet6_sk(sk)->mc_loop;
641 #endif
642 }
643 WARN_ON_ONCE(1);
644 return true;
645 }
646 EXPORT_SYMBOL(sk_mc_loop);
647
648 /*
649 * This is meant for all protocols to use and covers goings on
650 * at the socket level. Everything here is generic.
651 */
652
653 int sock_setsockopt(struct socket *sock, int level, int optname,
654 char __user *optval, unsigned int optlen)
655 {
656 struct sock_txtime sk_txtime;
657 struct sock *sk = sock->sk;
658 int val;
659 int valbool;
660 struct linger ling;
661 int ret = 0;
662
663 /*
664 * Options without arguments
665 */
666
667 if (optname == SO_BINDTODEVICE)
668 return sock_setbindtodevice(sk, optval, optlen);
669
670 if (optlen < sizeof(int))
671 return -EINVAL;
672
673 if (get_user(val, (int __user *)optval))
674 return -EFAULT;
675
676 valbool = val ? 1 : 0;
677
678 lock_sock(sk);
679
680 switch (optname) {
681 case SO_DEBUG:
682 if (val && !capable(CAP_NET_ADMIN))
683 ret = -EACCES;
684 else
685 sock_valbool_flag(sk, SOCK_DBG, valbool);
686 break;
687 case SO_REUSEADDR:
688 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
689 break;
690 case SO_REUSEPORT:
691 sk->sk_reuseport = valbool;
692 break;
693 case SO_TYPE:
694 case SO_PROTOCOL:
695 case SO_DOMAIN:
696 case SO_ERROR:
697 ret = -ENOPROTOOPT;
698 break;
699 case SO_DONTROUTE:
700 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
701 sk_dst_reset(sk);
702 break;
703 case SO_BROADCAST:
704 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
705 break;
706 case SO_SNDBUF:
707 		/* Don't error on this: BSD doesn't, and if you think
708 		 * about it, this is right. Otherwise apps have to
709 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
710 		 * are treated in BSD as hints.
711 */
712 val = min_t(u32, val, sysctl_wmem_max);
713 set_sndbuf:
714 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
715 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
716 /* Wake up sending tasks if we upped the value. */
717 sk->sk_write_space(sk);
718 break;
719
720 case SO_SNDBUFFORCE:
721 if (!capable(CAP_NET_ADMIN)) {
722 ret = -EPERM;
723 break;
724 }
725 goto set_sndbuf;
726
727 case SO_RCVBUF:
728 		/* Don't error on this: BSD doesn't, and if you think
729 		 * about it, this is right. Otherwise apps have to
730 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
731 		 * are treated in BSD as hints.
732 */
733 val = min_t(u32, val, sysctl_rmem_max);
734 set_rcvbuf:
735 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
736 /*
737 * We double it on the way in to account for
738 * "struct sk_buff" etc. overhead. Applications
739 * assume that the SO_RCVBUF setting they make will
740 * allow that much actual data to be received on that
741 * socket.
742 *
743 * Applications are unaware that "struct sk_buff" and
744 * other overheads allocate from the receive buffer
745 * during socket buffer allocation.
746 *
747 * And after considering the possible alternatives,
748 * returning the value we actually used in getsockopt
749 * is the most desirable behavior.
750 */
751 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
752 break;
753
754 case SO_RCVBUFFORCE:
755 if (!capable(CAP_NET_ADMIN)) {
756 ret = -EPERM;
757 break;
758 }
759 goto set_rcvbuf;
760
761 case SO_KEEPALIVE:
762 if (sk->sk_prot->keepalive)
763 sk->sk_prot->keepalive(sk, valbool);
764 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
765 break;
766
767 case SO_OOBINLINE:
768 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
769 break;
770
771 case SO_NO_CHECK:
772 sk->sk_no_check_tx = valbool;
773 break;
774
775 case SO_PRIORITY:
776 if ((val >= 0 && val <= 6) ||
777 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
778 sk->sk_priority = val;
779 else
780 ret = -EPERM;
781 break;
782
783 case SO_LINGER:
784 if (optlen < sizeof(ling)) {
785 ret = -EINVAL; /* 1003.1g */
786 break;
787 }
788 if (copy_from_user(&ling, optval, sizeof(ling))) {
789 ret = -EFAULT;
790 break;
791 }
792 if (!ling.l_onoff)
793 sock_reset_flag(sk, SOCK_LINGER);
794 else {
795 #if (BITS_PER_LONG == 32)
796 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
797 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
798 else
799 #endif
800 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
801 sock_set_flag(sk, SOCK_LINGER);
802 }
803 break;
804
805 case SO_BSDCOMPAT:
806 sock_warn_obsolete_bsdism("setsockopt");
807 break;
808
809 case SO_PASSCRED:
810 if (valbool)
811 set_bit(SOCK_PASSCRED, &sock->flags);
812 else
813 clear_bit(SOCK_PASSCRED, &sock->flags);
814 break;
815
816 case SO_TIMESTAMP:
817 case SO_TIMESTAMPNS:
818 if (valbool) {
819 if (optname == SO_TIMESTAMP)
820 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
821 else
822 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
823 sock_set_flag(sk, SOCK_RCVTSTAMP);
824 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
825 } else {
826 sock_reset_flag(sk, SOCK_RCVTSTAMP);
827 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
828 }
829 break;
830
831 case SO_TIMESTAMPING:
832 if (val & ~SOF_TIMESTAMPING_MASK) {
833 ret = -EINVAL;
834 break;
835 }
836
837 if (val & SOF_TIMESTAMPING_OPT_ID &&
838 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
839 if (sk->sk_protocol == IPPROTO_TCP &&
840 sk->sk_type == SOCK_STREAM) {
841 if ((1 << sk->sk_state) &
842 (TCPF_CLOSE | TCPF_LISTEN)) {
843 ret = -EINVAL;
844 break;
845 }
846 sk->sk_tskey = tcp_sk(sk)->snd_una;
847 } else {
848 sk->sk_tskey = 0;
849 }
850 }
851
852 if (val & SOF_TIMESTAMPING_OPT_STATS &&
853 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
854 ret = -EINVAL;
855 break;
856 }
857
858 sk->sk_tsflags = val;
859 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
860 sock_enable_timestamp(sk,
861 SOCK_TIMESTAMPING_RX_SOFTWARE);
862 else
863 sock_disable_timestamp(sk,
864 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
865 break;
866
867 case SO_RCVLOWAT:
868 if (val < 0)
869 val = INT_MAX;
870 if (sock->ops->set_rcvlowat)
871 ret = sock->ops->set_rcvlowat(sk, val);
872 else
873 sk->sk_rcvlowat = val ? : 1;
874 break;
875
876 case SO_RCVTIMEO:
877 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
878 break;
879
880 case SO_SNDTIMEO:
881 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
882 break;
883
884 case SO_ATTACH_FILTER:
885 ret = -EINVAL;
886 if (optlen == sizeof(struct sock_fprog)) {
887 struct sock_fprog fprog;
888
889 ret = -EFAULT;
890 if (copy_from_user(&fprog, optval, sizeof(fprog)))
891 break;
892
893 ret = sk_attach_filter(&fprog, sk);
894 }
895 break;
896
897 case SO_ATTACH_BPF:
898 ret = -EINVAL;
899 if (optlen == sizeof(u32)) {
900 u32 ufd;
901
902 ret = -EFAULT;
903 if (copy_from_user(&ufd, optval, sizeof(ufd)))
904 break;
905
906 ret = sk_attach_bpf(ufd, sk);
907 }
908 break;
909
910 case SO_ATTACH_REUSEPORT_CBPF:
911 ret = -EINVAL;
912 if (optlen == sizeof(struct sock_fprog)) {
913 struct sock_fprog fprog;
914
915 ret = -EFAULT;
916 if (copy_from_user(&fprog, optval, sizeof(fprog)))
917 break;
918
919 ret = sk_reuseport_attach_filter(&fprog, sk);
920 }
921 break;
922
923 case SO_ATTACH_REUSEPORT_EBPF:
924 ret = -EINVAL;
925 if (optlen == sizeof(u32)) {
926 u32 ufd;
927
928 ret = -EFAULT;
929 if (copy_from_user(&ufd, optval, sizeof(ufd)))
930 break;
931
932 ret = sk_reuseport_attach_bpf(ufd, sk);
933 }
934 break;
935
936 case SO_DETACH_FILTER:
937 ret = sk_detach_filter(sk);
938 break;
939
940 case SO_LOCK_FILTER:
941 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
942 ret = -EPERM;
943 else
944 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
945 break;
946
947 case SO_PASSSEC:
948 if (valbool)
949 set_bit(SOCK_PASSSEC, &sock->flags);
950 else
951 clear_bit(SOCK_PASSSEC, &sock->flags);
952 break;
953 case SO_MARK:
954 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
955 ret = -EPERM;
956 else
957 sk->sk_mark = val;
958 break;
959
960 case SO_RXQ_OVFL:
961 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
962 break;
963
964 case SO_WIFI_STATUS:
965 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
966 break;
967
968 case SO_PEEK_OFF:
969 if (sock->ops->set_peek_off)
970 ret = sock->ops->set_peek_off(sk, val);
971 else
972 ret = -EOPNOTSUPP;
973 break;
974
975 case SO_NOFCS:
976 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
977 break;
978
979 case SO_SELECT_ERR_QUEUE:
980 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
981 break;
982
983 #ifdef CONFIG_NET_RX_BUSY_POLL
984 case SO_BUSY_POLL:
985 /* allow unprivileged users to decrease the value */
986 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
987 ret = -EPERM;
988 else {
989 if (val < 0)
990 ret = -EINVAL;
991 else
992 sk->sk_ll_usec = val;
993 }
994 break;
995 #endif
996
997 case SO_MAX_PACING_RATE:
998 if (val != ~0U)
999 cmpxchg(&sk->sk_pacing_status,
1000 SK_PACING_NONE,
1001 SK_PACING_NEEDED);
1002 sk->sk_max_pacing_rate = val;
1003 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1004 sk->sk_max_pacing_rate);
1005 break;
1006
1007 case SO_INCOMING_CPU:
1008 WRITE_ONCE(sk->sk_incoming_cpu, val);
1009 break;
1010
1011 case SO_CNX_ADVICE:
1012 if (val == 1)
1013 dst_negative_advice(sk);
1014 break;
1015
1016 case SO_ZEROCOPY:
1017 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1018 if (sk->sk_protocol != IPPROTO_TCP)
1019 ret = -ENOTSUPP;
1020 } else if (sk->sk_family != PF_RDS) {
1021 ret = -ENOTSUPP;
1022 }
1023 if (!ret) {
1024 if (val < 0 || val > 1)
1025 ret = -EINVAL;
1026 else
1027 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1028 }
1029 break;
1030
1031 case SO_TXTIME:
1032 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1033 ret = -EPERM;
1034 } else if (optlen != sizeof(struct sock_txtime)) {
1035 ret = -EINVAL;
1036 } else if (copy_from_user(&sk_txtime, optval,
1037 sizeof(struct sock_txtime))) {
1038 ret = -EFAULT;
1039 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1040 ret = -EINVAL;
1041 } else {
1042 sock_valbool_flag(sk, SOCK_TXTIME, true);
1043 sk->sk_clockid = sk_txtime.clockid;
1044 sk->sk_txtime_deadline_mode =
1045 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1046 sk->sk_txtime_report_errors =
1047 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1048 }
1049 break;
1050
1051 default:
1052 ret = -ENOPROTOOPT;
1053 break;
1054 }
1055 release_sock(sk);
1056 return ret;
1057 }
1058 EXPORT_SYMBOL(sock_setsockopt);
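/*
 * Illustrative userspace sketch (an assumption): the SO_SNDBUF/SO_RCVBUF
 * paths above double the requested value to account for sk_buff overhead,
 * so getsockopt() reports roughly twice what was set.
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);	// ~131072
 */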
1059
1060
1061 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1062 struct ucred *ucred)
1063 {
1064 ucred->pid = pid_vnr(pid);
1065 ucred->uid = ucred->gid = -1;
1066 if (cred) {
1067 struct user_namespace *current_ns = current_user_ns();
1068
1069 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1070 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1071 }
1072 }
1073
1074 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1075 {
1076 struct user_namespace *user_ns = current_user_ns();
1077 int i;
1078
1079 for (i = 0; i < src->ngroups; i++)
1080 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1081 return -EFAULT;
1082
1083 return 0;
1084 }
1085
1086 int sock_getsockopt(struct socket *sock, int level, int optname,
1087 char __user *optval, int __user *optlen)
1088 {
1089 struct sock *sk = sock->sk;
1090
1091 union {
1092 int val;
1093 u64 val64;
1094 struct linger ling;
1095 struct timeval tm;
1096 struct sock_txtime txtime;
1097 } v;
1098
1099 int lv = sizeof(int);
1100 int len;
1101
1102 if (get_user(len, optlen))
1103 return -EFAULT;
1104 if (len < 0)
1105 return -EINVAL;
1106
1107 memset(&v, 0, sizeof(v));
1108
1109 switch (optname) {
1110 case SO_DEBUG:
1111 v.val = sock_flag(sk, SOCK_DBG);
1112 break;
1113
1114 case SO_DONTROUTE:
1115 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1116 break;
1117
1118 case SO_BROADCAST:
1119 v.val = sock_flag(sk, SOCK_BROADCAST);
1120 break;
1121
1122 case SO_SNDBUF:
1123 v.val = sk->sk_sndbuf;
1124 break;
1125
1126 case SO_RCVBUF:
1127 v.val = sk->sk_rcvbuf;
1128 break;
1129
1130 case SO_REUSEADDR:
1131 v.val = sk->sk_reuse;
1132 break;
1133
1134 case SO_REUSEPORT:
1135 v.val = sk->sk_reuseport;
1136 break;
1137
1138 case SO_KEEPALIVE:
1139 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1140 break;
1141
1142 case SO_TYPE:
1143 v.val = sk->sk_type;
1144 break;
1145
1146 case SO_PROTOCOL:
1147 v.val = sk->sk_protocol;
1148 break;
1149
1150 case SO_DOMAIN:
1151 v.val = sk->sk_family;
1152 break;
1153
1154 case SO_ERROR:
1155 v.val = -sock_error(sk);
1156 if (v.val == 0)
1157 v.val = xchg(&sk->sk_err_soft, 0);
1158 break;
1159
1160 case SO_OOBINLINE:
1161 v.val = sock_flag(sk, SOCK_URGINLINE);
1162 break;
1163
1164 case SO_NO_CHECK:
1165 v.val = sk->sk_no_check_tx;
1166 break;
1167
1168 case SO_PRIORITY:
1169 v.val = sk->sk_priority;
1170 break;
1171
1172 case SO_LINGER:
1173 lv = sizeof(v.ling);
1174 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1175 v.ling.l_linger = sk->sk_lingertime / HZ;
1176 break;
1177
1178 case SO_BSDCOMPAT:
1179 sock_warn_obsolete_bsdism("getsockopt");
1180 break;
1181
1182 case SO_TIMESTAMP:
1183 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1184 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1185 break;
1186
1187 case SO_TIMESTAMPNS:
1188 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1189 break;
1190
1191 case SO_TIMESTAMPING:
1192 v.val = sk->sk_tsflags;
1193 break;
1194
1195 case SO_RCVTIMEO:
1196 lv = sizeof(struct timeval);
1197 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1198 v.tm.tv_sec = 0;
1199 v.tm.tv_usec = 0;
1200 } else {
1201 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1202 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1203 }
1204 break;
1205
1206 case SO_SNDTIMEO:
1207 lv = sizeof(struct timeval);
1208 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1209 v.tm.tv_sec = 0;
1210 v.tm.tv_usec = 0;
1211 } else {
1212 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1213 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1214 }
1215 break;
1216
1217 case SO_RCVLOWAT:
1218 v.val = sk->sk_rcvlowat;
1219 break;
1220
1221 case SO_SNDLOWAT:
1222 v.val = 1;
1223 break;
1224
1225 case SO_PASSCRED:
1226 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1227 break;
1228
1229 case SO_PEERCRED:
1230 {
1231 struct ucred peercred;
1232 if (len > sizeof(peercred))
1233 len = sizeof(peercred);
1234 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1235 if (copy_to_user(optval, &peercred, len))
1236 return -EFAULT;
1237 goto lenout;
1238 }
1239
1240 case SO_PEERGROUPS:
1241 {
1242 int ret, n;
1243
1244 if (!sk->sk_peer_cred)
1245 return -ENODATA;
1246
1247 n = sk->sk_peer_cred->group_info->ngroups;
1248 if (len < n * sizeof(gid_t)) {
1249 len = n * sizeof(gid_t);
1250 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1251 }
1252 len = n * sizeof(gid_t);
1253
1254 ret = groups_to_user((gid_t __user *)optval,
1255 sk->sk_peer_cred->group_info);
1256 if (ret)
1257 return ret;
1258 goto lenout;
1259 }
1260
1261 case SO_PEERNAME:
1262 {
1263 char address[128];
1264
1265 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1266 if (lv < 0)
1267 return -ENOTCONN;
1268 if (lv < len)
1269 return -EINVAL;
1270 if (copy_to_user(optval, address, len))
1271 return -EFAULT;
1272 goto lenout;
1273 }
1274
1275 /* Dubious BSD thing... Probably nobody even uses it, but
1276 * the UNIX standard wants it for whatever reason... -DaveM
1277 */
1278 case SO_ACCEPTCONN:
1279 v.val = sk->sk_state == TCP_LISTEN;
1280 break;
1281
1282 case SO_PASSSEC:
1283 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1284 break;
1285
1286 case SO_PEERSEC:
1287 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1288
1289 case SO_MARK:
1290 v.val = sk->sk_mark;
1291 break;
1292
1293 case SO_RXQ_OVFL:
1294 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1295 break;
1296
1297 case SO_WIFI_STATUS:
1298 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1299 break;
1300
1301 case SO_PEEK_OFF:
1302 if (!sock->ops->set_peek_off)
1303 return -EOPNOTSUPP;
1304
1305 v.val = sk->sk_peek_off;
1306 break;
1307 case SO_NOFCS:
1308 v.val = sock_flag(sk, SOCK_NOFCS);
1309 break;
1310
1311 case SO_BINDTODEVICE:
1312 return sock_getbindtodevice(sk, optval, optlen, len);
1313
1314 case SO_GET_FILTER:
1315 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1316 if (len < 0)
1317 return len;
1318
1319 goto lenout;
1320
1321 case SO_LOCK_FILTER:
1322 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1323 break;
1324
1325 case SO_BPF_EXTENSIONS:
1326 v.val = bpf_tell_extensions();
1327 break;
1328
1329 case SO_SELECT_ERR_QUEUE:
1330 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1331 break;
1332
1333 #ifdef CONFIG_NET_RX_BUSY_POLL
1334 case SO_BUSY_POLL:
1335 v.val = sk->sk_ll_usec;
1336 break;
1337 #endif
1338
1339 case SO_MAX_PACING_RATE:
1340 v.val = sk->sk_max_pacing_rate;
1341 break;
1342
1343 case SO_INCOMING_CPU:
1344 v.val = READ_ONCE(sk->sk_incoming_cpu);
1345 break;
1346
1347 case SO_MEMINFO:
1348 {
1349 u32 meminfo[SK_MEMINFO_VARS];
1350
1351 sk_get_meminfo(sk, meminfo);
1352
1353 len = min_t(unsigned int, len, sizeof(meminfo));
1354 if (copy_to_user(optval, &meminfo, len))
1355 return -EFAULT;
1356
1357 goto lenout;
1358 }
1359
1360 #ifdef CONFIG_NET_RX_BUSY_POLL
1361 case SO_INCOMING_NAPI_ID:
1362 v.val = READ_ONCE(sk->sk_napi_id);
1363
1364 /* aggregate non-NAPI IDs down to 0 */
1365 if (v.val < MIN_NAPI_ID)
1366 v.val = 0;
1367
1368 break;
1369 #endif
1370
1371 case SO_COOKIE:
1372 lv = sizeof(u64);
1373 if (len < lv)
1374 return -EINVAL;
1375 v.val64 = sock_gen_cookie(sk);
1376 break;
1377
1378 case SO_ZEROCOPY:
1379 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1380 break;
1381
1382 case SO_TXTIME:
1383 lv = sizeof(v.txtime);
1384 v.txtime.clockid = sk->sk_clockid;
1385 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1386 SOF_TXTIME_DEADLINE_MODE : 0;
1387 v.txtime.flags |= sk->sk_txtime_report_errors ?
1388 SOF_TXTIME_REPORT_ERRORS : 0;
1389 break;
1390
1391 default:
1392 /* We implement the SO_SNDLOWAT etc to not be settable
1393 * (1003.1g 7).
1394 */
1395 return -ENOPROTOOPT;
1396 }
1397
1398 if (len > lv)
1399 len = lv;
1400 if (copy_to_user(optval, &v, len))
1401 return -EFAULT;
1402 lenout:
1403 if (put_user(len, optlen))
1404 return -EFAULT;
1405 return 0;
1406 }
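/*
 * Illustrative userspace sketch (an assumption): SO_PEERCRED fills a
 * struct ucred with the peer's pid/uid/gid as translated by cred_to_ucred()
 * above, typically on a connected AF_UNIX socket.
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *	getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len);
 */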
1407
1408 /*
1409 * Initialize an sk_lock.
1410 *
1411 * (We also register the sk_lock with the lock validator.)
1412 */
1413 static inline void sock_lock_init(struct sock *sk)
1414 {
1415 if (sk->sk_kern_sock)
1416 sock_lock_init_class_and_name(
1417 sk,
1418 af_family_kern_slock_key_strings[sk->sk_family],
1419 af_family_kern_slock_keys + sk->sk_family,
1420 af_family_kern_key_strings[sk->sk_family],
1421 af_family_kern_keys + sk->sk_family);
1422 else
1423 sock_lock_init_class_and_name(
1424 sk,
1425 af_family_slock_key_strings[sk->sk_family],
1426 af_family_slock_keys + sk->sk_family,
1427 af_family_key_strings[sk->sk_family],
1428 af_family_keys + sk->sk_family);
1429 }
1430
1431 /*
1432 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1433  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1434 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1435 */
1436 static void sock_copy(struct sock *nsk, const struct sock *osk)
1437 {
1438 #ifdef CONFIG_SECURITY_NETWORK
1439 void *sptr = nsk->sk_security;
1440 #endif
1441 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1442
1443 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1444 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1445
1446 #ifdef CONFIG_SECURITY_NETWORK
1447 nsk->sk_security = sptr;
1448 security_sk_clone(osk, nsk);
1449 #endif
1450 }
1451
1452 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1453 int family)
1454 {
1455 struct sock *sk;
1456 struct kmem_cache *slab;
1457
1458 slab = prot->slab;
1459 if (slab != NULL) {
1460 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1461 if (!sk)
1462 return sk;
1463 if (priority & __GFP_ZERO)
1464 sk_prot_clear_nulls(sk, prot->obj_size);
1465 } else
1466 sk = kmalloc(prot->obj_size, priority);
1467
1468 if (sk != NULL) {
1469 if (security_sk_alloc(sk, family, priority))
1470 goto out_free;
1471
1472 if (!try_module_get(prot->owner))
1473 goto out_free_sec;
1474 sk_tx_queue_clear(sk);
1475 }
1476
1477 return sk;
1478
1479 out_free_sec:
1480 security_sk_free(sk);
1481 out_free:
1482 if (slab != NULL)
1483 kmem_cache_free(slab, sk);
1484 else
1485 kfree(sk);
1486 return NULL;
1487 }
1488
1489 static void sk_prot_free(struct proto *prot, struct sock *sk)
1490 {
1491 struct kmem_cache *slab;
1492 struct module *owner;
1493
1494 owner = prot->owner;
1495 slab = prot->slab;
1496
1497 cgroup_sk_free(&sk->sk_cgrp_data);
1498 mem_cgroup_sk_free(sk);
1499 security_sk_free(sk);
1500 if (slab != NULL)
1501 kmem_cache_free(slab, sk);
1502 else
1503 kfree(sk);
1504 module_put(owner);
1505 }
1506
1507 /**
1508 * sk_alloc - All socket objects are allocated here
1509 * @net: the applicable net namespace
1510 * @family: protocol family
1511 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1512 * @prot: struct proto associated with this new sock instance
1513 * @kern: is this to be a kernel socket?
1514 */
1515 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1516 struct proto *prot, int kern)
1517 {
1518 struct sock *sk;
1519
1520 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1521 if (sk) {
1522 sk->sk_family = family;
1523 /*
1524 * See comment in struct sock definition to understand
1525 * why we need sk_prot_creator -acme
1526 */
1527 sk->sk_prot = sk->sk_prot_creator = prot;
1528 sk->sk_kern_sock = kern;
1529 sock_lock_init(sk);
1530 sk->sk_net_refcnt = kern ? 0 : 1;
1531 if (likely(sk->sk_net_refcnt)) {
1532 get_net(net);
1533 sock_inuse_add(net, 1);
1534 }
1535
1536 sock_net_set(sk, net);
1537 refcount_set(&sk->sk_wmem_alloc, 1);
1538
1539 mem_cgroup_sk_alloc(sk);
1540 cgroup_sk_alloc(&sk->sk_cgrp_data);
1541 sock_update_classid(&sk->sk_cgrp_data);
1542 sock_update_netprioidx(&sk->sk_cgrp_data);
1543 sk_tx_queue_clear(sk);
1544 }
1545
1546 return sk;
1547 }
1548 EXPORT_SYMBOL(sk_alloc);
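/*
 * Sketch of typical usage from a protocol family's ->create() handler; the
 * family and proto names are hypothetical, sock_init_data() is declared in
 * <net/sock.h>.
 *
 *	sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
 *	if (!sk)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);
 */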
1549
1550 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1551 * grace period. This is the case for UDP sockets and TCP listeners.
1552 */
1553 static void __sk_destruct(struct rcu_head *head)
1554 {
1555 struct sock *sk = container_of(head, struct sock, sk_rcu);
1556 struct sk_filter *filter;
1557
1558 if (sk->sk_destruct)
1559 sk->sk_destruct(sk);
1560
1561 filter = rcu_dereference_check(sk->sk_filter,
1562 refcount_read(&sk->sk_wmem_alloc) == 0);
1563 if (filter) {
1564 sk_filter_uncharge(sk, filter);
1565 RCU_INIT_POINTER(sk->sk_filter, NULL);
1566 }
1567
1568 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1569
1570 if (atomic_read(&sk->sk_omem_alloc))
1571 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1572 __func__, atomic_read(&sk->sk_omem_alloc));
1573
1574 if (sk->sk_frag.page) {
1575 put_page(sk->sk_frag.page);
1576 sk->sk_frag.page = NULL;
1577 }
1578
1579 if (sk->sk_peer_cred)
1580 put_cred(sk->sk_peer_cred);
1581 put_pid(sk->sk_peer_pid);
1582 if (likely(sk->sk_net_refcnt))
1583 put_net(sock_net(sk));
1584 sk_prot_free(sk->sk_prot_creator, sk);
1585 }
1586
1587 void sk_destruct(struct sock *sk)
1588 {
1589 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1590
1591 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1592 reuseport_detach_sock(sk);
1593 use_call_rcu = true;
1594 }
1595
1596 if (use_call_rcu)
1597 call_rcu(&sk->sk_rcu, __sk_destruct);
1598 else
1599 __sk_destruct(&sk->sk_rcu);
1600 }
1601
1602 static void __sk_free(struct sock *sk)
1603 {
1604 if (likely(sk->sk_net_refcnt))
1605 sock_inuse_add(sock_net(sk), -1);
1606
1607 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1608 sock_diag_broadcast_destroy(sk);
1609 else
1610 sk_destruct(sk);
1611 }
1612
1613 void sk_free(struct sock *sk)
1614 {
1615 /*
1616 	 * We subtract one from sk_wmem_alloc, which lets us know whether
1617 	 * some packets are still in some tx queue.
1618 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1619 */
1620 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1621 __sk_free(sk);
1622 }
1623 EXPORT_SYMBOL(sk_free);
1624
1625 static void sk_init_common(struct sock *sk)
1626 {
1627 skb_queue_head_init(&sk->sk_receive_queue);
1628 skb_queue_head_init(&sk->sk_write_queue);
1629 skb_queue_head_init(&sk->sk_error_queue);
1630
1631 rwlock_init(&sk->sk_callback_lock);
1632 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1633 af_rlock_keys + sk->sk_family,
1634 af_family_rlock_key_strings[sk->sk_family]);
1635 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1636 af_wlock_keys + sk->sk_family,
1637 af_family_wlock_key_strings[sk->sk_family]);
1638 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1639 af_elock_keys + sk->sk_family,
1640 af_family_elock_key_strings[sk->sk_family]);
1641 lockdep_set_class_and_name(&sk->sk_callback_lock,
1642 af_callback_keys + sk->sk_family,
1643 af_family_clock_key_strings[sk->sk_family]);
1644 }
1645
1646 /**
1647 * sk_clone_lock - clone a socket, and lock its clone
1648 * @sk: the socket to clone
1649 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1650 *
1651 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1652 */
1653 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1654 {
1655 struct sock *newsk;
1656 bool is_charged = true;
1657
1658 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1659 if (newsk != NULL) {
1660 struct sk_filter *filter;
1661
1662 sock_copy(newsk, sk);
1663
1664 newsk->sk_prot_creator = sk->sk_prot;
1665
1666 /* SANITY */
1667 if (likely(newsk->sk_net_refcnt))
1668 get_net(sock_net(newsk));
1669 sk_node_init(&newsk->sk_node);
1670 sock_lock_init(newsk);
1671 bh_lock_sock(newsk);
1672 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1673 newsk->sk_backlog.len = 0;
1674
1675 atomic_set(&newsk->sk_rmem_alloc, 0);
1676 /*
1677 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1678 */
1679 refcount_set(&newsk->sk_wmem_alloc, 1);
1680 atomic_set(&newsk->sk_omem_alloc, 0);
1681 sk_init_common(newsk);
1682
1683 newsk->sk_dst_cache = NULL;
1684 newsk->sk_dst_pending_confirm = 0;
1685 newsk->sk_wmem_queued = 0;
1686 newsk->sk_forward_alloc = 0;
1687 atomic_set(&newsk->sk_drops, 0);
1688 newsk->sk_send_head = NULL;
1689 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1690 atomic_set(&newsk->sk_zckey, 0);
1691
1692 sock_reset_flag(newsk, SOCK_DONE);
1693
1694 /* sk->sk_memcg will be populated at accept() time */
1695 newsk->sk_memcg = NULL;
1696
1697 cgroup_sk_clone(&newsk->sk_cgrp_data);
1698
1699 rcu_read_lock();
1700 filter = rcu_dereference(sk->sk_filter);
1701 if (filter != NULL)
1702 /* though it's an empty new sock, the charging may fail
1703 			 * if sysctl_optmem_max was changed between the creation of
1704 			 * the original socket and this clone
1705 */
1706 is_charged = sk_filter_charge(newsk, filter);
1707 RCU_INIT_POINTER(newsk->sk_filter, filter);
1708 rcu_read_unlock();
1709
1710 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1711 /* We need to make sure that we don't uncharge the new
1712 * socket if we couldn't charge it in the first place
1713 * as otherwise we uncharge the parent's filter.
1714 */
1715 if (!is_charged)
1716 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1717 sk_free_unlock_clone(newsk);
1718 newsk = NULL;
1719 goto out;
1720 }
1721 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1722
1723 newsk->sk_err = 0;
1724 newsk->sk_err_soft = 0;
1725 newsk->sk_priority = 0;
1726 newsk->sk_incoming_cpu = raw_smp_processor_id();
1727 atomic64_set(&newsk->sk_cookie, 0);
1728 if (likely(newsk->sk_net_refcnt))
1729 sock_inuse_add(sock_net(newsk), 1);
1730
1731 /*
1732 * Before updating sk_refcnt, we must commit prior changes to memory
1733 * (Documentation/RCU/rculist_nulls.txt for details)
1734 */
1735 smp_wmb();
1736 refcount_set(&newsk->sk_refcnt, 2);
1737
1738 /*
1739 * Increment the counter in the same struct proto as the master
1740 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1741 * is the same as sk->sk_prot->socks, as this field was copied
1742 * with memcpy).
1743 *
1744 * This _changes_ the previous behaviour, where
1745 * tcp_create_openreq_child always was incrementing the
1746 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1747 * to be taken into account in all callers. -acme
1748 */
1749 sk_refcnt_debug_inc(newsk);
1750 sk_set_socket(newsk, NULL);
1751 sk_tx_queue_clear(newsk);
1752 newsk->sk_wq = NULL;
1753
1754 if (newsk->sk_prot->sockets_allocated)
1755 sk_sockets_allocated_inc(newsk);
1756
1757 if (sock_needs_netstamp(sk) &&
1758 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1759 net_enable_timestamp();
1760 }
1761 out:
1762 return newsk;
1763 }
1764 EXPORT_SYMBOL_GPL(sk_clone_lock);
1765
1766 void sk_free_unlock_clone(struct sock *sk)
1767 {
1768 	/* It is still a raw copy of the parent, so invalidate the
1769 	 * destructor and do a plain sk_free() */
1770 sk->sk_destruct = NULL;
1771 bh_unlock_sock(sk);
1772 sk_free(sk);
1773 }
1774 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1775
1776 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1777 {
1778 u32 max_segs = 1;
1779
1780 sk_dst_set(sk, dst);
1781 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1782 if (sk->sk_route_caps & NETIF_F_GSO)
1783 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1784 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1785 if (sk_can_gso(sk)) {
1786 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1787 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1788 } else {
1789 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1790 sk->sk_gso_max_size = dst->dev->gso_max_size;
1791 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1792 }
1793 }
1794 sk->sk_gso_max_segs = max_segs;
1795 }
1796 EXPORT_SYMBOL_GPL(sk_setup_caps);
1797
1798 /*
1799 * Simple resource managers for sockets.
1800 */
1801
1802
1803 /*
1804 * Write buffer destructor automatically called from kfree_skb.
1805 */
1806 void sock_wfree(struct sk_buff *skb)
1807 {
1808 struct sock *sk = skb->sk;
1809 unsigned int len = skb->truesize;
1810
1811 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1812 /*
1813 		 * Keep a reference on sk_wmem_alloc; it will be released
1814 		 * after the sk_write_space() call
1815 */
1816 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1817 sk->sk_write_space(sk);
1818 len = 1;
1819 }
1820 /*
1821 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1822 * could not do because of in-flight packets
1823 */
1824 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1825 __sk_free(sk);
1826 }
1827 EXPORT_SYMBOL(sock_wfree);
1828
1829 /* This variant of sock_wfree() is used by TCP,
1830 * since it sets SOCK_USE_WRITE_QUEUE.
1831 */
1832 void __sock_wfree(struct sk_buff *skb)
1833 {
1834 struct sock *sk = skb->sk;
1835
1836 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1837 __sk_free(sk);
1838 }
1839
1840 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1841 {
1842 skb_orphan(skb);
1843 skb->sk = sk;
1844 #ifdef CONFIG_INET
1845 if (unlikely(!sk_fullsock(sk))) {
1846 skb->destructor = sock_edemux;
1847 sock_hold(sk);
1848 return;
1849 }
1850 #endif
1851 skb->destructor = sock_wfree;
1852 skb_set_hash_from_sk(skb, sk);
1853 /*
1854 	 * We used to take a refcount on sk, but the following operation
1855 	 * is enough to guarantee sk_free() won't free this sock until
1856 * all in-flight packets are completed
1857 */
1858 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1859 }
1860 EXPORT_SYMBOL(skb_set_owner_w);
1861
1862 /* This helper is used by netem, as it can hold packets in its
1863 * delay queue. We want to allow the owner socket to send more
1864 * packets, as if they were already TX completed by a typical driver.
1865 * But we also want to keep skb->sk set because some packet schedulers
1866 * rely on it (sch_fq for example).
1867 */
1868 void skb_orphan_partial(struct sk_buff *skb)
1869 {
1870 if (skb_is_tcp_pure_ack(skb))
1871 return;
1872
1873 if (skb->destructor == sock_wfree
1874 #ifdef CONFIG_INET
1875 || skb->destructor == tcp_wfree
1876 #endif
1877 ) {
1878 struct sock *sk = skb->sk;
1879
1880 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1881 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1882 skb->destructor = sock_efree;
1883 }
1884 } else {
1885 skb_orphan(skb);
1886 }
1887 }
1888 EXPORT_SYMBOL(skb_orphan_partial);
1889
1890 /*
1891 * Read buffer destructor automatically called from kfree_skb.
1892 */
1893 void sock_rfree(struct sk_buff *skb)
1894 {
1895 struct sock *sk = skb->sk;
1896 unsigned int len = skb->truesize;
1897
1898 atomic_sub(len, &sk->sk_rmem_alloc);
1899 sk_mem_uncharge(sk, len);
1900 }
1901 EXPORT_SYMBOL(sock_rfree);
1902
1903 /*
1904 * Buffer destructor for skbs that are not used directly in read or write
1905 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1906 */
1907 void sock_efree(struct sk_buff *skb)
1908 {
1909 sock_put(skb->sk);
1910 }
1911 EXPORT_SYMBOL(sock_efree);
1912
1913 kuid_t sock_i_uid(struct sock *sk)
1914 {
1915 kuid_t uid;
1916
1917 read_lock_bh(&sk->sk_callback_lock);
1918 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1919 read_unlock_bh(&sk->sk_callback_lock);
1920 return uid;
1921 }
1922 EXPORT_SYMBOL(sock_i_uid);
1923
1924 unsigned long sock_i_ino(struct sock *sk)
1925 {
1926 unsigned long ino;
1927
1928 read_lock_bh(&sk->sk_callback_lock);
1929 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1930 read_unlock_bh(&sk->sk_callback_lock);
1931 return ino;
1932 }
1933 EXPORT_SYMBOL(sock_i_ino);
1934
1935 /*
1936 * Allocate a skb from the socket's send buffer.
1937 */
1938 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1939 gfp_t priority)
1940 {
1941 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1942 struct sk_buff *skb = alloc_skb(size, priority);
1943 if (skb) {
1944 skb_set_owner_w(skb, sk);
1945 return skb;
1946 }
1947 }
1948 return NULL;
1949 }
1950 EXPORT_SYMBOL(sock_wmalloc);
1951
1952 static void sock_ofree(struct sk_buff *skb)
1953 {
1954 struct sock *sk = skb->sk;
1955
1956 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1957 }
1958
1959 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1960 gfp_t priority)
1961 {
1962 struct sk_buff *skb;
1963
1964 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1965 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1966 sysctl_optmem_max)
1967 return NULL;
1968
1969 skb = alloc_skb(size, priority);
1970 if (!skb)
1971 return NULL;
1972
1973 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1974 skb->sk = sk;
1975 skb->destructor = sock_ofree;
1976 return skb;
1977 }
1978
1979 /*
1980 * Allocate a memory block from the socket's option memory buffer.
1981 */
1982 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1983 {
1984 if ((unsigned int)size <= sysctl_optmem_max &&
1985 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1986 void *mem;
1987 /* First do the add, to avoid the race if kmalloc
1988 * might sleep.
1989 */
1990 atomic_add(size, &sk->sk_omem_alloc);
1991 mem = kmalloc(size, priority);
1992 if (mem)
1993 return mem;
1994 atomic_sub(size, &sk->sk_omem_alloc);
1995 }
1996 return NULL;
1997 }
1998 EXPORT_SYMBOL(sock_kmalloc);
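/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * protocol-private setsockopt() handler would typically pair sock_kmalloc()
 * with sock_kfree_s(), passing the same size both times so that
 * sk_omem_alloc stays balanced:
 *
 *	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	if (copy_from_user(buf, optval, optlen)) {
 *		sock_kfree_s(sk, buf, optlen);
 *		return -EFAULT;
 *	}
 *	... consume buf ...
 *	sock_kfree_s(sk, buf, optlen);
 */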
1999
2000 /* Free an option memory block. Note, we actually want the inline
2001 * here as this allows gcc to detect the nullify and fold away the
2002 * condition entirely.
2003 */
2004 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2005 const bool nullify)
2006 {
2007 if (WARN_ON_ONCE(!mem))
2008 return;
2009 if (nullify)
2010 kzfree(mem);
2011 else
2012 kfree(mem);
2013 atomic_sub(size, &sk->sk_omem_alloc);
2014 }
2015
2016 void sock_kfree_s(struct sock *sk, void *mem, int size)
2017 {
2018 __sock_kfree_s(sk, mem, size, false);
2019 }
2020 EXPORT_SYMBOL(sock_kfree_s);
2021
2022 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2023 {
2024 __sock_kfree_s(sk, mem, size, true);
2025 }
2026 EXPORT_SYMBOL(sock_kzfree_s);
2027
2028 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2029    I think these locks should be removed for datagram sockets.
2030 */
2031 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2032 {
2033 DEFINE_WAIT(wait);
2034
2035 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2036 for (;;) {
2037 if (!timeo)
2038 break;
2039 if (signal_pending(current))
2040 break;
2041 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2042 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2043 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2044 break;
2045 if (sk->sk_shutdown & SEND_SHUTDOWN)
2046 break;
2047 if (sk->sk_err)
2048 break;
2049 timeo = schedule_timeout(timeo);
2050 }
2051 finish_wait(sk_sleep(sk), &wait);
2052 return timeo;
2053 }
2054
2055
2056 /*
2057 * Generic send/receive buffer handlers
2058 */
2059
2060 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2061 unsigned long data_len, int noblock,
2062 int *errcode, int max_page_order)
2063 {
2064 struct sk_buff *skb;
2065 long timeo;
2066 int err;
2067
2068 timeo = sock_sndtimeo(sk, noblock);
2069 for (;;) {
2070 err = sock_error(sk);
2071 if (err != 0)
2072 goto failure;
2073
2074 err = -EPIPE;
2075 if (sk->sk_shutdown & SEND_SHUTDOWN)
2076 goto failure;
2077
2078 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2079 break;
2080
2081 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2082 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2083 err = -EAGAIN;
2084 if (!timeo)
2085 goto failure;
2086 if (signal_pending(current))
2087 goto interrupted;
2088 timeo = sock_wait_for_wmem(sk, timeo);
2089 }
2090 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2091 errcode, sk->sk_allocation);
2092 if (skb)
2093 skb_set_owner_w(skb, sk);
2094 return skb;
2095
2096 interrupted:
2097 err = sock_intr_errno(timeo);
2098 failure:
2099 *errcode = err;
2100 return NULL;
2101 }
2102 EXPORT_SYMBOL(sock_alloc_send_pskb);
2103
2104 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2105 int noblock, int *errcode)
2106 {
2107 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2108 }
2109 EXPORT_SYMBOL(sock_alloc_send_skb);
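/*
 * Illustrative sketch (hypothetical datagram sendmsg path, names assumed):
 * callers charge the skb to the socket's send buffer and block or not
 * according to MSG_DONTWAIT:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 */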
2110
2111 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2112 struct sockcm_cookie *sockc)
2113 {
2114 u32 tsflags;
2115
2116 switch (cmsg->cmsg_type) {
2117 case SO_MARK:
2118 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2119 return -EPERM;
2120 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2121 return -EINVAL;
2122 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2123 break;
2124 case SO_TIMESTAMPING:
2125 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2126 return -EINVAL;
2127
2128 tsflags = *(u32 *)CMSG_DATA(cmsg);
2129 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2130 return -EINVAL;
2131
2132 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2133 sockc->tsflags |= tsflags;
2134 break;
2135 case SCM_TXTIME:
2136 if (!sock_flag(sk, SOCK_TXTIME))
2137 return -EINVAL;
2138 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2139 return -EINVAL;
2140 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2141 break;
2142 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2143 case SCM_RIGHTS:
2144 case SCM_CREDENTIALS:
2145 break;
2146 default:
2147 return -EINVAL;
2148 }
2149 return 0;
2150 }
2151 EXPORT_SYMBOL(__sock_cmsg_send);
2152
2153 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2154 struct sockcm_cookie *sockc)
2155 {
2156 struct cmsghdr *cmsg;
2157 int ret;
2158
2159 for_each_cmsghdr(cmsg, msg) {
2160 if (!CMSG_OK(msg, cmsg))
2161 return -EINVAL;
2162 if (cmsg->cmsg_level != SOL_SOCKET)
2163 continue;
2164 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2165 if (ret)
2166 return ret;
2167 }
2168 return 0;
2169 }
2170 EXPORT_SYMBOL(sock_cmsg_send);
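/*
 * Illustrative sketch (hypothetical sendmsg handler): the caller seeds a
 * sockcm_cookie with the socket defaults and lets sock_cmsg_send() override
 * them from SOL_SOCKET control messages:
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */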
2171
2172 static void sk_enter_memory_pressure(struct sock *sk)
2173 {
2174 if (!sk->sk_prot->enter_memory_pressure)
2175 return;
2176
2177 sk->sk_prot->enter_memory_pressure(sk);
2178 }
2179
2180 static void sk_leave_memory_pressure(struct sock *sk)
2181 {
2182 if (sk->sk_prot->leave_memory_pressure) {
2183 sk->sk_prot->leave_memory_pressure(sk);
2184 } else {
2185 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2186
2187 if (memory_pressure && READ_ONCE(*memory_pressure))
2188 WRITE_ONCE(*memory_pressure, 0);
2189 }
2190 }
2191
2192 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2193
2194 /**
2195 * skb_page_frag_refill - check that a page_frag contains enough room
2196 * @sz: minimum size of the fragment we want to get
2197 * @pfrag: pointer to page_frag
2198 * @gfp: priority for memory allocation
2199 *
2200 * Note: While this allocator tries to use high order pages, there is
2201 * no guarantee that allocations succeed. Therefore, @sz MUST be
2202  * less than or equal to PAGE_SIZE.
2203 */
2204 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2205 {
2206 if (pfrag->page) {
2207 if (page_ref_count(pfrag->page) == 1) {
2208 pfrag->offset = 0;
2209 return true;
2210 }
2211 if (pfrag->offset + sz <= pfrag->size)
2212 return true;
2213 put_page(pfrag->page);
2214 }
2215
2216 pfrag->offset = 0;
2217 if (SKB_FRAG_PAGE_ORDER) {
2218 /* Avoid direct reclaim but allow kswapd to wake */
2219 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2220 __GFP_COMP | __GFP_NOWARN |
2221 __GFP_NORETRY,
2222 SKB_FRAG_PAGE_ORDER);
2223 if (likely(pfrag->page)) {
2224 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2225 return true;
2226 }
2227 }
2228 pfrag->page = alloc_page(gfp);
2229 if (likely(pfrag->page)) {
2230 pfrag->size = PAGE_SIZE;
2231 return true;
2232 }
2233 return false;
2234 }
2235 EXPORT_SYMBOL(skb_page_frag_refill);
2236
2237 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2238 {
2239 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2240 return true;
2241
2242 sk_enter_memory_pressure(sk);
2243 sk_stream_moderate_sndbuf(sk);
2244 return false;
2245 }
2246 EXPORT_SYMBOL(sk_page_frag_refill);
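/*
 * Illustrative sketch (simplified from a typical stream sendmsg loop, names
 * assumed): the per-socket/per-task page_frag is refilled on demand and the
 * offset advanced by however much was appended:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy 'copy' bytes into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */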
2247
2248 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2249 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2250 int first_coalesce)
2251 {
2252 int sg_curr = *sg_curr_index, use = 0, rc = 0;
2253 unsigned int size = *sg_curr_size;
2254 struct page_frag *pfrag;
2255 struct scatterlist *sge;
2256
2257 len -= size;
2258 pfrag = sk_page_frag(sk);
2259
2260 while (len > 0) {
2261 unsigned int orig_offset;
2262
2263 if (!sk_page_frag_refill(sk, pfrag)) {
2264 rc = -ENOMEM;
2265 goto out;
2266 }
2267
2268 use = min_t(int, len, pfrag->size - pfrag->offset);
2269
2270 if (!sk_wmem_schedule(sk, use)) {
2271 rc = -ENOMEM;
2272 goto out;
2273 }
2274
2275 sk_mem_charge(sk, use);
2276 size += use;
2277 orig_offset = pfrag->offset;
2278 pfrag->offset += use;
2279
2280 sge = sg + sg_curr - 1;
2281 if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2282 sge->offset + sge->length == orig_offset) {
2283 sge->length += use;
2284 } else {
2285 sge = sg + sg_curr;
2286 sg_unmark_end(sge);
2287 sg_set_page(sge, pfrag->page, use, orig_offset);
2288 get_page(pfrag->page);
2289 sg_curr++;
2290
2291 if (sg_curr == MAX_SKB_FRAGS)
2292 sg_curr = 0;
2293
2294 if (sg_curr == sg_start) {
2295 rc = -ENOSPC;
2296 break;
2297 }
2298 }
2299
2300 len -= use;
2301 }
2302 out:
2303 *sg_curr_size = size;
2304 *sg_curr_index = sg_curr;
2305 return rc;
2306 }
2307 EXPORT_SYMBOL(sk_alloc_sg);
2308
2309 static void __lock_sock(struct sock *sk)
2310 __releases(&sk->sk_lock.slock)
2311 __acquires(&sk->sk_lock.slock)
2312 {
2313 DEFINE_WAIT(wait);
2314
2315 for (;;) {
2316 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2317 TASK_UNINTERRUPTIBLE);
2318 spin_unlock_bh(&sk->sk_lock.slock);
2319 schedule();
2320 spin_lock_bh(&sk->sk_lock.slock);
2321 if (!sock_owned_by_user(sk))
2322 break;
2323 }
2324 finish_wait(&sk->sk_lock.wq, &wait);
2325 }
2326
2327 void __release_sock(struct sock *sk)
2328 __releases(&sk->sk_lock.slock)
2329 __acquires(&sk->sk_lock.slock)
2330 {
2331 struct sk_buff *skb, *next;
2332
2333 while ((skb = sk->sk_backlog.head) != NULL) {
2334 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2335
2336 spin_unlock_bh(&sk->sk_lock.slock);
2337
2338 do {
2339 next = skb->next;
2340 prefetch(next);
2341 WARN_ON_ONCE(skb_dst_is_noref(skb));
2342 skb->next = NULL;
2343 sk_backlog_rcv(sk, skb);
2344
2345 cond_resched();
2346
2347 skb = next;
2348 } while (skb != NULL);
2349
2350 spin_lock_bh(&sk->sk_lock.slock);
2351 }
2352
2353 /*
2354  * Doing the zeroing here guarantees we cannot loop forever
2355 * while a wild producer attempts to flood us.
2356 */
2357 sk->sk_backlog.len = 0;
2358 }
2359
2360 void __sk_flush_backlog(struct sock *sk)
2361 {
2362 spin_lock_bh(&sk->sk_lock.slock);
2363 __release_sock(sk);
2364 spin_unlock_bh(&sk->sk_lock.slock);
2365 }
2366
2367 /**
2368 * sk_wait_data - wait for data to arrive at sk_receive_queue
2369 * @sk: sock to wait on
2370 * @timeo: for how long
2371 * @skb: last skb seen on sk_receive_queue
2372 *
2373 * Socket state, including sk->sk_err, is now changed only under the socket
2374 * lock, hence we may omit checks after joining the wait queue.
2375 * We check the receive queue before schedule() only as an optimization;
2376 * it is very likely that release_sock() added new data.
2377 */
2378 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2379 {
2380 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2381 int rc;
2382
2383 add_wait_queue(sk_sleep(sk), &wait);
2384 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2385 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2386 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2387 remove_wait_queue(sk_sleep(sk), &wait);
2388 return rc;
2389 }
2390 EXPORT_SYMBOL(sk_wait_data);
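/*
 * Illustrative sketch (hypothetical recvmsg path): sk_wait_data() is called
 * with the socket lock held; it drops the lock while sleeping and retakes it
 * before returning:
 *
 *	lock_sock(sk);
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (sk->sk_err || !timeo)
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	...
 *	release_sock(sk);
 */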
2391
2392 /**
2393 * __sk_mem_raise_allocated - increase memory_allocated
2394 * @sk: socket
2395 * @size: memory size to allocate
2396 * @amt: pages to allocate
2397 * @kind: allocation type
2398 *
2399 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2400 */
2401 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2402 {
2403 struct proto *prot = sk->sk_prot;
2404 long allocated = sk_memory_allocated_add(sk, amt);
2405 bool charged = true;
2406
2407 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2408 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2409 goto suppress_allocation;
2410
2411 /* Under limit. */
2412 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2413 sk_leave_memory_pressure(sk);
2414 return 1;
2415 }
2416
2417 /* Under pressure. */
2418 if (allocated > sk_prot_mem_limits(sk, 1))
2419 sk_enter_memory_pressure(sk);
2420
2421 /* Over hard limit. */
2422 if (allocated > sk_prot_mem_limits(sk, 2))
2423 goto suppress_allocation;
2424
2425 /* guarantee minimum buffer size under pressure */
2426 if (kind == SK_MEM_RECV) {
2427 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2428 return 1;
2429
2430 } else { /* SK_MEM_SEND */
2431 int wmem0 = sk_get_wmem0(sk, prot);
2432
2433 if (sk->sk_type == SOCK_STREAM) {
2434 if (sk->sk_wmem_queued < wmem0)
2435 return 1;
2436 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2437 return 1;
2438 }
2439 }
2440
2441 if (sk_has_memory_pressure(sk)) {
2442 u64 alloc;
2443
2444 if (!sk_under_memory_pressure(sk))
2445 return 1;
2446 alloc = sk_sockets_allocated_read_positive(sk);
2447 if (sk_prot_mem_limits(sk, 2) > alloc *
2448 sk_mem_pages(sk->sk_wmem_queued +
2449 atomic_read(&sk->sk_rmem_alloc) +
2450 sk->sk_forward_alloc))
2451 return 1;
2452 }
2453
2454 suppress_allocation:
2455
2456 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2457 sk_stream_moderate_sndbuf(sk);
2458
2459 /* Fail only if socket is _under_ its sndbuf.
2460  * In this case we cannot block, so we have to fail.
2461 */
2462 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2463 return 1;
2464 }
2465
2466 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2467 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2468
2469 sk_memory_allocated_sub(sk, amt);
2470
2471 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2472 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2473
2474 return 0;
2475 }
2476 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2477
2478 /**
2479 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2480 * @sk: socket
2481 * @size: memory size to allocate
2482 * @kind: allocation type
2483 *
2484 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2485 * rmem allocation. This function assumes that protocols which have
2486 * memory_pressure use sk_wmem_queued as write buffer accounting.
2487 */
2488 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2489 {
2490 int ret, amt = sk_mem_pages(size);
2491
2492 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2493 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2494 if (!ret)
2495 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2496 return ret;
2497 }
2498 EXPORT_SYMBOL(__sk_mem_schedule);
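/*
 * Illustrative sketch: protocols normally go through the inline wrappers in
 * include/net/sock.h, which only call __sk_mem_schedule() once
 * sk_forward_alloc runs out, and give the charge back on free:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		goto drop;			   ... over the memory limits
 *	sk_mem_charge(sk, skb->truesize);	   ... consumes forward_alloc
 *	...
 *	sk_mem_uncharge(sk, skb->truesize);	   ... usually from a destructor
 */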
2499
2500 /**
2501 * __sk_mem_reduce_allocated - reclaim memory_allocated
2502 * @sk: socket
2503 * @amount: number of quanta
2504 *
2505 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2506 */
2507 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2508 {
2509 sk_memory_allocated_sub(sk, amount);
2510
2511 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2512 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2513
2514 if (sk_under_memory_pressure(sk) &&
2515 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2516 sk_leave_memory_pressure(sk);
2517 }
2518 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2519
2520 /**
2521 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2522 * @sk: socket
2523 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2524 */
2525 void __sk_mem_reclaim(struct sock *sk, int amount)
2526 {
2527 amount >>= SK_MEM_QUANTUM_SHIFT;
2528 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2529 __sk_mem_reduce_allocated(sk, amount);
2530 }
2531 EXPORT_SYMBOL(__sk_mem_reclaim);
2532
2533 int sk_set_peek_off(struct sock *sk, int val)
2534 {
2535 sk->sk_peek_off = val;
2536 return 0;
2537 }
2538 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2539
2540 /*
2541 * Set of default routines for initialising struct proto_ops when
2542 * the protocol does not support a particular function. In certain
2543 * cases where it makes no sense for a protocol to have a "do nothing"
2544 * function, some default processing is provided.
2545 */
2546
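/*
 * Illustrative sketch (hypothetical protocol family, names assumed): a
 * datagram-only family would point the operations it does not support at
 * the sock_no_*() stubs below:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= example_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */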
2547 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2548 {
2549 return -EOPNOTSUPP;
2550 }
2551 EXPORT_SYMBOL(sock_no_bind);
2552
2553 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2554 int len, int flags)
2555 {
2556 return -EOPNOTSUPP;
2557 }
2558 EXPORT_SYMBOL(sock_no_connect);
2559
2560 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2561 {
2562 return -EOPNOTSUPP;
2563 }
2564 EXPORT_SYMBOL(sock_no_socketpair);
2565
2566 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2567 bool kern)
2568 {
2569 return -EOPNOTSUPP;
2570 }
2571 EXPORT_SYMBOL(sock_no_accept);
2572
2573 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2574 int peer)
2575 {
2576 return -EOPNOTSUPP;
2577 }
2578 EXPORT_SYMBOL(sock_no_getname);
2579
2580 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2581 {
2582 return -EOPNOTSUPP;
2583 }
2584 EXPORT_SYMBOL(sock_no_ioctl);
2585
2586 int sock_no_listen(struct socket *sock, int backlog)
2587 {
2588 return -EOPNOTSUPP;
2589 }
2590 EXPORT_SYMBOL(sock_no_listen);
2591
2592 int sock_no_shutdown(struct socket *sock, int how)
2593 {
2594 return -EOPNOTSUPP;
2595 }
2596 EXPORT_SYMBOL(sock_no_shutdown);
2597
2598 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2599 char __user *optval, unsigned int optlen)
2600 {
2601 return -EOPNOTSUPP;
2602 }
2603 EXPORT_SYMBOL(sock_no_setsockopt);
2604
2605 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2606 char __user *optval, int __user *optlen)
2607 {
2608 return -EOPNOTSUPP;
2609 }
2610 EXPORT_SYMBOL(sock_no_getsockopt);
2611
2612 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2613 {
2614 return -EOPNOTSUPP;
2615 }
2616 EXPORT_SYMBOL(sock_no_sendmsg);
2617
2618 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2619 {
2620 return -EOPNOTSUPP;
2621 }
2622 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2623
2624 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2625 int flags)
2626 {
2627 return -EOPNOTSUPP;
2628 }
2629 EXPORT_SYMBOL(sock_no_recvmsg);
2630
2631 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2632 {
2633 /* Mirror missing mmap method error code */
2634 return -ENODEV;
2635 }
2636 EXPORT_SYMBOL(sock_no_mmap);
2637
2638 /*
2639 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2640 * various sock-based usage counts.
2641 */
2642 void __receive_sock(struct file *file)
2643 {
2644 struct socket *sock;
2645 int error;
2646
2647 /*
2648 * The resulting value of "error" is ignored here since we only
2649 * need to take action when the file is a socket and testing
2650 * "sock" for NULL is sufficient.
2651 */
2652 sock = sock_from_file(file, &error);
2653 if (sock) {
2654 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2655 sock_update_classid(&sock->sk->sk_cgrp_data);
2656 }
2657 }
2658
2659 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2660 {
2661 ssize_t res;
2662 struct msghdr msg = {.msg_flags = flags};
2663 struct kvec iov;
2664 char *kaddr = kmap(page);
2665 iov.iov_base = kaddr + offset;
2666 iov.iov_len = size;
2667 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2668 kunmap(page);
2669 return res;
2670 }
2671 EXPORT_SYMBOL(sock_no_sendpage);
2672
2673 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2674 int offset, size_t size, int flags)
2675 {
2676 ssize_t res;
2677 struct msghdr msg = {.msg_flags = flags};
2678 struct kvec iov;
2679 char *kaddr = kmap(page);
2680
2681 iov.iov_base = kaddr + offset;
2682 iov.iov_len = size;
2683 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2684 kunmap(page);
2685 return res;
2686 }
2687 EXPORT_SYMBOL(sock_no_sendpage_locked);
2688
2689 /*
2690 * Default Socket Callbacks
2691 */
2692
2693 static void sock_def_wakeup(struct sock *sk)
2694 {
2695 struct socket_wq *wq;
2696
2697 rcu_read_lock();
2698 wq = rcu_dereference(sk->sk_wq);
2699 if (skwq_has_sleeper(wq))
2700 wake_up_interruptible_all(&wq->wait);
2701 rcu_read_unlock();
2702 }
2703
2704 static void sock_def_error_report(struct sock *sk)
2705 {
2706 struct socket_wq *wq;
2707
2708 rcu_read_lock();
2709 wq = rcu_dereference(sk->sk_wq);
2710 if (skwq_has_sleeper(wq))
2711 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2712 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2713 rcu_read_unlock();
2714 }
2715
2716 static void sock_def_readable(struct sock *sk)
2717 {
2718 struct socket_wq *wq;
2719
2720 rcu_read_lock();
2721 wq = rcu_dereference(sk->sk_wq);
2722 if (skwq_has_sleeper(wq))
2723 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2724 EPOLLRDNORM | EPOLLRDBAND);
2725 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2726 rcu_read_unlock();
2727 }
2728
2729 static void sock_def_write_space(struct sock *sk)
2730 {
2731 struct socket_wq *wq;
2732
2733 rcu_read_lock();
2734
2735 /* Do not wake up a writer until he can make "significant"
2736 * progress. --DaveM
2737 */
2738 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2739 wq = rcu_dereference(sk->sk_wq);
2740 if (skwq_has_sleeper(wq))
2741 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2742 EPOLLWRNORM | EPOLLWRBAND);
2743
2744 /* Should agree with poll, otherwise some programs break */
2745 if (sock_writeable(sk))
2746 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2747 }
2748
2749 rcu_read_unlock();
2750 }
2751
2752 static void sock_def_destruct(struct sock *sk)
2753 {
2754 }
2755
2756 void sk_send_sigurg(struct sock *sk)
2757 {
2758 if (sk->sk_socket && sk->sk_socket->file)
2759 if (send_sigurg(&sk->sk_socket->file->f_owner))
2760 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2761 }
2762 EXPORT_SYMBOL(sk_send_sigurg);
2763
2764 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2765 unsigned long expires)
2766 {
2767 if (!mod_timer(timer, expires))
2768 sock_hold(sk);
2769 }
2770 EXPORT_SYMBOL(sk_reset_timer);
2771
2772 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2773 {
2774 if (del_timer(timer))
2775 __sock_put(sk);
2776 }
2777 EXPORT_SYMBOL(sk_stop_timer);
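/*
 * Illustrative sketch: sk_reset_timer() takes a reference on the socket when
 * it arms a timer that was not already pending, so the matching drop happens
 * either in the expiry handler or via sk_stop_timer():
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	... in the timer callback, once the work is done:
 *	sock_put(sk);
 *	... or, to cancel before expiry:
 *	sk_stop_timer(sk, &sk->sk_timer);
 */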
2778
2779 void sock_init_data(struct socket *sock, struct sock *sk)
2780 {
2781 sk_init_common(sk);
2782 sk->sk_send_head = NULL;
2783
2784 timer_setup(&sk->sk_timer, NULL, 0);
2785
2786 sk->sk_allocation = GFP_KERNEL;
2787 sk->sk_rcvbuf = sysctl_rmem_default;
2788 sk->sk_sndbuf = sysctl_wmem_default;
2789 sk->sk_state = TCP_CLOSE;
2790 sk_set_socket(sk, sock);
2791
2792 sock_set_flag(sk, SOCK_ZAPPED);
2793
2794 if (sock) {
2795 sk->sk_type = sock->type;
2796 sk->sk_wq = sock->wq;
2797 sock->sk = sk;
2798 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2799 } else {
2800 sk->sk_wq = NULL;
2801 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2802 }
2803
2804 rwlock_init(&sk->sk_callback_lock);
2805 if (sk->sk_kern_sock)
2806 lockdep_set_class_and_name(
2807 &sk->sk_callback_lock,
2808 af_kern_callback_keys + sk->sk_family,
2809 af_family_kern_clock_key_strings[sk->sk_family]);
2810 else
2811 lockdep_set_class_and_name(
2812 &sk->sk_callback_lock,
2813 af_callback_keys + sk->sk_family,
2814 af_family_clock_key_strings[sk->sk_family]);
2815
2816 sk->sk_state_change = sock_def_wakeup;
2817 sk->sk_data_ready = sock_def_readable;
2818 sk->sk_write_space = sock_def_write_space;
2819 sk->sk_error_report = sock_def_error_report;
2820 sk->sk_destruct = sock_def_destruct;
2821
2822 sk->sk_frag.page = NULL;
2823 sk->sk_frag.offset = 0;
2824 sk->sk_peek_off = -1;
2825
2826 sk->sk_peer_pid = NULL;
2827 sk->sk_peer_cred = NULL;
2828 sk->sk_write_pending = 0;
2829 sk->sk_rcvlowat = 1;
2830 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2831 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2832
2833 sk->sk_stamp = SK_DEFAULT_STAMP;
2834 #if BITS_PER_LONG==32
2835 seqlock_init(&sk->sk_stamp_seq);
2836 #endif
2837 atomic_set(&sk->sk_zckey, 0);
2838
2839 #ifdef CONFIG_NET_RX_BUSY_POLL
2840 sk->sk_napi_id = 0;
2841 sk->sk_ll_usec = sysctl_net_busy_read;
2842 #endif
2843
2844 sk->sk_max_pacing_rate = ~0U;
2845 sk->sk_pacing_rate = ~0U;
2846 sk->sk_pacing_shift = 10;
2847 sk->sk_incoming_cpu = -1;
2848
2849 sk_rx_queue_clear(sk);
2850 /*
2851 * Before updating sk_refcnt, we must commit prior changes to memory
2852 * (Documentation/RCU/rculist_nulls.txt for details)
2853 */
2854 smp_wmb();
2855 refcount_set(&sk->sk_refcnt, 1);
2856 atomic_set(&sk->sk_drops, 0);
2857 }
2858 EXPORT_SYMBOL(sock_init_data);
2859
2860 void lock_sock_nested(struct sock *sk, int subclass)
2861 {
2862 might_sleep();
2863 spin_lock_bh(&sk->sk_lock.slock);
2864 if (sk->sk_lock.owned)
2865 __lock_sock(sk);
2866 sk->sk_lock.owned = 1;
2867 spin_unlock(&sk->sk_lock.slock);
2868 /*
2869 * The sk_lock has mutex_lock() semantics here:
2870 */
2871 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2872 local_bh_enable();
2873 }
2874 EXPORT_SYMBOL(lock_sock_nested);
2875
2876 void release_sock(struct sock *sk)
2877 {
2878 spin_lock_bh(&sk->sk_lock.slock);
2879 if (sk->sk_backlog.tail)
2880 __release_sock(sk);
2881
2882 /* Warning : release_cb() might need to release sk ownership,
2883  * i.e. call sock_release_ownership(sk) before us.
2884 */
2885 if (sk->sk_prot->release_cb)
2886 sk->sk_prot->release_cb(sk);
2887
2888 sock_release_ownership(sk);
2889 if (waitqueue_active(&sk->sk_lock.wq))
2890 wake_up(&sk->sk_lock.wq);
2891 spin_unlock_bh(&sk->sk_lock.slock);
2892 }
2893 EXPORT_SYMBOL(release_sock);
2894
2895 /**
2896 * lock_sock_fast - fast version of lock_sock
2897 * @sk: socket
2898 *
2899 * This version should be used for very small sections, where the process
2900 * won't block. Returns false if the fast path is taken:
2901 *
2902 * sk_lock.slock locked, owned = 0, BH disabled
2903 *
2904 * Returns true if the slow path is taken:
2905 *
2906 * sk_lock.slock unlocked, owned = 1, BH enabled
2907 */
2908 bool lock_sock_fast(struct sock *sk)
2909 {
2910 might_sleep();
2911 spin_lock_bh(&sk->sk_lock.slock);
2912
2913 if (!sk->sk_lock.owned)
2914 /*
2915 * Note : We must disable BH
2916 */
2917 return false;
2918
2919 __lock_sock(sk);
2920 sk->sk_lock.owned = 1;
2921 spin_unlock(&sk->sk_lock.slock);
2922 /*
2923 * The sk_lock has mutex_lock() semantics here:
2924 */
2925 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2926 local_bh_enable();
2927 return true;
2928 }
2929 EXPORT_SYMBOL(lock_sock_fast);
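/*
 * Illustrative sketch: the return value must be handed back to
 * unlock_sock_fast() so it knows whether the slow (owned) or fast
 * (spinlock-only) path was taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short, non-blocking critical section ...
 *
 *	unlock_sock_fast(sk, slow);
 */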
2930
2931 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2932 {
2933 struct timeval tv;
2934
2935 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2936 tv = ktime_to_timeval(sock_read_timestamp(sk));
2937 if (tv.tv_sec == -1)
2938 return -ENOENT;
2939 if (tv.tv_sec == 0) {
2940 ktime_t kt = ktime_get_real();
2941 sock_write_timestamp(sk, kt);
2942 tv = ktime_to_timeval(kt);
2943 }
2944 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2945 }
2946 EXPORT_SYMBOL(sock_get_timestamp);
2947
2948 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2949 {
2950 struct timespec ts;
2951
2952 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2953 ts = ktime_to_timespec(sock_read_timestamp(sk));
2954 if (ts.tv_sec == -1)
2955 return -ENOENT;
2956 if (ts.tv_sec == 0) {
2957 ktime_t kt = ktime_get_real();
2958 sock_write_timestamp(sk, kt);
2959 ts = ktime_to_timespec(kt);
2960 }
2961 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2962 }
2963 EXPORT_SYMBOL(sock_get_timestampns);
2964
2965 void sock_enable_timestamp(struct sock *sk, int flag)
2966 {
2967 if (!sock_flag(sk, flag)) {
2968 unsigned long previous_flags = sk->sk_flags;
2969
2970 sock_set_flag(sk, flag);
2971 /*
2972 * we just set one of the two flags which require net
2973 * time stamping, but time stamping might have been on
2974 * already because of the other one
2975 */
2976 if (sock_needs_netstamp(sk) &&
2977 !(previous_flags & SK_FLAGS_TIMESTAMP))
2978 net_enable_timestamp();
2979 }
2980 }
2981
2982 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2983 int level, int type)
2984 {
2985 struct sock_exterr_skb *serr;
2986 struct sk_buff *skb;
2987 int copied, err;
2988
2989 err = -EAGAIN;
2990 skb = sock_dequeue_err_skb(sk);
2991 if (skb == NULL)
2992 goto out;
2993
2994 copied = skb->len;
2995 if (copied > len) {
2996 msg->msg_flags |= MSG_TRUNC;
2997 copied = len;
2998 }
2999 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3000 if (err)
3001 goto out_free_skb;
3002
3003 sock_recv_timestamp(msg, sk, skb);
3004
3005 serr = SKB_EXT_ERR(skb);
3006 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3007
3008 msg->msg_flags |= MSG_ERRQUEUE;
3009 err = copied;
3010
3011 out_free_skb:
3012 kfree_skb(skb);
3013 out:
3014 return err;
3015 }
3016 EXPORT_SYMBOL(sock_recv_errqueue);
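/*
 * Illustrative sketch (hypothetical recvmsg handler; SOL_EXAMPLE and
 * EXAMPLE_ERRQUEUE are placeholders for the protocol's own cmsg level/type):
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_EXAMPLE, EXAMPLE_ERRQUEUE);
 */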
3017
3018 /*
3019 * Get a socket option on a socket.
3020 *
3021 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3022 * asynchronous errors should be reported by getsockopt. We assume
3023 * this means if you specify SO_ERROR (otherwise what's the point of it).
3024 */
3025 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3026 char __user *optval, int __user *optlen)
3027 {
3028 struct sock *sk = sock->sk;
3029
3030 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3031 }
3032 EXPORT_SYMBOL(sock_common_getsockopt);
3033
3034 #ifdef CONFIG_COMPAT
3035 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3036 char __user *optval, int __user *optlen)
3037 {
3038 struct sock *sk = sock->sk;
3039
3040 if (sk->sk_prot->compat_getsockopt != NULL)
3041 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3042 optval, optlen);
3043 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3044 }
3045 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3046 #endif
3047
3048 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3049 int flags)
3050 {
3051 struct sock *sk = sock->sk;
3052 int addr_len = 0;
3053 int err;
3054
3055 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3056 flags & ~MSG_DONTWAIT, &addr_len);
3057 if (err >= 0)
3058 msg->msg_namelen = addr_len;
3059 return err;
3060 }
3061 EXPORT_SYMBOL(sock_common_recvmsg);
3062
3063 /*
3064 * Set socket options on an inet socket.
3065 */
3066 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3067 char __user *optval, unsigned int optlen)
3068 {
3069 struct sock *sk = sock->sk;
3070
3071 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3072 }
3073 EXPORT_SYMBOL(sock_common_setsockopt);
3074
3075 #ifdef CONFIG_COMPAT
3076 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3077 char __user *optval, unsigned int optlen)
3078 {
3079 struct sock *sk = sock->sk;
3080
3081 if (sk->sk_prot->compat_setsockopt != NULL)
3082 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3083 optval, optlen);
3084 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3085 }
3086 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3087 #endif
3088
3089 void sk_common_release(struct sock *sk)
3090 {
3091 if (sk->sk_prot->destroy)
3092 sk->sk_prot->destroy(sk);
3093
3094 /*
3095 * Observation: when sk_common_release is called, processes have
3096 * no access to the socket, but the network stack still does.
3097 * Step one, detach it from networking:
3098 *
3099 * A. Remove from hash tables.
3100 */
3101
3102 sk->sk_prot->unhash(sk);
3103
3104 /*
3105 * At this point the socket cannot receive new packets, but it is possible
3106 * that some packets are still in flight because another CPU ran the receive
3107 * path and did its hash table lookup before we unhashed the socket. Those
3108 * packets will reach the receive queue and be purged by the socket destructor.
3109 *
3110 * Also, we still have packets pending on the receive queue and probably
3111 * our own packets waiting in device queues. sock_destroy will drain the
3112 * receive queue, but transmitted packets will delay socket destruction
3113 * until the last reference is released.
3114 */
3115
3116 sock_orphan(sk);
3117
3118 xfrm_sk_free_policy(sk);
3119
3120 sk_refcnt_debug_release(sk);
3121
3122 sock_put(sk);
3123 }
3124 EXPORT_SYMBOL(sk_common_release);
3125
3126 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3127 {
3128 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3129
3130 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3131 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3132 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3133 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3134 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3135 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3136 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3137 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3138 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3139 }
3140
3141 #ifdef CONFIG_PROC_FS
3142 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3143 struct prot_inuse {
3144 int val[PROTO_INUSE_NR];
3145 };
3146
3147 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3148
3149 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3150 {
3151 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3152 }
3153 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3154
3155 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3156 {
3157 int cpu, idx = prot->inuse_idx;
3158 int res = 0;
3159
3160 for_each_possible_cpu(cpu)
3161 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3162
3163 return res >= 0 ? res : 0;
3164 }
3165 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3166
3167 static void sock_inuse_add(struct net *net, int val)
3168 {
3169 this_cpu_add(*net->core.sock_inuse, val);
3170 }
3171
3172 int sock_inuse_get(struct net *net)
3173 {
3174 int cpu, res = 0;
3175
3176 for_each_possible_cpu(cpu)
3177 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3178
3179 return res;
3180 }
3181
3182 EXPORT_SYMBOL_GPL(sock_inuse_get);
3183
3184 static int __net_init sock_inuse_init_net(struct net *net)
3185 {
3186 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3187 if (net->core.prot_inuse == NULL)
3188 return -ENOMEM;
3189
3190 net->core.sock_inuse = alloc_percpu(int);
3191 if (net->core.sock_inuse == NULL)
3192 goto out;
3193
3194 return 0;
3195
3196 out:
3197 free_percpu(net->core.prot_inuse);
3198 return -ENOMEM;
3199 }
3200
3201 static void __net_exit sock_inuse_exit_net(struct net *net)
3202 {
3203 free_percpu(net->core.prot_inuse);
3204 free_percpu(net->core.sock_inuse);
3205 }
3206
3207 static struct pernet_operations net_inuse_ops = {
3208 .init = sock_inuse_init_net,
3209 .exit = sock_inuse_exit_net,
3210 };
3211
3212 static __init int net_inuse_init(void)
3213 {
3214 if (register_pernet_subsys(&net_inuse_ops))
3215 panic("Cannot initialize net inuse counters");
3216
3217 return 0;
3218 }
3219
3220 core_initcall(net_inuse_init);
3221
3222 static void assign_proto_idx(struct proto *prot)
3223 {
3224 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3225
3226 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3227 pr_err("PROTO_INUSE_NR exhausted\n");
3228 return;
3229 }
3230
3231 set_bit(prot->inuse_idx, proto_inuse_idx);
3232 }
3233
3234 static void release_proto_idx(struct proto *prot)
3235 {
3236 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3237 clear_bit(prot->inuse_idx, proto_inuse_idx);
3238 }
3239 #else
3240 static inline void assign_proto_idx(struct proto *prot)
3241 {
3242 }
3243
3244 static inline void release_proto_idx(struct proto *prot)
3245 {
3246 }
3247
3248 static void sock_inuse_add(struct net *net, int val)
3249 {
3250 }
3251 #endif
3252
3253 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3254 {
3255 if (!rsk_prot)
3256 return;
3257 kfree(rsk_prot->slab_name);
3258 rsk_prot->slab_name = NULL;
3259 kmem_cache_destroy(rsk_prot->slab);
3260 rsk_prot->slab = NULL;
3261 }
3262
3263 static int req_prot_init(const struct proto *prot)
3264 {
3265 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3266
3267 if (!rsk_prot)
3268 return 0;
3269
3270 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3271 prot->name);
3272 if (!rsk_prot->slab_name)
3273 return -ENOMEM;
3274
3275 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3276 rsk_prot->obj_size, 0,
3277 SLAB_ACCOUNT | prot->slab_flags,
3278 NULL);
3279
3280 if (!rsk_prot->slab) {
3281 pr_crit("%s: Can't create request sock SLAB cache!\n",
3282 prot->name);
3283 return -ENOMEM;
3284 }
3285 return 0;
3286 }
3287
3288 int proto_register(struct proto *prot, int alloc_slab)
3289 {
3290 if (alloc_slab) {
3291 prot->slab = kmem_cache_create_usercopy(prot->name,
3292 prot->obj_size, 0,
3293 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3294 prot->slab_flags,
3295 prot->useroffset, prot->usersize,
3296 NULL);
3297
3298 if (prot->slab == NULL) {
3299 pr_crit("%s: Can't create sock SLAB cache!\n",
3300 prot->name);
3301 goto out;
3302 }
3303
3304 if (req_prot_init(prot))
3305 goto out_free_request_sock_slab;
3306
3307 if (prot->twsk_prot != NULL) {
3308 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3309
3310 if (prot->twsk_prot->twsk_slab_name == NULL)
3311 goto out_free_request_sock_slab;
3312
3313 prot->twsk_prot->twsk_slab =
3314 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3315 prot->twsk_prot->twsk_obj_size,
3316 0,
3317 SLAB_ACCOUNT |
3318 prot->slab_flags,
3319 NULL);
3320 if (prot->twsk_prot->twsk_slab == NULL)
3321 goto out_free_timewait_sock_slab_name;
3322 }
3323 }
3324
3325 mutex_lock(&proto_list_mutex);
3326 list_add(&prot->node, &proto_list);
3327 assign_proto_idx(prot);
3328 mutex_unlock(&proto_list_mutex);
3329 return 0;
3330
3331 out_free_timewait_sock_slab_name:
3332 kfree(prot->twsk_prot->twsk_slab_name);
3333 out_free_request_sock_slab:
3334 req_prot_cleanup(prot->rsk_prot);
3335
3336 kmem_cache_destroy(prot->slab);
3337 prot->slab = NULL;
3338 out:
3339 return -ENOBUFS;
3340 }
3341 EXPORT_SYMBOL(proto_register);
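/*
 * Illustrative sketch (hypothetical module, names assumed): a protocol
 * registers its struct proto once at init time and unregisters it on exit:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);   ... 1: allocate a slab
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 */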
3342
3343 void proto_unregister(struct proto *prot)
3344 {
3345 mutex_lock(&proto_list_mutex);
3346 release_proto_idx(prot);
3347 list_del(&prot->node);
3348 mutex_unlock(&proto_list_mutex);
3349
3350 kmem_cache_destroy(prot->slab);
3351 prot->slab = NULL;
3352
3353 req_prot_cleanup(prot->rsk_prot);
3354
3355 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3356 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3357 kfree(prot->twsk_prot->twsk_slab_name);
3358 prot->twsk_prot->twsk_slab = NULL;
3359 }
3360 }
3361 EXPORT_SYMBOL(proto_unregister);
3362
3363 int sock_load_diag_module(int family, int protocol)
3364 {
3365 if (!protocol) {
3366 if (!sock_is_registered(family))
3367 return -ENOENT;
3368
3369 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3370 NETLINK_SOCK_DIAG, family);
3371 }
3372
3373 #ifdef CONFIG_INET
3374 if (family == AF_INET &&
3375 protocol != IPPROTO_RAW &&
3376 !rcu_access_pointer(inet_protos[protocol]))
3377 return -ENOENT;
3378 #endif
3379
3380 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3381 NETLINK_SOCK_DIAG, family, protocol);
3382 }
3383 EXPORT_SYMBOL(sock_load_diag_module);
3384
3385 #ifdef CONFIG_PROC_FS
3386 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3387 __acquires(proto_list_mutex)
3388 {
3389 mutex_lock(&proto_list_mutex);
3390 return seq_list_start_head(&proto_list, *pos);
3391 }
3392
3393 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3394 {
3395 return seq_list_next(v, &proto_list, pos);
3396 }
3397
3398 static void proto_seq_stop(struct seq_file *seq, void *v)
3399 __releases(proto_list_mutex)
3400 {
3401 mutex_unlock(&proto_list_mutex);
3402 }
3403
3404 static char proto_method_implemented(const void *method)
3405 {
3406 return method == NULL ? 'n' : 'y';
3407 }
3408 static long sock_prot_memory_allocated(struct proto *proto)
3409 {
3410 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3411 }
3412
3413 static char *sock_prot_memory_pressure(struct proto *proto)
3414 {
3415 return proto->memory_pressure != NULL ?
3416 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3417 }
3418
3419 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3420 {
3421
3422 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3423 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3424 proto->name,
3425 proto->obj_size,
3426 sock_prot_inuse_get(seq_file_net(seq), proto),
3427 sock_prot_memory_allocated(proto),
3428 sock_prot_memory_pressure(proto),
3429 proto->max_header,
3430 proto->slab == NULL ? "no" : "yes",
3431 module_name(proto->owner),
3432 proto_method_implemented(proto->close),
3433 proto_method_implemented(proto->connect),
3434 proto_method_implemented(proto->disconnect),
3435 proto_method_implemented(proto->accept),
3436 proto_method_implemented(proto->ioctl),
3437 proto_method_implemented(proto->init),
3438 proto_method_implemented(proto->destroy),
3439 proto_method_implemented(proto->shutdown),
3440 proto_method_implemented(proto->setsockopt),
3441 proto_method_implemented(proto->getsockopt),
3442 proto_method_implemented(proto->sendmsg),
3443 proto_method_implemented(proto->recvmsg),
3444 proto_method_implemented(proto->sendpage),
3445 proto_method_implemented(proto->bind),
3446 proto_method_implemented(proto->backlog_rcv),
3447 proto_method_implemented(proto->hash),
3448 proto_method_implemented(proto->unhash),
3449 proto_method_implemented(proto->get_port),
3450 proto_method_implemented(proto->enter_memory_pressure));
3451 }
3452
3453 static int proto_seq_show(struct seq_file *seq, void *v)
3454 {
3455 if (v == &proto_list)
3456 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3457 "protocol",
3458 "size",
3459 "sockets",
3460 "memory",
3461 "press",
3462 "maxhdr",
3463 "slab",
3464 "module",
3465 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3466 else
3467 proto_seq_printf(seq, list_entry(v, struct proto, node));
3468 return 0;
3469 }
3470
3471 static const struct seq_operations proto_seq_ops = {
3472 .start = proto_seq_start,
3473 .next = proto_seq_next,
3474 .stop = proto_seq_stop,
3475 .show = proto_seq_show,
3476 };
3477
3478 static __net_init int proto_init_net(struct net *net)
3479 {
3480 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3481 sizeof(struct seq_net_private)))
3482 return -ENOMEM;
3483
3484 return 0;
3485 }
3486
3487 static __net_exit void proto_exit_net(struct net *net)
3488 {
3489 remove_proc_entry("protocols", net->proc_net);
3490 }
3491
3492
3493 static __net_initdata struct pernet_operations proto_net_ops = {
3494 .init = proto_init_net,
3495 .exit = proto_exit_net,
3496 };
3497
3498 static int __init proto_init(void)
3499 {
3500 return register_pernet_subsys(&proto_net_ops);
3501 }
3502
3503 subsys_initcall(proto_init);
3504
3505 #endif /* PROC_FS */
3506
3507 #ifdef CONFIG_NET_RX_BUSY_POLL
3508 bool sk_busy_loop_end(void *p, unsigned long start_time)
3509 {
3510 struct sock *sk = p;
3511
3512 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3513 sk_busy_loop_timeout(sk, start_time);
3514 }
3515 EXPORT_SYMBOL(sk_busy_loop_end);
3516 #endif /* CONFIG_NET_RX_BUSY_POLL */
3517