1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116
117 #include <linux/uaccess.h>
118
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/net_namespace.h>
123 #include <net/request_sock.h>
124 #include <net/sock.h>
125 #include <linux/net_tstamp.h>
126 #include <net/xfrm.h>
127 #include <linux/ipsec.h>
128 #include <net/cls_cgroup.h>
129 #include <net/netprio_cgroup.h>
130 #include <linux/sock_diag.h>
131
132 #include <linux/filter.h>
133 #include <net/sock_reuseport.h>
134 #include <net/bpf_sk_storage.h>
135
136 #include <trace/events/sock.h>
137 #include <trace/hooks/net.h>
138
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144
145 static void sock_inuse_add(struct net *net, int val);
146
147 /**
148 * sk_ns_capable - General socket capability test
149 * @sk: Socket to use a capability on or through
150 * @user_ns: The user namespace of the capability to use
151 * @cap: The capability to use
152 *
153 * Test whether the opener of the socket had the capability @cap when
154 * the socket was created and whether the current process has it in the
155 * user namespace @user_ns.
156 */
157 bool sk_ns_capable(const struct sock *sk,
158 struct user_namespace *user_ns, int cap)
159 {
160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164
165 /**
166 * sk_capable - Socket global capability test
167 * @sk: Socket to use a capability on or through
168 * @cap: The global capability to use
169 *
170 * Test whether the opener of the socket had the capability @cap when
171 * the socket was created and whether the current process has it in all
172 * user namespaces.
173 */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179
180 /**
181 * sk_net_capable - Network namespace socket capability test
182 * @sk: Socket to use a capability on or through
183 * @cap: The capability to use
184 *
185 * Test whether the opener of the socket had the capability @cap when the
186 * socket was created and whether the current process has it over the
187 * network namespace the socket is a member of.
188 */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
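/*
 * Usage sketch (illustrative, not part of this file): a protocol that
 * wants to gate a privileged option on the socket *opener* rather than
 * only on the current task could use one of the helpers above.  The
 * option name below is hypothetical:
 *
 *	case SO_EXAMPLE_PRIV_OPT:
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		break;
 *
 * Because both the opener's file credentials and the current task are
 * checked, an unprivileged opener cannot gain the capability by handing
 * the fd to a privileged process.
 */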
194
195 /*
196 * Each address family might have different locking rules, so we have
197 * one slock key per address family and separate keys for internal and
198 * userspace sockets.
199 */
200 static struct lock_class_key af_family_keys[AF_MAX];
201 static struct lock_class_key af_family_kern_keys[AF_MAX];
202 static struct lock_class_key af_family_slock_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204
205 /*
206 * Make lock validator output more readable. (We pre-construct these
207 * strings at build time, so that runtime initialization of socket
208 * locks is fast.)
209 */
210
211 #define _sock_locks(x) \
212 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
213 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
214 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
215 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
216 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
217 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
218 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
219 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
220 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
221 x "27" , x "28" , x "AF_CAN" , \
222 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
223 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
224 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
225 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
226 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
227 x "AF_MAX"
228
229 static const char *const af_family_key_strings[AF_MAX+1] = {
230 _sock_locks("sk_lock-")
231 };
232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233 _sock_locks("slock-")
234 };
235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236 _sock_locks("clock-")
237 };
238
239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240 _sock_locks("k-sk_lock-")
241 };
242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243 _sock_locks("k-slock-")
244 };
245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246 _sock_locks("k-clock-")
247 };
248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249 _sock_locks("rlock-")
250 };
251 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
252 _sock_locks("wlock-")
253 };
254 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
255 _sock_locks("elock-")
256 };
257
258 /*
259 * sk_callback_lock and sk queues locking rules are per-address-family,
260 * so split the lock classes by using a per-AF key:
261 */
262 static struct lock_class_key af_callback_keys[AF_MAX];
263 static struct lock_class_key af_rlock_keys[AF_MAX];
264 static struct lock_class_key af_wlock_keys[AF_MAX];
265 static struct lock_class_key af_elock_keys[AF_MAX];
266 static struct lock_class_key af_kern_callback_keys[AF_MAX];
267
268 /* Run time adjustable parameters. */
269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
270 EXPORT_SYMBOL(sysctl_wmem_max);
271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
272 EXPORT_SYMBOL(sysctl_rmem_max);
273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
275
276 /* Maximal space eaten by iovec or ancillary data plus some space */
277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
278 EXPORT_SYMBOL(sysctl_optmem_max);
279
280 int sysctl_tstamp_allow_data __read_mostly = 1;
281
282 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
283 EXPORT_SYMBOL_GPL(memalloc_socks_key);
284
285 /**
286 * sk_set_memalloc - sets %SOCK_MEMALLOC
287 * @sk: socket to set it on
288 *
289 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290 * It's the responsibility of the admin to adjust min_free_kbytes
291 * to meet the requirements
292 */
293 void sk_set_memalloc(struct sock *sk)
294 {
295 sock_set_flag(sk, SOCK_MEMALLOC);
296 sk->sk_allocation |= __GFP_MEMALLOC;
297 static_branch_inc(&memalloc_socks_key);
298 }
299 EXPORT_SYMBOL_GPL(sk_set_memalloc);
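/*
 * Usage sketch (illustrative only): a kernel-side transport that must
 * keep making progress under memory pressure (for instance a socket
 * backing a swap file) would typically mark its socket right after
 * creation:
 *
 *	err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
 *	if (err < 0)
 *		return err;
 *	sk_set_memalloc(sock->sk);
 *
 * sk_clear_memalloc() below is the matching teardown step once the last
 * such user (e.g. the last swapfile) goes away.
 */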
300
301 void sk_clear_memalloc(struct sock *sk)
302 {
303 sock_reset_flag(sk, SOCK_MEMALLOC);
304 sk->sk_allocation &= ~__GFP_MEMALLOC;
305 static_branch_dec(&memalloc_socks_key);
306
307 /*
308 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
309 * progress of swapping. SOCK_MEMALLOC may be cleared while
310 * it has rmem allocations due to the last swapfile being deactivated
311 * but there is a risk that the socket is unusable due to exceeding
312 * the rmem limits. Reclaim the reserves and obey rmem limits again.
313 */
314 sk_mem_reclaim(sk);
315 }
316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
317
318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319 {
320 int ret;
321 unsigned int noreclaim_flag;
322
323 /* these should have been dropped before queueing */
324 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325
326 noreclaim_flag = memalloc_noreclaim_save();
327 ret = sk->sk_backlog_rcv(sk, skb);
328 memalloc_noreclaim_restore(noreclaim_flag);
329
330 return ret;
331 }
332 EXPORT_SYMBOL(__sk_backlog_rcv);
333
334 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
335 {
336 struct __kernel_sock_timeval tv;
337 int size;
338
339 if (timeo == MAX_SCHEDULE_TIMEOUT) {
340 tv.tv_sec = 0;
341 tv.tv_usec = 0;
342 } else {
343 tv.tv_sec = timeo / HZ;
344 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
345 }
346
347 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
348 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
349 *(struct old_timeval32 *)optval = tv32;
350 return sizeof(tv32);
351 }
352
353 if (old_timeval) {
354 struct __kernel_old_timeval old_tv;
355 old_tv.tv_sec = tv.tv_sec;
356 old_tv.tv_usec = tv.tv_usec;
357 *(struct __kernel_old_timeval *)optval = old_tv;
358 size = sizeof(old_tv);
359 } else {
360 *(struct __kernel_sock_timeval *)optval = tv;
361 size = sizeof(tv);
362 }
363
364 return size;
365 }
366
367 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
368 {
369 struct __kernel_sock_timeval tv;
370
371 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
372 struct old_timeval32 tv32;
373
374 if (optlen < sizeof(tv32))
375 return -EINVAL;
376
377 if (copy_from_user(&tv32, optval, sizeof(tv32)))
378 return -EFAULT;
379 tv.tv_sec = tv32.tv_sec;
380 tv.tv_usec = tv32.tv_usec;
381 } else if (old_timeval) {
382 struct __kernel_old_timeval old_tv;
383
384 if (optlen < sizeof(old_tv))
385 return -EINVAL;
386 if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
387 return -EFAULT;
388 tv.tv_sec = old_tv.tv_sec;
389 tv.tv_usec = old_tv.tv_usec;
390 } else {
391 if (optlen < sizeof(tv))
392 return -EINVAL;
393 if (copy_from_user(&tv, optval, sizeof(tv)))
394 return -EFAULT;
395 }
396 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
397 return -EDOM;
398
399 if (tv.tv_sec < 0) {
400 static int warned __read_mostly;
401
402 *timeo_p = 0;
403 if (warned < 10 && net_ratelimit()) {
404 warned++;
405 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
406 __func__, current->comm, task_pid_nr(current));
407 }
408 return 0;
409 }
410 *timeo_p = MAX_SCHEDULE_TIMEOUT;
411 if (tv.tv_sec == 0 && tv.tv_usec == 0)
412 return 0;
413 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
414 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
415 return 0;
416 }
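/*
 * Worked example of the conversion above (numbers are illustrative):
 * with HZ == 250, a request of { .tv_sec = 1, .tv_usec = 500000 } gives
 *
 *	*timeo_p = 1 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ)
 *	         = 250 + 125 = 375 jiffies
 *
 * while { 0, 0 } leaves MAX_SCHEDULE_TIMEOUT in place, i.e. "block
 * forever".
 */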
417
418 static void sock_warn_obsolete_bsdism(const char *name)
419 {
420 static int warned;
421 static char warncomm[TASK_COMM_LEN];
422 if (strcmp(warncomm, current->comm) && warned < 5) {
423 strcpy(warncomm, current->comm);
424 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
425 warncomm, name);
426 warned++;
427 }
428 }
429
430 static bool sock_needs_netstamp(const struct sock *sk)
431 {
432 switch (sk->sk_family) {
433 case AF_UNSPEC:
434 case AF_UNIX:
435 return false;
436 default:
437 return true;
438 }
439 }
440
441 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
442 {
443 if (sk->sk_flags & flags) {
444 sk->sk_flags &= ~flags;
445 if (sock_needs_netstamp(sk) &&
446 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
447 net_disable_timestamp();
448 }
449 }
450
451
452 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
453 {
454 unsigned long flags;
455 struct sk_buff_head *list = &sk->sk_receive_queue;
456
457 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
458 atomic_inc(&sk->sk_drops);
459 trace_sock_rcvqueue_full(sk, skb);
460 return -ENOMEM;
461 }
462
463 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
464 atomic_inc(&sk->sk_drops);
465 return -ENOBUFS;
466 }
467
468 skb->dev = NULL;
469 skb_set_owner_r(skb, sk);
470
471 /* We escape from the RCU protected region; make sure we don't leak
472 * a non-refcounted dst.
473 */
474 skb_dst_force(skb);
475
476 spin_lock_irqsave(&list->lock, flags);
477 sock_skb_set_dropcount(sk, skb);
478 __skb_queue_tail(list, skb);
479 spin_unlock_irqrestore(&list->lock, flags);
480
481 if (!sock_flag(sk, SOCK_DEAD))
482 sk->sk_data_ready(sk);
483 return 0;
484 }
485 EXPORT_SYMBOL(__sock_queue_rcv_skb);
486
487 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
488 {
489 int err;
490
491 err = sk_filter(sk, skb);
492 if (err)
493 return err;
494
495 return __sock_queue_rcv_skb(sk, skb);
496 }
497 EXPORT_SYMBOL(sock_queue_rcv_skb);
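/*
 * Usage sketch (illustrative, not taken from a real protocol): a
 * datagram protocol's receive handler typically ends with
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 *
 * i.e. on failure the caller still owns the skb and must free it; on
 * success ownership (and the rmem accounting done by skb_set_owner_r())
 * moves to the socket.
 */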
498
499 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
500 const int nested, unsigned int trim_cap, bool refcounted)
501 {
502 int rc = NET_RX_SUCCESS;
503
504 if (sk_filter_trim_cap(sk, skb, trim_cap))
505 goto discard_and_relse;
506
507 skb->dev = NULL;
508
509 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
510 atomic_inc(&sk->sk_drops);
511 goto discard_and_relse;
512 }
513 if (nested)
514 bh_lock_sock_nested(sk);
515 else
516 bh_lock_sock(sk);
517 if (!sock_owned_by_user(sk)) {
518 /*
519 * trylock + unlock semantics:
520 */
521 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
522
523 rc = sk_backlog_rcv(sk, skb);
524
525 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
526 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
527 bh_unlock_sock(sk);
528 atomic_inc(&sk->sk_drops);
529 goto discard_and_relse;
530 }
531
532 bh_unlock_sock(sk);
533 out:
534 if (refcounted)
535 sock_put(sk);
536 return rc;
537 discard_and_relse:
538 kfree_skb(skb);
539 goto out;
540 }
541 EXPORT_SYMBOL(__sk_receive_skb);
542
543 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
544 {
545 struct dst_entry *dst = __sk_dst_get(sk);
546
547 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
548 sk_tx_queue_clear(sk);
549 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
550 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
551 dst_release(dst);
552 return NULL;
553 }
554
555 return dst;
556 }
557 EXPORT_SYMBOL(__sk_dst_check);
558
559 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
560 {
561 struct dst_entry *dst = sk_dst_get(sk);
562
563 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
564 sk_dst_reset(sk);
565 dst_release(dst);
566 return NULL;
567 }
568
569 return dst;
570 }
571 EXPORT_SYMBOL(sk_dst_check);
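/*
 * Usage sketch (illustrative): a connected socket's output path can
 * revalidate its cached route before each transmit.  The cookie is
 * protocol specific (0 where unused) and the re-lookup helper named
 * here is hypothetical:
 *
 *	dst = sk_dst_check(sk, 0);
 *	if (!dst)
 *		dst = my_proto_reroute(sk);
 *
 * A stale entry has already been released and the socket's dst cache
 * reset by sk_dst_check() itself, so the caller only has to perform the
 * new lookup and sk_dst_set().
 */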
572
573 static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
574 {
575 int ret = -ENOPROTOOPT;
576 #ifdef CONFIG_NETDEVICES
577 struct net *net = sock_net(sk);
578
579 /* Sorry... */
580 ret = -EPERM;
581 if (!ns_capable(net->user_ns, CAP_NET_RAW))
582 goto out;
583
584 ret = -EINVAL;
585 if (ifindex < 0)
586 goto out;
587
588 sk->sk_bound_dev_if = ifindex;
589 if (sk->sk_prot->rehash)
590 sk->sk_prot->rehash(sk);
591 sk_dst_reset(sk);
592
593 ret = 0;
594
595 out:
596 #endif
597
598 return ret;
599 }
600
601 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
602 int optlen)
603 {
604 int ret = -ENOPROTOOPT;
605 #ifdef CONFIG_NETDEVICES
606 struct net *net = sock_net(sk);
607 char devname[IFNAMSIZ];
608 int index;
609
610 ret = -EINVAL;
611 if (optlen < 0)
612 goto out;
613
614 /* Bind this socket to a particular device like "eth0",
615 * as specified in the passed interface name. If the
616 * name is "" or the option length is zero the socket
617 * is not bound.
618 */
619 if (optlen > IFNAMSIZ - 1)
620 optlen = IFNAMSIZ - 1;
621 memset(devname, 0, sizeof(devname));
622
623 ret = -EFAULT;
624 if (copy_from_user(devname, optval, optlen))
625 goto out;
626
627 index = 0;
628 if (devname[0] != '\0') {
629 struct net_device *dev;
630
631 rcu_read_lock();
632 dev = dev_get_by_name_rcu(net, devname);
633 if (dev)
634 index = dev->ifindex;
635 rcu_read_unlock();
636 ret = -ENODEV;
637 if (!dev)
638 goto out;
639 }
640
641 lock_sock(sk);
642 ret = sock_setbindtodevice_locked(sk, index);
643 release_sock(sk);
644
645 out:
646 #endif
647
648 return ret;
649 }
650
651 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
652 int __user *optlen, int len)
653 {
654 int ret = -ENOPROTOOPT;
655 #ifdef CONFIG_NETDEVICES
656 struct net *net = sock_net(sk);
657 char devname[IFNAMSIZ];
658
659 if (sk->sk_bound_dev_if == 0) {
660 len = 0;
661 goto zero;
662 }
663
664 ret = -EINVAL;
665 if (len < IFNAMSIZ)
666 goto out;
667
668 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
669 if (ret)
670 goto out;
671
672 len = strlen(devname) + 1;
673
674 ret = -EFAULT;
675 if (copy_to_user(optval, devname, len))
676 goto out;
677
678 zero:
679 ret = -EFAULT;
680 if (put_user(len, optlen))
681 goto out;
682
683 ret = 0;
684
685 out:
686 #endif
687
688 return ret;
689 }
690
691 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
692 {
693 if (valbool)
694 sock_set_flag(sk, bit);
695 else
696 sock_reset_flag(sk, bit);
697 }
698
699 bool sk_mc_loop(struct sock *sk)
700 {
701 if (dev_recursion_level())
702 return false;
703 if (!sk)
704 return true;
705 /* IPV6_ADDRFORM can change sk->sk_family under us. */
706 switch (READ_ONCE(sk->sk_family)) {
707 case AF_INET:
708 return inet_sk(sk)->mc_loop;
709 #if IS_ENABLED(CONFIG_IPV6)
710 case AF_INET6:
711 return inet6_sk(sk)->mc_loop;
712 #endif
713 }
714 WARN_ON_ONCE(1);
715 return true;
716 }
717 EXPORT_SYMBOL(sk_mc_loop);
718
719 /*
720 * This is meant for all protocols to use and covers goings on
721 * at the socket level. Everything here is generic.
722 */
723
724 int sock_setsockopt(struct socket *sock, int level, int optname,
725 char __user *optval, unsigned int optlen)
726 {
727 struct sock_txtime sk_txtime;
728 struct sock *sk = sock->sk;
729 int val;
730 int valbool;
731 struct linger ling;
732 int ret = 0;
733
734 /*
735 * Options without arguments
736 */
737
738 if (optname == SO_BINDTODEVICE)
739 return sock_setbindtodevice(sk, optval, optlen);
740
741 if (optlen < sizeof(int))
742 return -EINVAL;
743
744 if (get_user(val, (int __user *)optval))
745 return -EFAULT;
746
747 valbool = val ? 1 : 0;
748
749 lock_sock(sk);
750
751 switch (optname) {
752 case SO_DEBUG:
753 if (val && !capable(CAP_NET_ADMIN))
754 ret = -EACCES;
755 else
756 sock_valbool_flag(sk, SOCK_DBG, valbool);
757 break;
758 case SO_REUSEADDR:
759 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
760 break;
761 case SO_REUSEPORT:
762 sk->sk_reuseport = valbool;
763 break;
764 case SO_TYPE:
765 case SO_PROTOCOL:
766 case SO_DOMAIN:
767 case SO_ERROR:
768 ret = -ENOPROTOOPT;
769 break;
770 case SO_DONTROUTE:
771 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
772 sk_dst_reset(sk);
773 break;
774 case SO_BROADCAST:
775 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
776 break;
777 case SO_SNDBUF:
778 /* Don't return an error on this; BSD doesn't, and if you
779 * think about it this is right. Otherwise apps have to
780 * play 'guess the biggest size' games. RCVBUF/SNDBUF
781 * are treated in BSD as hints.
782 */
783 val = min_t(u32, val, sysctl_wmem_max);
784 set_sndbuf:
785 /* Ensure val * 2 fits into an int, to prevent max_t()
786 * from treating it as a negative value.
787 */
788 val = min_t(int, val, INT_MAX / 2);
789 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
790 WRITE_ONCE(sk->sk_sndbuf,
791 max_t(int, val * 2, SOCK_MIN_SNDBUF));
792 /* Wake up sending tasks if we upped the value. */
793 sk->sk_write_space(sk);
794 break;
795
796 case SO_SNDBUFFORCE:
797 if (!capable(CAP_NET_ADMIN)) {
798 ret = -EPERM;
799 break;
800 }
801
802 /* No negative values (to prevent underflow, as val will be
803 * multiplied by 2).
804 */
805 if (val < 0)
806 val = 0;
807 goto set_sndbuf;
808
809 case SO_RCVBUF:
810 /* Don't return an error on this; BSD doesn't, and if you
811 * think about it this is right. Otherwise apps have to
812 * play 'guess the biggest size' games. RCVBUF/SNDBUF
813 * are treated in BSD as hints.
814 */
815 val = min_t(u32, val, sysctl_rmem_max);
816 set_rcvbuf:
817 /* Ensure val * 2 fits into an int, to prevent max_t()
818 * from treating it as a negative value.
819 */
820 val = min_t(int, val, INT_MAX / 2);
821 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
822 /*
823 * We double it on the way in to account for
824 * "struct sk_buff" etc. overhead. Applications
825 * assume that the SO_RCVBUF setting they make will
826 * allow that much actual data to be received on that
827 * socket.
828 *
829 * Applications are unaware that "struct sk_buff" and
830 * other overheads allocate from the receive buffer
831 * during socket buffer allocation.
832 *
833 * And after considering the possible alternatives,
834 * returning the value we actually used in getsockopt
835 * is the most desirable behavior.
836 */
837 WRITE_ONCE(sk->sk_rcvbuf,
838 max_t(int, val * 2, SOCK_MIN_RCVBUF));
839 break;
840
841 case SO_RCVBUFFORCE:
842 if (!capable(CAP_NET_ADMIN)) {
843 ret = -EPERM;
844 break;
845 }
846
847 /* No negative values (to prevent underflow, as val will be
848 * multiplied by 2).
849 */
850 if (val < 0)
851 val = 0;
852 goto set_rcvbuf;
853
854 case SO_KEEPALIVE:
855 if (sk->sk_prot->keepalive)
856 sk->sk_prot->keepalive(sk, valbool);
857 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
858 break;
859
860 case SO_OOBINLINE:
861 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
862 break;
863
864 case SO_NO_CHECK:
865 sk->sk_no_check_tx = valbool;
866 break;
867
868 case SO_PRIORITY:
869 if ((val >= 0 && val <= 6) ||
870 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
871 sk->sk_priority = val;
872 else
873 ret = -EPERM;
874 break;
875
876 case SO_LINGER:
877 if (optlen < sizeof(ling)) {
878 ret = -EINVAL; /* 1003.1g */
879 break;
880 }
881 if (copy_from_user(&ling, optval, sizeof(ling))) {
882 ret = -EFAULT;
883 break;
884 }
885 if (!ling.l_onoff)
886 sock_reset_flag(sk, SOCK_LINGER);
887 else {
888 #if (BITS_PER_LONG == 32)
889 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
890 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
891 else
892 #endif
893 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
894 sock_set_flag(sk, SOCK_LINGER);
895 }
896 break;
897
898 case SO_BSDCOMPAT:
899 sock_warn_obsolete_bsdism("setsockopt");
900 break;
901
902 case SO_PASSCRED:
903 if (valbool)
904 set_bit(SOCK_PASSCRED, &sock->flags);
905 else
906 clear_bit(SOCK_PASSCRED, &sock->flags);
907 break;
908
909 case SO_TIMESTAMP_OLD:
910 case SO_TIMESTAMP_NEW:
911 case SO_TIMESTAMPNS_OLD:
912 case SO_TIMESTAMPNS_NEW:
913 if (valbool) {
914 if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
915 sock_set_flag(sk, SOCK_TSTAMP_NEW);
916 else
917 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
918
919 if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
920 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
921 else
922 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
923 sock_set_flag(sk, SOCK_RCVTSTAMP);
924 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
925 } else {
926 sock_reset_flag(sk, SOCK_RCVTSTAMP);
927 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
928 }
929 break;
930
931 case SO_TIMESTAMPING_NEW:
932 case SO_TIMESTAMPING_OLD:
933 if (val & ~SOF_TIMESTAMPING_MASK) {
934 ret = -EINVAL;
935 break;
936 }
937
938 if (val & SOF_TIMESTAMPING_OPT_ID &&
939 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
940 if (sk->sk_protocol == IPPROTO_TCP &&
941 sk->sk_type == SOCK_STREAM) {
942 if ((1 << sk->sk_state) &
943 (TCPF_CLOSE | TCPF_LISTEN)) {
944 ret = -EINVAL;
945 break;
946 }
947 sk->sk_tskey = tcp_sk(sk)->snd_una;
948 } else {
949 sk->sk_tskey = 0;
950 }
951 }
952
953 if (val & SOF_TIMESTAMPING_OPT_STATS &&
954 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
955 ret = -EINVAL;
956 break;
957 }
958
959 sk->sk_tsflags = val;
960 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
961
962 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
963 sock_enable_timestamp(sk,
964 SOCK_TIMESTAMPING_RX_SOFTWARE);
965 else
966 sock_disable_timestamp(sk,
967 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
968 break;
969
970 case SO_RCVLOWAT:
971 if (val < 0)
972 val = INT_MAX;
973 if (sock->ops->set_rcvlowat)
974 ret = sock->ops->set_rcvlowat(sk, val);
975 else
976 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
977 break;
978
979 case SO_RCVTIMEO_OLD:
980 case SO_RCVTIMEO_NEW:
981 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
982 break;
983
984 case SO_SNDTIMEO_OLD:
985 case SO_SNDTIMEO_NEW:
986 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
987 break;
988
989 case SO_ATTACH_FILTER:
990 ret = -EINVAL;
991 if (optlen == sizeof(struct sock_fprog)) {
992 struct sock_fprog fprog;
993
994 ret = -EFAULT;
995 if (copy_from_user(&fprog, optval, sizeof(fprog)))
996 break;
997
998 ret = sk_attach_filter(&fprog, sk);
999 }
1000 break;
1001
1002 case SO_ATTACH_BPF:
1003 ret = -EINVAL;
1004 if (optlen == sizeof(u32)) {
1005 u32 ufd;
1006
1007 ret = -EFAULT;
1008 if (copy_from_user(&ufd, optval, sizeof(ufd)))
1009 break;
1010
1011 ret = sk_attach_bpf(ufd, sk);
1012 }
1013 break;
1014
1015 case SO_ATTACH_REUSEPORT_CBPF:
1016 ret = -EINVAL;
1017 if (optlen == sizeof(struct sock_fprog)) {
1018 struct sock_fprog fprog;
1019
1020 ret = -EFAULT;
1021 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1022 break;
1023
1024 ret = sk_reuseport_attach_filter(&fprog, sk);
1025 }
1026 break;
1027
1028 case SO_ATTACH_REUSEPORT_EBPF:
1029 ret = -EINVAL;
1030 if (optlen == sizeof(u32)) {
1031 u32 ufd;
1032
1033 ret = -EFAULT;
1034 if (copy_from_user(&ufd, optval, sizeof(ufd)))
1035 break;
1036
1037 ret = sk_reuseport_attach_bpf(ufd, sk);
1038 }
1039 break;
1040
1041 case SO_DETACH_REUSEPORT_BPF:
1042 ret = reuseport_detach_prog(sk);
1043 break;
1044
1045 case SO_DETACH_FILTER:
1046 ret = sk_detach_filter(sk);
1047 break;
1048
1049 case SO_LOCK_FILTER:
1050 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1051 ret = -EPERM;
1052 else
1053 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1054 break;
1055
1056 case SO_PASSSEC:
1057 if (valbool)
1058 set_bit(SOCK_PASSSEC, &sock->flags);
1059 else
1060 clear_bit(SOCK_PASSSEC, &sock->flags);
1061 break;
1062 case SO_MARK:
1063 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1064 ret = -EPERM;
1065 } else if (val != sk->sk_mark) {
1066 sk->sk_mark = val;
1067 sk_dst_reset(sk);
1068 }
1069 break;
1070
1071 case SO_RXQ_OVFL:
1072 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1073 break;
1074
1075 case SO_WIFI_STATUS:
1076 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1077 break;
1078
1079 case SO_PEEK_OFF:
1080 if (sock->ops->set_peek_off)
1081 ret = sock->ops->set_peek_off(sk, val);
1082 else
1083 ret = -EOPNOTSUPP;
1084 break;
1085
1086 case SO_NOFCS:
1087 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1088 break;
1089
1090 case SO_SELECT_ERR_QUEUE:
1091 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1092 break;
1093
1094 #ifdef CONFIG_NET_RX_BUSY_POLL
1095 case SO_BUSY_POLL:
1096 /* allow unprivileged users to decrease the value */
1097 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1098 ret = -EPERM;
1099 else {
1100 if (val < 0)
1101 ret = -EINVAL;
1102 else
1103 WRITE_ONCE(sk->sk_ll_usec, val);
1104 }
1105 break;
1106 #endif
1107
1108 case SO_MAX_PACING_RATE:
1109 {
1110 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1111
1112 if (sizeof(ulval) != sizeof(val) &&
1113 optlen >= sizeof(ulval) &&
1114 get_user(ulval, (unsigned long __user *)optval)) {
1115 ret = -EFAULT;
1116 break;
1117 }
1118 if (ulval != ~0UL)
1119 cmpxchg(&sk->sk_pacing_status,
1120 SK_PACING_NONE,
1121 SK_PACING_NEEDED);
1122 /* Pairs with READ_ONCE() in sock_getsockopt() */
1123 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1124 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1125 break;
1126 }
1127 case SO_INCOMING_CPU:
1128 WRITE_ONCE(sk->sk_incoming_cpu, val);
1129 break;
1130
1131 case SO_CNX_ADVICE:
1132 if (val == 1)
1133 dst_negative_advice(sk);
1134 break;
1135
1136 case SO_ZEROCOPY:
1137 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1138 if (!((sk->sk_type == SOCK_STREAM &&
1139 sk->sk_protocol == IPPROTO_TCP) ||
1140 (sk->sk_type == SOCK_DGRAM &&
1141 sk->sk_protocol == IPPROTO_UDP)))
1142 ret = -ENOTSUPP;
1143 } else if (sk->sk_family != PF_RDS) {
1144 ret = -ENOTSUPP;
1145 }
1146 if (!ret) {
1147 if (val < 0 || val > 1)
1148 ret = -EINVAL;
1149 else
1150 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1151 }
1152 break;
1153
1154 case SO_TXTIME:
1155 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1156 ret = -EPERM;
1157 } else if (optlen != sizeof(struct sock_txtime)) {
1158 ret = -EINVAL;
1159 } else if (copy_from_user(&sk_txtime, optval,
1160 sizeof(struct sock_txtime))) {
1161 ret = -EFAULT;
1162 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1163 ret = -EINVAL;
1164 } else {
1165 sock_valbool_flag(sk, SOCK_TXTIME, true);
1166 sk->sk_clockid = sk_txtime.clockid;
1167 sk->sk_txtime_deadline_mode =
1168 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1169 sk->sk_txtime_report_errors =
1170 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1171 }
1172 break;
1173
1174 case SO_BINDTOIFINDEX:
1175 ret = sock_setbindtodevice_locked(sk, val);
1176 break;
1177
1178 default:
1179 ret = -ENOPROTOOPT;
1180 break;
1181 }
1182 release_sock(sk);
1183 return ret;
1184 }
1185 EXPORT_SYMBOL(sock_setsockopt);
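/*
 * User-space view of the SO_RCVBUF/SO_SNDBUF handling above (sketch;
 * exact numbers depend on the sysctl limits):
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *
 * getsockopt() reports roughly 2 * val (here 131072), because the
 * kernel first clamps the request to sysctl_rmem_max and then doubles
 * it to cover struct sk_buff and other per-packet overhead.
 */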
1186
1187 static const struct cred *sk_get_peer_cred(struct sock *sk)
1188 {
1189 const struct cred *cred;
1190
1191 spin_lock(&sk->sk_peer_lock);
1192 cred = get_cred(sk->sk_peer_cred);
1193 spin_unlock(&sk->sk_peer_lock);
1194
1195 return cred;
1196 }
1197
1198 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1199 struct ucred *ucred)
1200 {
1201 ucred->pid = pid_vnr(pid);
1202 ucred->uid = ucred->gid = -1;
1203 if (cred) {
1204 struct user_namespace *current_ns = current_user_ns();
1205
1206 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1207 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1208 }
1209 }
1210
1211 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1212 {
1213 struct user_namespace *user_ns = current_user_ns();
1214 int i;
1215
1216 for (i = 0; i < src->ngroups; i++)
1217 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1218 return -EFAULT;
1219
1220 return 0;
1221 }
1222
1223 int sock_getsockopt(struct socket *sock, int level, int optname,
1224 char __user *optval, int __user *optlen)
1225 {
1226 struct sock *sk = sock->sk;
1227
1228 union {
1229 int val;
1230 u64 val64;
1231 unsigned long ulval;
1232 struct linger ling;
1233 struct old_timeval32 tm32;
1234 struct __kernel_old_timeval tm;
1235 struct __kernel_sock_timeval stm;
1236 struct sock_txtime txtime;
1237 } v;
1238
1239 int lv = sizeof(int);
1240 int len;
1241
1242 if (get_user(len, optlen))
1243 return -EFAULT;
1244 if (len < 0)
1245 return -EINVAL;
1246
1247 memset(&v, 0, sizeof(v));
1248
1249 switch (optname) {
1250 case SO_DEBUG:
1251 v.val = sock_flag(sk, SOCK_DBG);
1252 break;
1253
1254 case SO_DONTROUTE:
1255 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1256 break;
1257
1258 case SO_BROADCAST:
1259 v.val = sock_flag(sk, SOCK_BROADCAST);
1260 break;
1261
1262 case SO_SNDBUF:
1263 v.val = READ_ONCE(sk->sk_sndbuf);
1264 break;
1265
1266 case SO_RCVBUF:
1267 v.val = READ_ONCE(sk->sk_rcvbuf);
1268 break;
1269
1270 case SO_REUSEADDR:
1271 v.val = sk->sk_reuse;
1272 break;
1273
1274 case SO_REUSEPORT:
1275 v.val = sk->sk_reuseport;
1276 break;
1277
1278 case SO_KEEPALIVE:
1279 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1280 break;
1281
1282 case SO_TYPE:
1283 v.val = sk->sk_type;
1284 break;
1285
1286 case SO_PROTOCOL:
1287 v.val = sk->sk_protocol;
1288 break;
1289
1290 case SO_DOMAIN:
1291 v.val = sk->sk_family;
1292 break;
1293
1294 case SO_ERROR:
1295 v.val = -sock_error(sk);
1296 if (v.val == 0)
1297 v.val = xchg(&sk->sk_err_soft, 0);
1298 break;
1299
1300 case SO_OOBINLINE:
1301 v.val = sock_flag(sk, SOCK_URGINLINE);
1302 break;
1303
1304 case SO_NO_CHECK:
1305 v.val = sk->sk_no_check_tx;
1306 break;
1307
1308 case SO_PRIORITY:
1309 v.val = sk->sk_priority;
1310 break;
1311
1312 case SO_LINGER:
1313 lv = sizeof(v.ling);
1314 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1315 v.ling.l_linger = sk->sk_lingertime / HZ;
1316 break;
1317
1318 case SO_BSDCOMPAT:
1319 sock_warn_obsolete_bsdism("getsockopt");
1320 break;
1321
1322 case SO_TIMESTAMP_OLD:
1323 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1324 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1325 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1326 break;
1327
1328 case SO_TIMESTAMPNS_OLD:
1329 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1330 break;
1331
1332 case SO_TIMESTAMP_NEW:
1333 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1334 break;
1335
1336 case SO_TIMESTAMPNS_NEW:
1337 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1338 break;
1339
1340 case SO_TIMESTAMPING_OLD:
1341 v.val = sk->sk_tsflags;
1342 break;
1343
1344 case SO_RCVTIMEO_OLD:
1345 case SO_RCVTIMEO_NEW:
1346 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1347 break;
1348
1349 case SO_SNDTIMEO_OLD:
1350 case SO_SNDTIMEO_NEW:
1351 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1352 break;
1353
1354 case SO_RCVLOWAT:
1355 v.val = READ_ONCE(sk->sk_rcvlowat);
1356 break;
1357
1358 case SO_SNDLOWAT:
1359 v.val = 1;
1360 break;
1361
1362 case SO_PASSCRED:
1363 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1364 break;
1365
1366 case SO_PEERCRED:
1367 {
1368 struct ucred peercred;
1369 if (len > sizeof(peercred))
1370 len = sizeof(peercred);
1371
1372 spin_lock(&sk->sk_peer_lock);
1373 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1374 spin_unlock(&sk->sk_peer_lock);
1375
1376 if (copy_to_user(optval, &peercred, len))
1377 return -EFAULT;
1378 goto lenout;
1379 }
1380
1381 case SO_PEERGROUPS:
1382 {
1383 const struct cred *cred;
1384 int ret, n;
1385
1386 cred = sk_get_peer_cred(sk);
1387 if (!cred)
1388 return -ENODATA;
1389
1390 n = cred->group_info->ngroups;
1391 if (len < n * sizeof(gid_t)) {
1392 len = n * sizeof(gid_t);
1393 put_cred(cred);
1394 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1395 }
1396 len = n * sizeof(gid_t);
1397
1398 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1399 put_cred(cred);
1400 if (ret)
1401 return ret;
1402 goto lenout;
1403 }
1404
1405 case SO_PEERNAME:
1406 {
1407 char address[128];
1408
1409 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1410 if (lv < 0)
1411 return -ENOTCONN;
1412 if (lv < len)
1413 return -EINVAL;
1414 if (copy_to_user(optval, address, len))
1415 return -EFAULT;
1416 goto lenout;
1417 }
1418
1419 /* Dubious BSD thing... Probably nobody even uses it, but
1420 * the UNIX standard wants it for whatever reason... -DaveM
1421 */
1422 case SO_ACCEPTCONN:
1423 v.val = sk->sk_state == TCP_LISTEN;
1424 break;
1425
1426 case SO_PASSSEC:
1427 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1428 break;
1429
1430 case SO_PEERSEC:
1431 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1432
1433 case SO_MARK:
1434 v.val = sk->sk_mark;
1435 break;
1436
1437 case SO_RXQ_OVFL:
1438 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1439 break;
1440
1441 case SO_WIFI_STATUS:
1442 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1443 break;
1444
1445 case SO_PEEK_OFF:
1446 if (!sock->ops->set_peek_off)
1447 return -EOPNOTSUPP;
1448
1449 v.val = READ_ONCE(sk->sk_peek_off);
1450 break;
1451 case SO_NOFCS:
1452 v.val = sock_flag(sk, SOCK_NOFCS);
1453 break;
1454
1455 case SO_BINDTODEVICE:
1456 return sock_getbindtodevice(sk, optval, optlen, len);
1457
1458 case SO_GET_FILTER:
1459 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1460 if (len < 0)
1461 return len;
1462
1463 goto lenout;
1464
1465 case SO_LOCK_FILTER:
1466 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1467 break;
1468
1469 case SO_BPF_EXTENSIONS:
1470 v.val = bpf_tell_extensions();
1471 break;
1472
1473 case SO_SELECT_ERR_QUEUE:
1474 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1475 break;
1476
1477 #ifdef CONFIG_NET_RX_BUSY_POLL
1478 case SO_BUSY_POLL:
1479 v.val = READ_ONCE(sk->sk_ll_usec);
1480 break;
1481 #endif
1482
1483 case SO_MAX_PACING_RATE:
1484 /* This READ_ONCE() pairs with the WRITE_ONCE() in sock_setsockopt() */
1485 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1486 lv = sizeof(v.ulval);
1487 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1488 } else {
1489 /* 32bit version */
1490 v.val = min_t(unsigned long, ~0U,
1491 READ_ONCE(sk->sk_max_pacing_rate));
1492 }
1493 break;
1494
1495 case SO_INCOMING_CPU:
1496 v.val = READ_ONCE(sk->sk_incoming_cpu);
1497 break;
1498
1499 case SO_MEMINFO:
1500 {
1501 u32 meminfo[SK_MEMINFO_VARS];
1502
1503 sk_get_meminfo(sk, meminfo);
1504
1505 len = min_t(unsigned int, len, sizeof(meminfo));
1506 if (copy_to_user(optval, &meminfo, len))
1507 return -EFAULT;
1508
1509 goto lenout;
1510 }
1511
1512 #ifdef CONFIG_NET_RX_BUSY_POLL
1513 case SO_INCOMING_NAPI_ID:
1514 v.val = READ_ONCE(sk->sk_napi_id);
1515
1516 /* aggregate non-NAPI IDs down to 0 */
1517 if (v.val < MIN_NAPI_ID)
1518 v.val = 0;
1519
1520 break;
1521 #endif
1522
1523 case SO_COOKIE:
1524 lv = sizeof(u64);
1525 if (len < lv)
1526 return -EINVAL;
1527 v.val64 = sock_gen_cookie(sk);
1528 break;
1529
1530 case SO_ZEROCOPY:
1531 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1532 break;
1533
1534 case SO_TXTIME:
1535 lv = sizeof(v.txtime);
1536 v.txtime.clockid = sk->sk_clockid;
1537 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1538 SOF_TXTIME_DEADLINE_MODE : 0;
1539 v.txtime.flags |= sk->sk_txtime_report_errors ?
1540 SOF_TXTIME_REPORT_ERRORS : 0;
1541 break;
1542
1543 case SO_BINDTOIFINDEX:
1544 v.val = sk->sk_bound_dev_if;
1545 break;
1546
1547 default:
1548 /* We implement the SO_SNDLOWAT etc to not be settable
1549 * (1003.1g 7).
1550 */
1551 return -ENOPROTOOPT;
1552 }
1553
1554 if (len > lv)
1555 len = lv;
1556 if (copy_to_user(optval, &v, len))
1557 return -EFAULT;
1558 lenout:
1559 if (put_user(len, optlen))
1560 return -EFAULT;
1561 return 0;
1562 }
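/*
 * User-space sketch of the SO_PEERCRED path above (connected AF_UNIX
 * sockets; illustrative only):
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("peer pid=%d uid=%d gid=%d\n",
 *		       peer.pid, peer.uid, peer.gid);
 *
 * The pid/uid/gid are translated into the caller's pid and user
 * namespaces by cred_to_ucred() before being copied out.
 */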
1563
1564 /*
1565 * Initialize an sk_lock.
1566 *
1567 * (We also register the sk_lock with the lock validator.)
1568 */
1569 static inline void sock_lock_init(struct sock *sk)
1570 {
1571 if (sk->sk_kern_sock)
1572 sock_lock_init_class_and_name(
1573 sk,
1574 af_family_kern_slock_key_strings[sk->sk_family],
1575 af_family_kern_slock_keys + sk->sk_family,
1576 af_family_kern_key_strings[sk->sk_family],
1577 af_family_kern_keys + sk->sk_family);
1578 else
1579 sock_lock_init_class_and_name(
1580 sk,
1581 af_family_slock_key_strings[sk->sk_family],
1582 af_family_slock_keys + sk->sk_family,
1583 af_family_key_strings[sk->sk_family],
1584 af_family_keys + sk->sk_family);
1585 }
1586
1587 /*
1588 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1589 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1590 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1591 */
1592 static void sock_copy(struct sock *nsk, const struct sock *osk)
1593 {
1594 #ifdef CONFIG_SECURITY_NETWORK
1595 void *sptr = nsk->sk_security;
1596 #endif
1597 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1598
1599 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1600 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1601
1602 #ifdef CONFIG_SECURITY_NETWORK
1603 nsk->sk_security = sptr;
1604 security_sk_clone(osk, nsk);
1605 #endif
1606 }
1607
1608 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1609 int family)
1610 {
1611 struct sock *sk;
1612 struct kmem_cache *slab;
1613
1614 slab = prot->slab;
1615 if (slab != NULL) {
1616 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1617 if (!sk)
1618 return sk;
1619 if (want_init_on_alloc(priority))
1620 sk_prot_clear_nulls(sk, prot->obj_size);
1621 } else
1622 sk = kmalloc(prot->obj_size, priority);
1623
1624 if (sk != NULL) {
1625 if (security_sk_alloc(sk, family, priority))
1626 goto out_free;
1627
1628 trace_android_rvh_sk_alloc(sk);
1629
1630 if (!try_module_get(prot->owner))
1631 goto out_free_sec;
1632 sk_tx_queue_clear(sk);
1633 }
1634
1635 return sk;
1636
1637 out_free_sec:
1638 security_sk_free(sk);
1639 trace_android_rvh_sk_free(sk);
1640 out_free:
1641 if (slab != NULL)
1642 kmem_cache_free(slab, sk);
1643 else
1644 kfree(sk);
1645 return NULL;
1646 }
1647
1648 static void sk_prot_free(struct proto *prot, struct sock *sk)
1649 {
1650 struct kmem_cache *slab;
1651 struct module *owner;
1652
1653 owner = prot->owner;
1654 slab = prot->slab;
1655
1656 cgroup_sk_free(&sk->sk_cgrp_data);
1657 mem_cgroup_sk_free(sk);
1658 security_sk_free(sk);
1659 trace_android_rvh_sk_free(sk);
1660 if (slab != NULL)
1661 kmem_cache_free(slab, sk);
1662 else
1663 kfree(sk);
1664 module_put(owner);
1665 }
1666
1667 /**
1668 * sk_alloc - All socket objects are allocated here
1669 * @net: the applicable net namespace
1670 * @family: protocol family
1671 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1672 * @prot: struct proto associated with this new sock instance
1673 * @kern: is this to be a kernel socket?
1674 */
1675 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1676 struct proto *prot, int kern)
1677 {
1678 struct sock *sk;
1679
1680 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1681 if (sk) {
1682 sk->sk_family = family;
1683 /*
1684 * See comment in struct sock definition to understand
1685 * why we need sk_prot_creator -acme
1686 */
1687 sk->sk_prot = sk->sk_prot_creator = prot;
1688 sk->sk_kern_sock = kern;
1689 sock_lock_init(sk);
1690 sk->sk_net_refcnt = kern ? 0 : 1;
1691 if (likely(sk->sk_net_refcnt)) {
1692 get_net(net);
1693 sock_inuse_add(net, 1);
1694 }
1695
1696 sock_net_set(sk, net);
1697 refcount_set(&sk->sk_wmem_alloc, 1);
1698
1699 mem_cgroup_sk_alloc(sk);
1700 cgroup_sk_alloc(&sk->sk_cgrp_data);
1701 sock_update_classid(&sk->sk_cgrp_data);
1702 sock_update_netprioidx(&sk->sk_cgrp_data);
1703 sk_tx_queue_clear(sk);
1704 }
1705
1706 return sk;
1707 }
1708 EXPORT_SYMBOL(sk_alloc);
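/*
 * Usage sketch (illustrative): an address family's ->create() handler
 * typically allocates and initializes its sock along these lines, where
 * PF_EXAMPLE, example_proto and example_sock_destruct are placeholders
 * for the family's own definitions:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_destruct = example_sock_destruct;
 *
 * sock_init_data() is the generic second stage defined later in this
 * file.
 */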
1709
1710 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1711 * grace period. This is the case for UDP sockets and TCP listeners.
1712 */
1713 static void __sk_destruct(struct rcu_head *head)
1714 {
1715 struct sock *sk = container_of(head, struct sock, sk_rcu);
1716 struct sk_filter *filter;
1717
1718 if (sk->sk_destruct)
1719 sk->sk_destruct(sk);
1720
1721 filter = rcu_dereference_check(sk->sk_filter,
1722 refcount_read(&sk->sk_wmem_alloc) == 0);
1723 if (filter) {
1724 sk_filter_uncharge(sk, filter);
1725 RCU_INIT_POINTER(sk->sk_filter, NULL);
1726 }
1727
1728 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1729
1730 #ifdef CONFIG_BPF_SYSCALL
1731 bpf_sk_storage_free(sk);
1732 #endif
1733
1734 if (atomic_read(&sk->sk_omem_alloc))
1735 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1736 __func__, atomic_read(&sk->sk_omem_alloc));
1737
1738 if (sk->sk_frag.page) {
1739 put_page(sk->sk_frag.page);
1740 sk->sk_frag.page = NULL;
1741 }
1742
1743 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1744 put_cred(sk->sk_peer_cred);
1745 put_pid(sk->sk_peer_pid);
1746
1747 if (likely(sk->sk_net_refcnt))
1748 put_net(sock_net(sk));
1749 sk_prot_free(sk->sk_prot_creator, sk);
1750 }
1751
1752 void sk_destruct(struct sock *sk)
1753 {
1754 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1755
1756 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1757 reuseport_detach_sock(sk);
1758 use_call_rcu = true;
1759 }
1760
1761 if (use_call_rcu)
1762 call_rcu(&sk->sk_rcu, __sk_destruct);
1763 else
1764 __sk_destruct(&sk->sk_rcu);
1765 }
1766
1767 static void __sk_free(struct sock *sk)
1768 {
1769 if (likely(sk->sk_net_refcnt))
1770 sock_inuse_add(sock_net(sk), -1);
1771
1772 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1773 sock_diag_broadcast_destroy(sk);
1774 else
1775 sk_destruct(sk);
1776 }
1777
1778 void sk_free(struct sock *sk)
1779 {
1780 /*
1781 * We subtract one from sk_wmem_alloc and can know if
1782 * some packets are still in some tx queue.
1783 * If not null, sock_wfree() will call __sk_free(sk) later
1784 */
1785 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1786 __sk_free(sk);
1787 }
1788 EXPORT_SYMBOL(sk_free);
1789
1790 static void sk_init_common(struct sock *sk)
1791 {
1792 skb_queue_head_init(&sk->sk_receive_queue);
1793 skb_queue_head_init(&sk->sk_write_queue);
1794 skb_queue_head_init(&sk->sk_error_queue);
1795
1796 rwlock_init(&sk->sk_callback_lock);
1797 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1798 af_rlock_keys + sk->sk_family,
1799 af_family_rlock_key_strings[sk->sk_family]);
1800 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1801 af_wlock_keys + sk->sk_family,
1802 af_family_wlock_key_strings[sk->sk_family]);
1803 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1804 af_elock_keys + sk->sk_family,
1805 af_family_elock_key_strings[sk->sk_family]);
1806 lockdep_set_class_and_name(&sk->sk_callback_lock,
1807 af_callback_keys + sk->sk_family,
1808 af_family_clock_key_strings[sk->sk_family]);
1809 }
1810
1811 /**
1812 * sk_clone_lock - clone a socket, and lock its clone
1813 * @sk: the socket to clone
1814 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1815 *
1816 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1817 */
1818 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1819 {
1820 struct sock *newsk;
1821 bool is_charged = true;
1822
1823 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1824 if (newsk != NULL) {
1825 struct sk_filter *filter;
1826
1827 sock_copy(newsk, sk);
1828
1829 newsk->sk_prot_creator = sk->sk_prot;
1830
1831 /* SANITY */
1832 if (likely(newsk->sk_net_refcnt))
1833 get_net(sock_net(newsk));
1834 sk_node_init(&newsk->sk_node);
1835 sock_lock_init(newsk);
1836 bh_lock_sock(newsk);
1837 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1838 newsk->sk_backlog.len = 0;
1839
1840 atomic_set(&newsk->sk_rmem_alloc, 0);
1841 /*
1842 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1843 */
1844 refcount_set(&newsk->sk_wmem_alloc, 1);
1845 atomic_set(&newsk->sk_omem_alloc, 0);
1846 sk_init_common(newsk);
1847
1848 newsk->sk_dst_cache = NULL;
1849 newsk->sk_dst_pending_confirm = 0;
1850 newsk->sk_wmem_queued = 0;
1851 newsk->sk_forward_alloc = 0;
1852 atomic_set(&newsk->sk_drops, 0);
1853 newsk->sk_send_head = NULL;
1854 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1855 atomic_set(&newsk->sk_zckey, 0);
1856
1857 sock_reset_flag(newsk, SOCK_DONE);
1858
1859 /* sk->sk_memcg will be populated at accept() time */
1860 newsk->sk_memcg = NULL;
1861
1862 cgroup_sk_clone(&newsk->sk_cgrp_data);
1863
1864 rcu_read_lock();
1865 filter = rcu_dereference(sk->sk_filter);
1866 if (filter != NULL)
1867 /* though it's an empty new sock, the charging may fail
1868 * if sysctl_optmem_max was changed between creation of
1869 * original socket and cloning
1870 */
1871 is_charged = sk_filter_charge(newsk, filter);
1872 RCU_INIT_POINTER(newsk->sk_filter, filter);
1873 rcu_read_unlock();
1874
1875 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1876 /* We need to make sure that we don't uncharge the new
1877 * socket if we couldn't charge it in the first place
1878 * as otherwise we uncharge the parent's filter.
1879 */
1880 if (!is_charged)
1881 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1882 sk_free_unlock_clone(newsk);
1883 newsk = NULL;
1884 goto out;
1885 }
1886 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1887
1888 if (bpf_sk_storage_clone(sk, newsk)) {
1889 sk_free_unlock_clone(newsk);
1890 newsk = NULL;
1891 goto out;
1892 }
1893
1894 newsk->sk_err = 0;
1895 newsk->sk_err_soft = 0;
1896 newsk->sk_priority = 0;
1897 newsk->sk_incoming_cpu = raw_smp_processor_id();
1898 if (likely(newsk->sk_net_refcnt))
1899 sock_inuse_add(sock_net(newsk), 1);
1900
1901 /*
1902 * Before updating sk_refcnt, we must commit prior changes to memory
1903 * (Documentation/RCU/rculist_nulls.txt for details)
1904 */
1905 smp_wmb();
1906 refcount_set(&newsk->sk_refcnt, 2);
1907
1908 /*
1909 * Increment the counter in the same struct proto as the master
1910 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1911 * is the same as sk->sk_prot->socks, as this field was copied
1912 * with memcpy).
1913 *
1914 * This _changes_ the previous behaviour, where
1915 * tcp_create_openreq_child was always incrementing the
1916 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1917 * to be taken into account in all callers. -acme
1918 */
1919 sk_refcnt_debug_inc(newsk);
1920 sk_set_socket(newsk, NULL);
1921 sk_tx_queue_clear(newsk);
1922 RCU_INIT_POINTER(newsk->sk_wq, NULL);
1923
1924 if (newsk->sk_prot->sockets_allocated)
1925 sk_sockets_allocated_inc(newsk);
1926
1927 if (sock_needs_netstamp(sk) &&
1928 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1929 net_enable_timestamp();
1930 }
1931 out:
1932 return newsk;
1933 }
1934 EXPORT_SYMBOL_GPL(sk_clone_lock);
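/*
 * Usage sketch (illustrative): connection-oriented protocols clone the
 * listener when a child socket is created; example_init_child() below
 * stands in for the protocol's own child setup:
 *
 *	newsk = sk_clone_lock(lsk, GFP_ATOMIC);
 *	if (newsk) {
 *		example_init_child(newsk);
 *		bh_unlock_sock(newsk);
 *	}
 *
 * As the kernel-doc above notes, the clone comes back locked and the
 * caller must unlock it even on its own error paths (see for instance
 * the tcp_create_openreq_child() call chain).
 */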
1935
1936 void sk_free_unlock_clone(struct sock *sk)
1937 {
1938 /* It is still a raw copy of the parent, so invalidate
1939 * the destructor and do a plain sk_free(). */
1940 sk->sk_destruct = NULL;
1941 bh_unlock_sock(sk);
1942 sk_free(sk);
1943 }
1944 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1945
1946 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1947 {
1948 u32 max_segs = 1;
1949
1950 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1951 if (sk->sk_route_caps & NETIF_F_GSO)
1952 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1953 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1954 if (sk_can_gso(sk)) {
1955 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1956 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1957 } else {
1958 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1959 sk->sk_gso_max_size = dst->dev->gso_max_size;
1960 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1961 }
1962 }
1963 sk->sk_gso_max_segs = max_segs;
1964 sk_dst_set(sk, dst);
1965 }
1966 EXPORT_SYMBOL_GPL(sk_setup_caps);
1967
1968 /*
1969 * Simple resource managers for sockets.
1970 */
1971
1972
1973 /*
1974 * Write buffer destructor automatically called from kfree_skb.
1975 */
1976 void sock_wfree(struct sk_buff *skb)
1977 {
1978 struct sock *sk = skb->sk;
1979 unsigned int len = skb->truesize;
1980
1981 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1982 /*
1983 * Keep a reference on sk_wmem_alloc; it will be released
1984 * after the sk_write_space() call.
1985 */
1986 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1987 sk->sk_write_space(sk);
1988 len = 1;
1989 }
1990 /*
1991 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1992 * could not do because of in-flight packets
1993 */
1994 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1995 __sk_free(sk);
1996 }
1997 EXPORT_SYMBOL(sock_wfree);
1998
1999 /* This variant of sock_wfree() is used by TCP,
2000 * since it sets SOCK_USE_WRITE_QUEUE.
2001 */
2002 void __sock_wfree(struct sk_buff *skb)
2003 {
2004 struct sock *sk = skb->sk;
2005
2006 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2007 __sk_free(sk);
2008 }
2009
2010 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2011 {
2012 skb_orphan(skb);
2013 skb->sk = sk;
2014 #ifdef CONFIG_INET
2015 if (unlikely(!sk_fullsock(sk))) {
2016 skb->destructor = sock_edemux;
2017 sock_hold(sk);
2018 return;
2019 }
2020 #endif
2021 skb->destructor = sock_wfree;
2022 skb_set_hash_from_sk(skb, sk);
2023 /*
2024 * We used to take a refcount on sk, but the following operation
2025 * is enough to guarantee sk_free() won't free this sock until
2026 * all in-flight packets are completed
2027 */
2028 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2029 }
2030 EXPORT_SYMBOL(skb_set_owner_w);
2031
2032 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2033 {
2034 #ifdef CONFIG_TLS_DEVICE
2035 /* Drivers depend on in-order delivery for crypto offload,
2036 * partial orphan breaks out-of-order-OK logic.
2037 */
2038 if (skb->decrypted)
2039 return false;
2040 #endif
2041 return (skb->destructor == sock_wfree ||
2042 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2043 }
2044
2045 /* This helper is used by netem, as it can hold packets in its
2046 * delay queue. We want to allow the owner socket to send more
2047 * packets, as if they were already TX completed by a typical driver.
2048 * But we also want to keep skb->sk set because some packet schedulers
2049 * rely on it (sch_fq for example).
2050 */
2051 void skb_orphan_partial(struct sk_buff *skb)
2052 {
2053 if (skb_is_tcp_pure_ack(skb))
2054 return;
2055
2056 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2057 return;
2058
2059 skb_orphan(skb);
2060 }
2061 EXPORT_SYMBOL(skb_orphan_partial);
2062
2063 /*
2064 * Read buffer destructor automatically called from kfree_skb.
2065 */
2066 void sock_rfree(struct sk_buff *skb)
2067 {
2068 struct sock *sk = skb->sk;
2069 unsigned int len = skb->truesize;
2070
2071 atomic_sub(len, &sk->sk_rmem_alloc);
2072 sk_mem_uncharge(sk, len);
2073 }
2074 EXPORT_SYMBOL(sock_rfree);
2075
2076 /*
2077 * Buffer destructor for skbs that are not used directly in read or write
2078 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2079 */
2080 void sock_efree(struct sk_buff *skb)
2081 {
2082 sock_put(skb->sk);
2083 }
2084 EXPORT_SYMBOL(sock_efree);
2085
2086 kuid_t sock_i_uid(struct sock *sk)
2087 {
2088 kuid_t uid;
2089
2090 read_lock_bh(&sk->sk_callback_lock);
2091 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2092 read_unlock_bh(&sk->sk_callback_lock);
2093 return uid;
2094 }
2095 EXPORT_SYMBOL(sock_i_uid);
2096
2097 unsigned long __sock_i_ino(struct sock *sk)
2098 {
2099 unsigned long ino;
2100
2101 read_lock(&sk->sk_callback_lock);
2102 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2103 read_unlock(&sk->sk_callback_lock);
2104 return ino;
2105 }
2106 EXPORT_SYMBOL(__sock_i_ino);
2107
2108 unsigned long sock_i_ino(struct sock *sk)
2109 {
2110 unsigned long ino;
2111
2112 local_bh_disable();
2113 ino = __sock_i_ino(sk);
2114 local_bh_enable();
2115 return ino;
2116 }
2117 EXPORT_SYMBOL(sock_i_ino);
2118
2119 /*
2120 * Allocate a skb from the socket's send buffer.
2121 */
2122 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2123 gfp_t priority)
2124 {
2125 if (force ||
2126 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2127 struct sk_buff *skb = alloc_skb(size, priority);
2128
2129 if (skb) {
2130 skb_set_owner_w(skb, sk);
2131 return skb;
2132 }
2133 }
2134 return NULL;
2135 }
2136 EXPORT_SYMBOL(sock_wmalloc);
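/* Usage sketch (illustrative, not part of this file): callers that build
 * small control frames charge them to the send buffer, forcing the
 * allocation only when failure is not an option:
 *
 *	skb = sock_wmalloc(sk, MAX_HEADER + len, 0, GFP_ATOMIC);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_reserve(skb, MAX_HEADER);
 */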
2137
2138 static void sock_ofree(struct sk_buff *skb)
2139 {
2140 struct sock *sk = skb->sk;
2141
2142 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2143 }
2144
2145 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2146 gfp_t priority)
2147 {
2148 struct sk_buff *skb;
2149
2150 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2151 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2152 sysctl_optmem_max)
2153 return NULL;
2154
2155 skb = alloc_skb(size, priority);
2156 if (!skb)
2157 return NULL;
2158
2159 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2160 skb->sk = sk;
2161 skb->destructor = sock_ofree;
2162 return skb;
2163 }
2164
2165 /*
2166 * Allocate a memory block from the socket's option memory buffer.
2167 */
2168 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2169 {
2170 if ((unsigned int)size <= sysctl_optmem_max &&
2171 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2172 void *mem;
2173 /* First do the add, to avoid the race if kmalloc
2174 * might sleep.
2175 */
2176 atomic_add(size, &sk->sk_omem_alloc);
2177 mem = kmalloc(size, priority);
2178 if (mem)
2179 return mem;
2180 atomic_sub(size, &sk->sk_omem_alloc);
2181 }
2182 return NULL;
2183 }
2184 EXPORT_SYMBOL(sock_kmalloc);
2185
2186 /* Free an option memory block. Note, we actually want the inline
2187 * here as this allows gcc to detect the nullify and fold away the
2188 * condition entirely.
2189 */
2190 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2191 const bool nullify)
2192 {
2193 if (WARN_ON_ONCE(!mem))
2194 return;
2195 if (nullify)
2196 kzfree(mem);
2197 else
2198 kfree(mem);
2199 atomic_sub(size, &sk->sk_omem_alloc);
2200 }
2201
2202 void sock_kfree_s(struct sock *sk, void *mem, int size)
2203 {
2204 __sock_kfree_s(sk, mem, size, false);
2205 }
2206 EXPORT_SYMBOL(sock_kfree_s);
2207
2208 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2209 {
2210 __sock_kfree_s(sk, mem, size, true);
2211 }
2212 EXPORT_SYMBOL(sock_kzfree_s);
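/* Usage sketch (illustrative, not part of this file): option memory is
 * charged to sk_omem_alloc, so every sock_kmalloc() must be paired with a
 * sock_kfree_s()/sock_kzfree_s() of the same size; the zeroing variant is
 * preferred when the buffer held key material:
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */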
2213
2214 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2215 I think these locks should be removed for datagram sockets.
2216 */
2217 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2218 {
2219 DEFINE_WAIT(wait);
2220
2221 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2222 for (;;) {
2223 if (!timeo)
2224 break;
2225 if (signal_pending(current))
2226 break;
2227 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2228 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2229 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2230 break;
2231 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2232 break;
2233 if (READ_ONCE(sk->sk_err))
2234 break;
2235 timeo = schedule_timeout(timeo);
2236 }
2237 finish_wait(sk_sleep(sk), &wait);
2238 return timeo;
2239 }
2240
2241
2242 /*
2243 * Generic send/receive buffer handlers
2244 */
2245
2246 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2247 unsigned long data_len, int noblock,
2248 int *errcode, int max_page_order)
2249 {
2250 struct sk_buff *skb;
2251 long timeo;
2252 int err;
2253
2254 timeo = sock_sndtimeo(sk, noblock);
2255 for (;;) {
2256 err = sock_error(sk);
2257 if (err != 0)
2258 goto failure;
2259
2260 err = -EPIPE;
2261 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2262 goto failure;
2263
2264 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2265 break;
2266
2267 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2268 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2269 err = -EAGAIN;
2270 if (!timeo)
2271 goto failure;
2272 if (signal_pending(current))
2273 goto interrupted;
2274 timeo = sock_wait_for_wmem(sk, timeo);
2275 }
2276 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2277 errcode, sk->sk_allocation);
2278 if (skb)
2279 skb_set_owner_w(skb, sk);
2280 return skb;
2281
2282 interrupted:
2283 err = sock_intr_errno(timeo);
2284 failure:
2285 *errcode = err;
2286 return NULL;
2287 }
2288 EXPORT_SYMBOL(sock_alloc_send_pskb);
2289
2290 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2291 int noblock, int *errcode)
2292 {
2293 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2294 }
2295 EXPORT_SYMBOL(sock_alloc_send_skb);
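/* Usage sketch (illustrative, not part of this file): datagram sendmsg
 * implementations usually let this helper do the send-buffer accounting
 * and the blocking/timeout handling:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;
 *	skb_reserve(skb, hlen);
 */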
2296
2297 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2298 struct sockcm_cookie *sockc)
2299 {
2300 u32 tsflags;
2301
2302 switch (cmsg->cmsg_type) {
2303 case SO_MARK:
2304 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2305 return -EPERM;
2306 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2307 return -EINVAL;
2308 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2309 break;
2310 case SO_TIMESTAMPING_OLD:
2311 case SO_TIMESTAMPING_NEW:
2312 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2313 return -EINVAL;
2314
2315 tsflags = *(u32 *)CMSG_DATA(cmsg);
2316 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2317 return -EINVAL;
2318
2319 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2320 sockc->tsflags |= tsflags;
2321 break;
2322 case SCM_TXTIME:
2323 if (!sock_flag(sk, SOCK_TXTIME))
2324 return -EINVAL;
2325 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2326 return -EINVAL;
2327 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2328 break;
2329 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2330 case SCM_RIGHTS:
2331 case SCM_CREDENTIALS:
2332 break;
2333 default:
2334 return -EINVAL;
2335 }
2336 return 0;
2337 }
2338 EXPORT_SYMBOL(__sock_cmsg_send);
2339
2340 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2341 struct sockcm_cookie *sockc)
2342 {
2343 struct cmsghdr *cmsg;
2344 int ret;
2345
2346 for_each_cmsghdr(cmsg, msg) {
2347 if (!CMSG_OK(msg, cmsg))
2348 return -EINVAL;
2349 if (cmsg->cmsg_level != SOL_SOCKET)
2350 continue;
2351 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2352 if (ret)
2353 return ret;
2354 }
2355 return 0;
2356 }
2357 EXPORT_SYMBOL(sock_cmsg_send);
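/* Usage sketch (illustrative, not part of this file): a sendmsg path seeds
 * a sockcm_cookie from the socket defaults and then lets SOL_SOCKET control
 * messages override it:
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */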
2358
2359 static void sk_enter_memory_pressure(struct sock *sk)
2360 {
2361 if (!sk->sk_prot->enter_memory_pressure)
2362 return;
2363
2364 sk->sk_prot->enter_memory_pressure(sk);
2365 }
2366
2367 static void sk_leave_memory_pressure(struct sock *sk)
2368 {
2369 if (sk->sk_prot->leave_memory_pressure) {
2370 sk->sk_prot->leave_memory_pressure(sk);
2371 } else {
2372 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2373
2374 if (memory_pressure && READ_ONCE(*memory_pressure))
2375 WRITE_ONCE(*memory_pressure, 0);
2376 }
2377 }
2378
2379 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2380
2381 /**
2382 * skb_page_frag_refill - check that a page_frag contains enough room
2383 * @sz: minimum size of the fragment we want to get
2384 * @pfrag: pointer to page_frag
2385 * @gfp: priority for memory allocation
2386 *
2387 * Note: While this allocator tries to use high order pages, there is
2388 * no guarantee that allocations succeed. Therefore, @sz MUST be
2389 * less than or equal to PAGE_SIZE.
2390 */
2391 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2392 {
2393 if (pfrag->page) {
2394 if (page_ref_count(pfrag->page) == 1) {
2395 pfrag->offset = 0;
2396 return true;
2397 }
2398 if (pfrag->offset + sz <= pfrag->size)
2399 return true;
2400 put_page(pfrag->page);
2401 }
2402
2403 pfrag->offset = 0;
2404 if (SKB_FRAG_PAGE_ORDER &&
2405 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2406 /* Avoid direct reclaim but allow kswapd to wake */
2407 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2408 __GFP_COMP | __GFP_NOWARN |
2409 __GFP_NORETRY,
2410 SKB_FRAG_PAGE_ORDER);
2411 if (likely(pfrag->page)) {
2412 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2413 return true;
2414 }
2415 }
2416 pfrag->page = alloc_page(gfp);
2417 if (likely(pfrag->page)) {
2418 pfrag->size = PAGE_SIZE;
2419 return true;
2420 }
2421 return false;
2422 }
2423 EXPORT_SYMBOL(skb_page_frag_refill);
2424
2425 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2426 {
2427 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2428 return true;
2429
2430 sk_enter_memory_pressure(sk);
2431 sk_stream_moderate_sndbuf(sk);
2432 return false;
2433 }
2434 EXPORT_SYMBOL(sk_page_frag_refill);
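/* Usage sketch (illustrative, not part of this file): stream protocols
 * append user data to the per-socket page fragment once room has been
 * guaranteed, copying into pfrag->page at pfrag->offset and then advancing
 * the offset:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	pfrag->offset += copy;
 */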
2435
2436 static void __lock_sock(struct sock *sk)
2437 __releases(&sk->sk_lock.slock)
2438 __acquires(&sk->sk_lock.slock)
2439 {
2440 DEFINE_WAIT(wait);
2441
2442 for (;;) {
2443 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2444 TASK_UNINTERRUPTIBLE);
2445 spin_unlock_bh(&sk->sk_lock.slock);
2446 schedule();
2447 spin_lock_bh(&sk->sk_lock.slock);
2448 if (!sock_owned_by_user(sk))
2449 break;
2450 }
2451 finish_wait(&sk->sk_lock.wq, &wait);
2452 }
2453
2454 void __release_sock(struct sock *sk)
2455 __releases(&sk->sk_lock.slock)
2456 __acquires(&sk->sk_lock.slock)
2457 {
2458 struct sk_buff *skb, *next;
2459
2460 while ((skb = sk->sk_backlog.head) != NULL) {
2461 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2462
2463 spin_unlock_bh(&sk->sk_lock.slock);
2464
2465 do {
2466 next = skb->next;
2467 prefetch(next);
2468 WARN_ON_ONCE(skb_dst_is_noref(skb));
2469 skb_mark_not_on_list(skb);
2470 sk_backlog_rcv(sk, skb);
2471
2472 cond_resched();
2473
2474 skb = next;
2475 } while (skb != NULL);
2476
2477 spin_lock_bh(&sk->sk_lock.slock);
2478 }
2479
2480 /*
2481 * Doing the zeroing here guarantees we cannot loop forever
2482 * while a wild producer attempts to flood us.
2483 */
2484 sk->sk_backlog.len = 0;
2485 }
2486
2487 void __sk_flush_backlog(struct sock *sk)
2488 {
2489 spin_lock_bh(&sk->sk_lock.slock);
2490 __release_sock(sk);
2491 spin_unlock_bh(&sk->sk_lock.slock);
2492 }
2493
2494 /**
2495 * sk_wait_data - wait for data to arrive at sk_receive_queue
2496 * @sk: sock to wait on
2497 * @timeo: for how long
2498 * @skb: last skb seen on sk_receive_queue
2499 *
2500 * Socket state, including sk->sk_err, is changed only under the socket lock,
2501 * hence we may omit checks after joining the wait queue.
2502 * We check the receive queue before schedule() only as an optimization;
2503 * it is very likely that release_sock() added new data.
2504 */
2505 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2506 {
2507 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2508 int rc;
2509
2510 add_wait_queue(sk_sleep(sk), &wait);
2511 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2512 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2513 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2514 remove_wait_queue(sk_sleep(sk), &wait);
2515 return rc;
2516 }
2517 EXPORT_SYMBOL(sk_wait_data);
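/* Usage sketch (illustrative, not part of this file): a blocking recvmsg
 * path calls sk_wait_data() with the socket lock held; sk_wait_event()
 * releases and re-takes the lock around schedule():
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */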
2518
2519 /**
2520 * __sk_mem_raise_allocated - increase memory_allocated
2521 * @sk: socket
2522 * @size: memory size to allocate
2523 * @amt: pages to allocate
2524 * @kind: allocation type
2525 *
2526 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2527 */
2528 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2529 {
2530 struct proto *prot = sk->sk_prot;
2531 long allocated = sk_memory_allocated_add(sk, amt);
2532 bool charged = true;
2533
2534 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2535 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2536 goto suppress_allocation;
2537
2538 /* Under limit. */
2539 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2540 sk_leave_memory_pressure(sk);
2541 return 1;
2542 }
2543
2544 /* Under pressure. */
2545 if (allocated > sk_prot_mem_limits(sk, 1))
2546 sk_enter_memory_pressure(sk);
2547
2548 /* Over hard limit. */
2549 if (allocated > sk_prot_mem_limits(sk, 2))
2550 goto suppress_allocation;
2551
2552 /* guarantee minimum buffer size under pressure */
2553 if (kind == SK_MEM_RECV) {
2554 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2555 return 1;
2556
2557 } else { /* SK_MEM_SEND */
2558 int wmem0 = sk_get_wmem0(sk, prot);
2559
2560 if (sk->sk_type == SOCK_STREAM) {
2561 if (sk->sk_wmem_queued < wmem0)
2562 return 1;
2563 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2564 return 1;
2565 }
2566 }
2567
2568 if (sk_has_memory_pressure(sk)) {
2569 u64 alloc;
2570
2571 if (!sk_under_memory_pressure(sk))
2572 return 1;
2573 alloc = sk_sockets_allocated_read_positive(sk);
2574 if (sk_prot_mem_limits(sk, 2) > alloc *
2575 sk_mem_pages(sk->sk_wmem_queued +
2576 atomic_read(&sk->sk_rmem_alloc) +
2577 sk->sk_forward_alloc))
2578 return 1;
2579 }
2580
2581 suppress_allocation:
2582
2583 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2584 sk_stream_moderate_sndbuf(sk);
2585
2586 /* Fail only if socket is _under_ its sndbuf.
2587 * In this case we cannot block, so we have to fail.
2588 */
2589 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2590 return 1;
2591 }
2592
2593 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2594 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2595
2596 sk_memory_allocated_sub(sk, amt);
2597
2598 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2599 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2600
2601 return 0;
2602 }
2603 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2604
2605 /**
2606 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2607 * @sk: socket
2608 * @size: memory size to allocate
2609 * @kind: allocation type
2610 *
2611 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2612 * rmem allocation. This function assumes that protocols which have
2613 * memory_pressure use sk_wmem_queued as write buffer accounting.
2614 */
2615 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2616 {
2617 int ret, amt = sk_mem_pages(size);
2618
2619 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2620 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2621 if (!ret)
2622 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2623 return ret;
2624 }
2625 EXPORT_SYMBOL(__sk_mem_schedule);
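/* Usage sketch (illustrative, not part of this file): protocols normally go
 * through the sk_rmem_schedule()/sk_wmem_schedule() wrappers, which only
 * fall back to __sk_mem_schedule() once sk_forward_alloc runs out:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */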
2626
2627 /**
2628 * __sk_mem_reduce_allocated - reclaim memory_allocated
2629 * @sk: socket
2630 * @amount: number of quanta
2631 *
2632 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2633 */
2634 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2635 {
2636 sk_memory_allocated_sub(sk, amount);
2637
2638 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2639 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2640
2641 if (sk_under_global_memory_pressure(sk) &&
2642 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2643 sk_leave_memory_pressure(sk);
2644 }
2645 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2646
2647 /**
2648 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2649 * @sk: socket
2650 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2651 */
2652 void __sk_mem_reclaim(struct sock *sk, int amount)
2653 {
2654 amount >>= SK_MEM_QUANTUM_SHIFT;
2655 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2656 __sk_mem_reduce_allocated(sk, amount);
2657 }
2658 EXPORT_SYMBOL(__sk_mem_reclaim);
2659
2660 int sk_set_peek_off(struct sock *sk, int val)
2661 {
2662 WRITE_ONCE(sk->sk_peek_off, val);
2663 return 0;
2664 }
2665 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2666
2667 /*
2668 * Set of default routines for initialising struct proto_ops when
2669 * the protocol does not support a particular function. In certain
2670 * cases where it makes no sense for a protocol to have a "do nothing"
2671 * function, some default processing is provided.
2672 */
2673
2674 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2675 {
2676 return -EOPNOTSUPP;
2677 }
2678 EXPORT_SYMBOL(sock_no_bind);
2679
2680 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2681 int len, int flags)
2682 {
2683 return -EOPNOTSUPP;
2684 }
2685 EXPORT_SYMBOL(sock_no_connect);
2686
2687 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2688 {
2689 return -EOPNOTSUPP;
2690 }
2691 EXPORT_SYMBOL(sock_no_socketpair);
2692
2693 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2694 bool kern)
2695 {
2696 return -EOPNOTSUPP;
2697 }
2698 EXPORT_SYMBOL(sock_no_accept);
2699
2700 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2701 int peer)
2702 {
2703 return -EOPNOTSUPP;
2704 }
2705 EXPORT_SYMBOL(sock_no_getname);
2706
2707 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2708 {
2709 return -EOPNOTSUPP;
2710 }
2711 EXPORT_SYMBOL(sock_no_ioctl);
2712
2713 int sock_no_listen(struct socket *sock, int backlog)
2714 {
2715 return -EOPNOTSUPP;
2716 }
2717 EXPORT_SYMBOL(sock_no_listen);
2718
2719 int sock_no_shutdown(struct socket *sock, int how)
2720 {
2721 return -EOPNOTSUPP;
2722 }
2723 EXPORT_SYMBOL(sock_no_shutdown);
2724
2725 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2726 char __user *optval, unsigned int optlen)
2727 {
2728 return -EOPNOTSUPP;
2729 }
2730 EXPORT_SYMBOL(sock_no_setsockopt);
2731
2732 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2733 char __user *optval, int __user *optlen)
2734 {
2735 return -EOPNOTSUPP;
2736 }
2737 EXPORT_SYMBOL(sock_no_getsockopt);
2738
2739 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2740 {
2741 return -EOPNOTSUPP;
2742 }
2743 EXPORT_SYMBOL(sock_no_sendmsg);
2744
2745 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2746 {
2747 return -EOPNOTSUPP;
2748 }
2749 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2750
2751 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2752 int flags)
2753 {
2754 return -EOPNOTSUPP;
2755 }
2756 EXPORT_SYMBOL(sock_no_recvmsg);
2757
2758 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2759 {
2760 /* Mirror missing mmap method error code */
2761 return -ENODEV;
2762 }
2763 EXPORT_SYMBOL(sock_no_mmap);
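/* Usage sketch (illustrative, the "example" names are hypothetical): address
 * families plug these stubs into their proto_ops for operations they do not
 * support, instead of open-coding -EOPNOTSUPP handlers:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *	};
 */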
2764
2765 /*
2766 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2767 * various sock-based usage counts.
2768 */
2769 void __receive_sock(struct file *file)
2770 {
2771 struct socket *sock;
2772 int error;
2773
2774 /*
2775 * The resulting value of "error" is ignored here since we only
2776 * need to take action when the file is a socket and testing
2777 * "sock" for NULL is sufficient.
2778 */
2779 sock = sock_from_file(file, &error);
2780 if (sock) {
2781 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2782 sock_update_classid(&sock->sk->sk_cgrp_data);
2783 }
2784 }
2785
2786 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2787 {
2788 ssize_t res;
2789 struct msghdr msg = {.msg_flags = flags};
2790 struct kvec iov;
2791 char *kaddr = kmap(page);
2792 iov.iov_base = kaddr + offset;
2793 iov.iov_len = size;
2794 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2795 kunmap(page);
2796 return res;
2797 }
2798 EXPORT_SYMBOL(sock_no_sendpage);
2799
2800 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2801 int offset, size_t size, int flags)
2802 {
2803 ssize_t res;
2804 struct msghdr msg = {.msg_flags = flags};
2805 struct kvec iov;
2806 char *kaddr = kmap(page);
2807
2808 iov.iov_base = kaddr + offset;
2809 iov.iov_len = size;
2810 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2811 kunmap(page);
2812 return res;
2813 }
2814 EXPORT_SYMBOL(sock_no_sendpage_locked);
2815
2816 /*
2817 * Default Socket Callbacks
2818 */
2819
2820 static void sock_def_wakeup(struct sock *sk)
2821 {
2822 struct socket_wq *wq;
2823
2824 rcu_read_lock();
2825 wq = rcu_dereference(sk->sk_wq);
2826 if (skwq_has_sleeper(wq))
2827 wake_up_interruptible_all(&wq->wait);
2828 rcu_read_unlock();
2829 }
2830
2831 static void sock_def_error_report(struct sock *sk)
2832 {
2833 struct socket_wq *wq;
2834
2835 rcu_read_lock();
2836 wq = rcu_dereference(sk->sk_wq);
2837 if (skwq_has_sleeper(wq))
2838 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2839 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2840 rcu_read_unlock();
2841 }
2842
2843 static void sock_def_readable(struct sock *sk)
2844 {
2845 struct socket_wq *wq;
2846
2847 rcu_read_lock();
2848 wq = rcu_dereference(sk->sk_wq);
2849 if (skwq_has_sleeper(wq))
2850 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2851 EPOLLRDNORM | EPOLLRDBAND);
2852 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2853 rcu_read_unlock();
2854 }
2855
2856 static void sock_def_write_space(struct sock *sk)
2857 {
2858 struct socket_wq *wq;
2859
2860 rcu_read_lock();
2861
2862 /* Do not wake up a writer until he can make "significant"
2863 * progress. --DaveM
2864 */
2865 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2866 wq = rcu_dereference(sk->sk_wq);
2867 if (skwq_has_sleeper(wq))
2868 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2869 EPOLLWRNORM | EPOLLWRBAND);
2870
2871 /* Should agree with poll, otherwise some programs break */
2872 if (sock_writeable(sk))
2873 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2874 }
2875
2876 rcu_read_unlock();
2877 }
2878
2879 static void sock_def_destruct(struct sock *sk)
2880 {
2881 }
2882
2883 void sk_send_sigurg(struct sock *sk)
2884 {
2885 if (sk->sk_socket && sk->sk_socket->file)
2886 if (send_sigurg(&sk->sk_socket->file->f_owner))
2887 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2888 }
2889 EXPORT_SYMBOL(sk_send_sigurg);
2890
2891 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2892 unsigned long expires)
2893 {
2894 if (!mod_timer(timer, expires))
2895 sock_hold(sk);
2896 }
2897 EXPORT_SYMBOL(sk_reset_timer);
2898
2899 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2900 {
2901 if (del_timer(timer))
2902 __sock_put(sk);
2903 }
2904 EXPORT_SYMBOL(sk_stop_timer);
2905
2906 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2907 {
2908 if (del_timer_sync(timer))
2909 __sock_put(sk);
2910 }
2911 EXPORT_SYMBOL(sk_stop_timer_sync);
2912
2913 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
2914 {
2915 sk_init_common(sk);
2916 sk->sk_send_head = NULL;
2917
2918 timer_setup(&sk->sk_timer, NULL, 0);
2919
2920 sk->sk_allocation = GFP_KERNEL;
2921 sk->sk_rcvbuf = sysctl_rmem_default;
2922 sk->sk_sndbuf = sysctl_wmem_default;
2923 sk->sk_state = TCP_CLOSE;
2924 sk_set_socket(sk, sock);
2925
2926 sock_set_flag(sk, SOCK_ZAPPED);
2927
2928 if (sock) {
2929 sk->sk_type = sock->type;
2930 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2931 sock->sk = sk;
2932 } else {
2933 RCU_INIT_POINTER(sk->sk_wq, NULL);
2934 }
2935 sk->sk_uid = uid;
2936
2937 rwlock_init(&sk->sk_callback_lock);
2938 if (sk->sk_kern_sock)
2939 lockdep_set_class_and_name(
2940 &sk->sk_callback_lock,
2941 af_kern_callback_keys + sk->sk_family,
2942 af_family_kern_clock_key_strings[sk->sk_family]);
2943 else
2944 lockdep_set_class_and_name(
2945 &sk->sk_callback_lock,
2946 af_callback_keys + sk->sk_family,
2947 af_family_clock_key_strings[sk->sk_family]);
2948
2949 sk->sk_state_change = sock_def_wakeup;
2950 sk->sk_data_ready = sock_def_readable;
2951 sk->sk_write_space = sock_def_write_space;
2952 sk->sk_error_report = sock_def_error_report;
2953 sk->sk_destruct = sock_def_destruct;
2954
2955 sk->sk_frag.page = NULL;
2956 sk->sk_frag.offset = 0;
2957 sk->sk_peek_off = -1;
2958
2959 sk->sk_peer_pid = NULL;
2960 sk->sk_peer_cred = NULL;
2961 spin_lock_init(&sk->sk_peer_lock);
2962
2963 sk->sk_write_pending = 0;
2964 sk->sk_rcvlowat = 1;
2965 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2966 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2967
2968 sk->sk_stamp = SK_DEFAULT_STAMP;
2969 #if BITS_PER_LONG==32
2970 seqlock_init(&sk->sk_stamp_seq);
2971 #endif
2972 atomic_set(&sk->sk_zckey, 0);
2973
2974 #ifdef CONFIG_NET_RX_BUSY_POLL
2975 sk->sk_napi_id = 0;
2976 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
2977 #endif
2978
2979 sk->sk_max_pacing_rate = ~0UL;
2980 sk->sk_pacing_rate = ~0UL;
2981 WRITE_ONCE(sk->sk_pacing_shift, 10);
2982 sk->sk_incoming_cpu = -1;
2983
2984 sk_rx_queue_clear(sk);
2985 /*
2986 * Before updating sk_refcnt, we must commit prior changes to memory
2987 * (Documentation/RCU/rculist_nulls.txt for details)
2988 */
2989 smp_wmb();
2990 refcount_set(&sk->sk_refcnt, 1);
2991 atomic_set(&sk->sk_drops, 0);
2992 }
2993 EXPORT_SYMBOL(sock_init_data_uid);
2994
2995 void sock_init_data(struct socket *sock, struct sock *sk)
2996 {
2997 kuid_t uid = sock ?
2998 SOCK_INODE(sock)->i_uid :
2999 make_kuid(sock_net(sk)->user_ns, 0);
3000
3001 sock_init_data_uid(sock, sk, uid);
3002 }
3003 EXPORT_SYMBOL(sock_init_data);
3004
3005 void lock_sock_nested(struct sock *sk, int subclass)
3006 {
3007 might_sleep();
3008 spin_lock_bh(&sk->sk_lock.slock);
3009 if (sk->sk_lock.owned)
3010 __lock_sock(sk);
3011 sk->sk_lock.owned = 1;
3012 spin_unlock(&sk->sk_lock.slock);
3013 /*
3014 * The sk_lock has mutex_lock() semantics here:
3015 */
3016 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3017 local_bh_enable();
3018 }
3019 EXPORT_SYMBOL(lock_sock_nested);
3020
3021 void release_sock(struct sock *sk)
3022 {
3023 spin_lock_bh(&sk->sk_lock.slock);
3024 if (sk->sk_backlog.tail)
3025 __release_sock(sk);
3026
3027 /* Warning: release_cb() might need to release sk ownership,
3028 * i.e. call sock_release_ownership(sk) before us.
3029 */
3030 if (sk->sk_prot->release_cb)
3031 sk->sk_prot->release_cb(sk);
3032
3033 sock_release_ownership(sk);
3034 if (waitqueue_active(&sk->sk_lock.wq))
3035 wake_up(&sk->sk_lock.wq);
3036 spin_unlock_bh(&sk->sk_lock.slock);
3037 }
3038 EXPORT_SYMBOL(release_sock);
3039
3040 /**
3041 * lock_sock_fast - fast version of lock_sock
3042 * @sk: socket
3043 *
3044 * This version should be used for very small sections, where the process won't block.
3045 * Returns false if the fast path is taken:
3046 *
3047 * sk_lock.slock locked, owned = 0, BH disabled
3048 *
3049 * Returns true if the slow path is taken:
3050 *
3051 * sk_lock.slock unlocked, owned = 1, BH enabled
3052 */
3053 bool lock_sock_fast(struct sock *sk)
3054 {
3055 might_sleep();
3056 spin_lock_bh(&sk->sk_lock.slock);
3057
3058 if (!sk->sk_lock.owned)
3059 /*
3060 * Note : We must disable BH
3061 */
3062 return false;
3063
3064 __lock_sock(sk);
3065 sk->sk_lock.owned = 1;
3066 spin_unlock(&sk->sk_lock.slock);
3067 /*
3068 * The sk_lock has mutex_lock() semantics here:
3069 */
3070 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3071 local_bh_enable();
3072 return true;
3073 }
3074 EXPORT_SYMBOL(lock_sock_fast);
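/* Usage sketch (illustrative, not part of this file): callers pair this with
 * unlock_sock_fast(), handing back the value that records which path was
 * taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short section touching socket state ...
 *	unlock_sock_fast(sk, slow);
 */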
3075
3076 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3077 bool timeval, bool time32)
3078 {
3079 struct sock *sk = sock->sk;
3080 struct timespec64 ts;
3081
3082 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3083 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3084 if (ts.tv_sec == -1)
3085 return -ENOENT;
3086 if (ts.tv_sec == 0) {
3087 ktime_t kt = ktime_get_real();
3088 sock_write_timestamp(sk, kt);
3089 ts = ktime_to_timespec64(kt);
3090 }
3091
3092 if (timeval)
3093 ts.tv_nsec /= 1000;
3094
3095 #ifdef CONFIG_COMPAT_32BIT_TIME
3096 if (time32)
3097 return put_old_timespec32(&ts, userstamp);
3098 #endif
3099 #ifdef CONFIG_SPARC64
3100 /* beware of padding in sparc64 timeval */
3101 if (timeval && !in_compat_syscall()) {
3102 struct __kernel_old_timeval __user tv = {
3103 .tv_sec = ts.tv_sec,
3104 .tv_usec = ts.tv_nsec,
3105 };
3106 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3107 return -EFAULT;
3108 return 0;
3109 }
3110 #endif
3111 return put_timespec64(&ts, userstamp);
3112 }
3113 EXPORT_SYMBOL(sock_gettstamp);
3114
3115 void sock_enable_timestamp(struct sock *sk, int flag)
3116 {
3117 if (!sock_flag(sk, flag)) {
3118 unsigned long previous_flags = sk->sk_flags;
3119
3120 sock_set_flag(sk, flag);
3121 /*
3122 * we just set one of the two flags which require net
3123 * time stamping, but time stamping might have been on
3124 * already because of the other one
3125 */
3126 if (sock_needs_netstamp(sk) &&
3127 !(previous_flags & SK_FLAGS_TIMESTAMP))
3128 net_enable_timestamp();
3129 }
3130 }
3131
3132 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3133 int level, int type)
3134 {
3135 struct sock_exterr_skb *serr;
3136 struct sk_buff *skb;
3137 int copied, err;
3138
3139 err = -EAGAIN;
3140 skb = sock_dequeue_err_skb(sk);
3141 if (skb == NULL)
3142 goto out;
3143
3144 copied = skb->len;
3145 if (copied > len) {
3146 msg->msg_flags |= MSG_TRUNC;
3147 copied = len;
3148 }
3149 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3150 if (err)
3151 goto out_free_skb;
3152
3153 sock_recv_timestamp(msg, sk, skb);
3154
3155 serr = SKB_EXT_ERR(skb);
3156 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3157
3158 msg->msg_flags |= MSG_ERRQUEUE;
3159 err = copied;
3160
3161 out_free_skb:
3162 kfree_skb(skb);
3163 out:
3164 return err;
3165 }
3166 EXPORT_SYMBOL(sock_recv_errqueue);
3167
3168 /*
3169 * Get a socket option on a socket.
3170 *
3171 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3172 * asynchronous errors should be reported by getsockopt. We assume
3173 * this means if you specify SO_ERROR (otherwise what's the point of it).
3174 */
3175 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3176 char __user *optval, int __user *optlen)
3177 {
3178 struct sock *sk = sock->sk;
3179
3180 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3181 }
3182 EXPORT_SYMBOL(sock_common_getsockopt);
3183
3184 #ifdef CONFIG_COMPAT
3185 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3186 char __user *optval, int __user *optlen)
3187 {
3188 struct sock *sk = sock->sk;
3189
3190 if (sk->sk_prot->compat_getsockopt != NULL)
3191 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3192 optval, optlen);
3193 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3194 }
3195 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3196 #endif
3197
3198 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3199 int flags)
3200 {
3201 struct sock *sk = sock->sk;
3202 int addr_len = 0;
3203 int err;
3204
3205 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3206 flags & ~MSG_DONTWAIT, &addr_len);
3207 if (err >= 0)
3208 msg->msg_namelen = addr_len;
3209 return err;
3210 }
3211 EXPORT_SYMBOL(sock_common_recvmsg);
3212
3213 /*
3214 * Set socket options on an inet socket.
3215 */
3216 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3217 char __user *optval, unsigned int optlen)
3218 {
3219 struct sock *sk = sock->sk;
3220
3221 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3222 }
3223 EXPORT_SYMBOL(sock_common_setsockopt);
3224
3225 #ifdef CONFIG_COMPAT
3226 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3227 char __user *optval, unsigned int optlen)
3228 {
3229 struct sock *sk = sock->sk;
3230
3231 if (sk->sk_prot->compat_setsockopt != NULL)
3232 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3233 optval, optlen);
3234 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3235 }
3236 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3237 #endif
3238
3239 void sk_common_release(struct sock *sk)
3240 {
3241 if (sk->sk_prot->destroy)
3242 sk->sk_prot->destroy(sk);
3243
3244 /*
3245 * Observation: when sk_common_release() is called, processes have
3246 * no access to the socket, but the network still does.
3247 * Step one, detach it from networking:
3248 *
3249 * A. Remove from hash tables.
3250 */
3251
3252 sk->sk_prot->unhash(sk);
3253
3254 /*
3255 * At this point the socket cannot receive new packets, but it is possible
3256 * that some packets are in flight because some CPU is running the receiver
3257 * and did the hash table lookup before we unhashed the socket. They will
3258 * reach the receive queue and will be purged by the socket destructor.
3259 *
3260 * Also, we still have packets pending on the receive queue and probably
3261 * our own packets waiting in device queues. sock_destroy will drain the
3262 * receive queue, but transmitted packets will delay socket destruction
3263 * until the last reference is released.
3264 */
3265
3266 sock_orphan(sk);
3267
3268 xfrm_sk_free_policy(sk);
3269
3270 sk_refcnt_debug_release(sk);
3271
3272 sock_put(sk);
3273 }
3274 EXPORT_SYMBOL(sk_common_release);
3275
3276 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3277 {
3278 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3279
3280 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3281 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3282 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3283 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3284 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3285 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3286 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3287 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3288 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3289 }
3290
3291 #ifdef CONFIG_PROC_FS
3292 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3293 struct prot_inuse {
3294 int val[PROTO_INUSE_NR];
3295 };
3296
3297 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3298
3299 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3300 {
3301 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3302 }
3303 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3304
3305 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3306 {
3307 int cpu, idx = prot->inuse_idx;
3308 int res = 0;
3309
3310 for_each_possible_cpu(cpu)
3311 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3312
3313 return res >= 0 ? res : 0;
3314 }
3315 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3316
3317 static void sock_inuse_add(struct net *net, int val)
3318 {
3319 this_cpu_add(*net->core.sock_inuse, val);
3320 }
3321
3322 int sock_inuse_get(struct net *net)
3323 {
3324 int cpu, res = 0;
3325
3326 for_each_possible_cpu(cpu)
3327 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3328
3329 return res;
3330 }
3331
3332 EXPORT_SYMBOL_GPL(sock_inuse_get);
3333
3334 static int __net_init sock_inuse_init_net(struct net *net)
3335 {
3336 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3337 if (net->core.prot_inuse == NULL)
3338 return -ENOMEM;
3339
3340 net->core.sock_inuse = alloc_percpu(int);
3341 if (net->core.sock_inuse == NULL)
3342 goto out;
3343
3344 return 0;
3345
3346 out:
3347 free_percpu(net->core.prot_inuse);
3348 return -ENOMEM;
3349 }
3350
3351 static void __net_exit sock_inuse_exit_net(struct net *net)
3352 {
3353 free_percpu(net->core.prot_inuse);
3354 free_percpu(net->core.sock_inuse);
3355 }
3356
3357 static struct pernet_operations net_inuse_ops = {
3358 .init = sock_inuse_init_net,
3359 .exit = sock_inuse_exit_net,
3360 };
3361
3362 static __init int net_inuse_init(void)
3363 {
3364 if (register_pernet_subsys(&net_inuse_ops))
3365 panic("Cannot initialize net inuse counters");
3366
3367 return 0;
3368 }
3369
3370 core_initcall(net_inuse_init);
3371
3372 static int assign_proto_idx(struct proto *prot)
3373 {
3374 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3375
3376 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3377 pr_err("PROTO_INUSE_NR exhausted\n");
3378 return -ENOSPC;
3379 }
3380
3381 set_bit(prot->inuse_idx, proto_inuse_idx);
3382 return 0;
3383 }
3384
3385 static void release_proto_idx(struct proto *prot)
3386 {
3387 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3388 clear_bit(prot->inuse_idx, proto_inuse_idx);
3389 }
3390 #else
3391 static inline int assign_proto_idx(struct proto *prot)
3392 {
3393 return 0;
3394 }
3395
3396 static inline void release_proto_idx(struct proto *prot)
3397 {
3398 }
3399
3400 static void sock_inuse_add(struct net *net, int val)
3401 {
3402 }
3403 #endif
3404
3405 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3406 {
3407 if (!twsk_prot)
3408 return;
3409 kfree(twsk_prot->twsk_slab_name);
3410 twsk_prot->twsk_slab_name = NULL;
3411 kmem_cache_destroy(twsk_prot->twsk_slab);
3412 twsk_prot->twsk_slab = NULL;
3413 }
3414
3415 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3416 {
3417 if (!rsk_prot)
3418 return;
3419 kfree(rsk_prot->slab_name);
3420 rsk_prot->slab_name = NULL;
3421 kmem_cache_destroy(rsk_prot->slab);
3422 rsk_prot->slab = NULL;
3423 }
3424
3425 static int req_prot_init(const struct proto *prot)
3426 {
3427 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3428
3429 if (!rsk_prot)
3430 return 0;
3431
3432 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3433 prot->name);
3434 if (!rsk_prot->slab_name)
3435 return -ENOMEM;
3436
3437 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3438 rsk_prot->obj_size, 0,
3439 SLAB_ACCOUNT | prot->slab_flags,
3440 NULL);
3441
3442 if (!rsk_prot->slab) {
3443 pr_crit("%s: Can't create request sock SLAB cache!\n",
3444 prot->name);
3445 return -ENOMEM;
3446 }
3447 return 0;
3448 }
3449
3450 int proto_register(struct proto *prot, int alloc_slab)
3451 {
3452 int ret = -ENOBUFS;
3453
3454 if (alloc_slab) {
3455 prot->slab = kmem_cache_create_usercopy(prot->name,
3456 prot->obj_size, 0,
3457 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3458 prot->slab_flags,
3459 prot->useroffset, prot->usersize,
3460 NULL);
3461
3462 if (prot->slab == NULL) {
3463 pr_crit("%s: Can't create sock SLAB cache!\n",
3464 prot->name);
3465 goto out;
3466 }
3467
3468 if (req_prot_init(prot))
3469 goto out_free_request_sock_slab;
3470
3471 if (prot->twsk_prot != NULL) {
3472 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3473
3474 if (prot->twsk_prot->twsk_slab_name == NULL)
3475 goto out_free_request_sock_slab;
3476
3477 prot->twsk_prot->twsk_slab =
3478 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3479 prot->twsk_prot->twsk_obj_size,
3480 0,
3481 SLAB_ACCOUNT |
3482 prot->slab_flags,
3483 NULL);
3484 if (prot->twsk_prot->twsk_slab == NULL)
3485 goto out_free_timewait_sock_slab;
3486 }
3487 }
3488
3489 mutex_lock(&proto_list_mutex);
3490 ret = assign_proto_idx(prot);
3491 if (ret) {
3492 mutex_unlock(&proto_list_mutex);
3493 goto out_free_timewait_sock_slab;
3494 }
3495 list_add(&prot->node, &proto_list);
3496 mutex_unlock(&proto_list_mutex);
3497 return ret;
3498
3499 out_free_timewait_sock_slab:
3500 if (alloc_slab && prot->twsk_prot)
3501 tw_prot_cleanup(prot->twsk_prot);
3502 out_free_request_sock_slab:
3503 if (alloc_slab) {
3504 req_prot_cleanup(prot->rsk_prot);
3505
3506 kmem_cache_destroy(prot->slab);
3507 prot->slab = NULL;
3508 }
3509 out:
3510 return ret;
3511 }
3512 EXPORT_SYMBOL(proto_register);
3513
3514 void proto_unregister(struct proto *prot)
3515 {
3516 mutex_lock(&proto_list_mutex);
3517 release_proto_idx(prot);
3518 list_del(&prot->node);
3519 mutex_unlock(&proto_list_mutex);
3520
3521 kmem_cache_destroy(prot->slab);
3522 prot->slab = NULL;
3523
3524 req_prot_cleanup(prot->rsk_prot);
3525 tw_prot_cleanup(prot->twsk_prot);
3526 }
3527 EXPORT_SYMBOL(proto_unregister);
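/* Usage sketch (illustrative, the "example" names are hypothetical): a
 * protocol module registers its struct proto at init time, requesting a
 * dedicated slab cache, and unregisters it on exit:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	...
 *	proto_unregister(&example_proto);
 */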
3528
3529 int sock_load_diag_module(int family, int protocol)
3530 {
3531 if (!protocol) {
3532 if (!sock_is_registered(family))
3533 return -ENOENT;
3534
3535 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3536 NETLINK_SOCK_DIAG, family);
3537 }
3538
3539 #ifdef CONFIG_INET
3540 if (family == AF_INET &&
3541 protocol != IPPROTO_RAW &&
3542 !rcu_access_pointer(inet_protos[protocol]))
3543 return -ENOENT;
3544 #endif
3545
3546 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3547 NETLINK_SOCK_DIAG, family, protocol);
3548 }
3549 EXPORT_SYMBOL(sock_load_diag_module);
3550
3551 #ifdef CONFIG_PROC_FS
3552 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3553 __acquires(proto_list_mutex)
3554 {
3555 mutex_lock(&proto_list_mutex);
3556 return seq_list_start_head(&proto_list, *pos);
3557 }
3558
3559 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3560 {
3561 return seq_list_next(v, &proto_list, pos);
3562 }
3563
3564 static void proto_seq_stop(struct seq_file *seq, void *v)
3565 __releases(proto_list_mutex)
3566 {
3567 mutex_unlock(&proto_list_mutex);
3568 }
3569
3570 static char proto_method_implemented(const void *method)
3571 {
3572 return method == NULL ? 'n' : 'y';
3573 }
3574 static long sock_prot_memory_allocated(struct proto *proto)
3575 {
3576 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3577 }
3578
3579 static const char *sock_prot_memory_pressure(struct proto *proto)
3580 {
3581 return proto->memory_pressure != NULL ?
3582 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3583 }
3584
3585 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3586 {
3587
3588 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3589 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3590 proto->name,
3591 proto->obj_size,
3592 sock_prot_inuse_get(seq_file_net(seq), proto),
3593 sock_prot_memory_allocated(proto),
3594 sock_prot_memory_pressure(proto),
3595 proto->max_header,
3596 proto->slab == NULL ? "no" : "yes",
3597 module_name(proto->owner),
3598 proto_method_implemented(proto->close),
3599 proto_method_implemented(proto->connect),
3600 proto_method_implemented(proto->disconnect),
3601 proto_method_implemented(proto->accept),
3602 proto_method_implemented(proto->ioctl),
3603 proto_method_implemented(proto->init),
3604 proto_method_implemented(proto->destroy),
3605 proto_method_implemented(proto->shutdown),
3606 proto_method_implemented(proto->setsockopt),
3607 proto_method_implemented(proto->getsockopt),
3608 proto_method_implemented(proto->sendmsg),
3609 proto_method_implemented(proto->recvmsg),
3610 proto_method_implemented(proto->sendpage),
3611 proto_method_implemented(proto->bind),
3612 proto_method_implemented(proto->backlog_rcv),
3613 proto_method_implemented(proto->hash),
3614 proto_method_implemented(proto->unhash),
3615 proto_method_implemented(proto->get_port),
3616 proto_method_implemented(proto->enter_memory_pressure));
3617 }
3618
3619 static int proto_seq_show(struct seq_file *seq, void *v)
3620 {
3621 if (v == &proto_list)
3622 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3623 "protocol",
3624 "size",
3625 "sockets",
3626 "memory",
3627 "press",
3628 "maxhdr",
3629 "slab",
3630 "module",
3631 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3632 else
3633 proto_seq_printf(seq, list_entry(v, struct proto, node));
3634 return 0;
3635 }
3636
3637 static const struct seq_operations proto_seq_ops = {
3638 .start = proto_seq_start,
3639 .next = proto_seq_next,
3640 .stop = proto_seq_stop,
3641 .show = proto_seq_show,
3642 };
3643
3644 static __net_init int proto_init_net(struct net *net)
3645 {
3646 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3647 sizeof(struct seq_net_private)))
3648 return -ENOMEM;
3649
3650 return 0;
3651 }
3652
3653 static __net_exit void proto_exit_net(struct net *net)
3654 {
3655 remove_proc_entry("protocols", net->proc_net);
3656 }
3657
3658
3659 static __net_initdata struct pernet_operations proto_net_ops = {
3660 .init = proto_init_net,
3661 .exit = proto_exit_net,
3662 };
3663
3664 static int __init proto_init(void)
3665 {
3666 return register_pernet_subsys(&proto_net_ops);
3667 }
3668
3669 subsys_initcall(proto_init);
3670
3671 #endif /* PROC_FS */
3672
3673 #ifdef CONFIG_NET_RX_BUSY_POLL
3674 bool sk_busy_loop_end(void *p, unsigned long start_time)
3675 {
3676 struct sock *sk = p;
3677
3678 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3679 sk_busy_loop_timeout(sk, start_time);
3680 }
3681 EXPORT_SYMBOL(sk_busy_loop_end);
3682 #endif /* CONFIG_NET_RX_BUSY_POLL */
3683