1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117
118 #include <linux/uaccess.h>
119
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136
137 #include <trace/events/sock.h>
138 #include <trace/hooks/sched.h>
139 #include <trace/hooks/net.h>
140
141 #include <net/tcp.h>
142 #include <net/busy_poll.h>
143
144 #include <linux/ethtool.h>
145
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148
149 static void sock_inuse_add(struct net *net, int val);
150
151 /**
152 * sk_ns_capable - General socket capability test
153 * @sk: Socket to use a capability on or through
154 * @user_ns: The user namespace of the capability to use
155 * @cap: The capability to use
156 *
157 * Test to see if the opener of the socket had the capability @cap when
158 * the socket was created and if the current process has that capability
159 * in the user namespace @user_ns.
160 */
161 bool sk_ns_capable(const struct sock *sk,
162 struct user_namespace *user_ns, int cap)
163 {
164 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 ns_capable(user_ns, cap);
166 }
167 EXPORT_SYMBOL(sk_ns_capable);
168
169 /**
170 * sk_capable - Socket global capability test
171 * @sk: Socket to use a capability on or through
172 * @cap: The global capability to use
173 *
174 * Test to see if the opener of the socket had the capability @cap when
175 * the socket was created and if the current process has that capability
176 * in all user namespaces.
177 */
178 bool sk_capable(const struct sock *sk, int cap)
179 {
180 return sk_ns_capable(sk, &init_user_ns, cap);
181 }
182 EXPORT_SYMBOL(sk_capable);
183
184 /**
185 * sk_net_capable - Network namespace socket capability test
186 * @sk: Socket to use a capability on or through
187 * @cap: The capability to use
188 *
189 * Test to see if the opener of the socket had the capability @cap when the
190 * socket was created and if the current process has that capability over
191 * the network namespace the socket is a member of.
192 */
193 bool sk_net_capable(const struct sock *sk, int cap)
194 {
195 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 }
197 EXPORT_SYMBOL(sk_net_capable);
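/*
 * Usage sketch (illustrative addition, not part of the original file): a
 * protocol handler gating a privileged operation on the socket opener's
 * capabilities. myproto_privileged_op() and myproto_do_op() are hypothetical,
 * as is the -EPERM policy.
 *
 *	static int myproto_privileged_op(struct sock *sk)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return myproto_do_op(sk);
 *	}
 */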
198
199 /*
200 * Each address family might have different locking rules, so we have
201 * one slock key per address family and separate keys for internal and
202 * userspace sockets.
203 */
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
208
209 /*
210 * Make lock validator output more readable. (we pre-construct these
211 * strings build-time, so that runtime initialization of socket
212 * locks is fast):
213 */
214
215 #define _sock_locks(x) \
216 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
217 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
218 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
219 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
220 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
221 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
222 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
223 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
224 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
225 x "27" , x "28" , x "AF_CAN" , \
226 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
227 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
228 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
229 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
230 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
231 x "AF_MCTP" , \
232 x "AF_MAX"
233
234 static const char *const af_family_key_strings[AF_MAX+1] = {
235 _sock_locks("sk_lock-")
236 };
237 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
238 _sock_locks("slock-")
239 };
240 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
241 _sock_locks("clock-")
242 };
243
244 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
245 _sock_locks("k-sk_lock-")
246 };
247 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
248 _sock_locks("k-slock-")
249 };
250 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
251 _sock_locks("k-clock-")
252 };
253 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
254 _sock_locks("rlock-")
255 };
256 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
257 _sock_locks("wlock-")
258 };
259 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
260 _sock_locks("elock-")
261 };
262
263 /*
264 * sk_callback_lock and sk queues locking rules are per-address-family,
265 * so split the lock classes by using a per-AF key:
266 */
267 static struct lock_class_key af_callback_keys[AF_MAX];
268 static struct lock_class_key af_rlock_keys[AF_MAX];
269 static struct lock_class_key af_wlock_keys[AF_MAX];
270 static struct lock_class_key af_elock_keys[AF_MAX];
271 static struct lock_class_key af_kern_callback_keys[AF_MAX];
272
273 /* Run time adjustable parameters. */
274 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
275 EXPORT_SYMBOL(sysctl_wmem_max);
276 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
277 EXPORT_SYMBOL(sysctl_rmem_max);
278 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
279 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
280
281 /* Maximal space eaten by iovec or ancillary data plus some space */
282 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
283 EXPORT_SYMBOL(sysctl_optmem_max);
284
285 int sysctl_tstamp_allow_data __read_mostly = 1;
286
287 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
288 EXPORT_SYMBOL_GPL(memalloc_socks_key);
289
290 /**
291 * sk_set_memalloc - sets %SOCK_MEMALLOC
292 * @sk: socket to set it on
293 *
294 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
295 * It's the responsibility of the admin to adjust min_free_kbytes
296 * to meet the requirements
297 */
298 void sk_set_memalloc(struct sock *sk)
299 {
300 sock_set_flag(sk, SOCK_MEMALLOC);
301 sk->sk_allocation |= __GFP_MEMALLOC;
302 static_branch_inc(&memalloc_socks_key);
303 }
304 EXPORT_SYMBOL_GPL(sk_set_memalloc);
305
306 void sk_clear_memalloc(struct sock *sk)
307 {
308 sock_reset_flag(sk, SOCK_MEMALLOC);
309 sk->sk_allocation &= ~__GFP_MEMALLOC;
310 static_branch_dec(&memalloc_socks_key);
311
312 /*
313 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
314 * progress of swapping. SOCK_MEMALLOC may be cleared while
315 * it has rmem allocations due to the last swapfile being deactivated
316 * but there is a risk that the socket is unusable due to exceeding
317 * the rmem limits. Reclaim the reserves and obey rmem limits again.
318 */
319 sk_mem_reclaim(sk);
320 }
321 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
322
323 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
324 {
325 int ret;
326 unsigned int noreclaim_flag;
327
328 /* these should have been dropped before queueing */
329 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
330
331 noreclaim_flag = memalloc_noreclaim_save();
332 ret = sk->sk_backlog_rcv(sk, skb);
333 memalloc_noreclaim_restore(noreclaim_flag);
334
335 return ret;
336 }
337 EXPORT_SYMBOL(__sk_backlog_rcv);
338
339 void sk_error_report(struct sock *sk)
340 {
341 sk->sk_error_report(sk);
342
343 switch (sk->sk_family) {
344 case AF_INET:
345 fallthrough;
346 case AF_INET6:
347 trace_inet_sk_error_report(sk);
348 break;
349 default:
350 break;
351 }
352 }
353 EXPORT_SYMBOL(sk_error_report);
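/*
 * Usage sketch (illustrative addition): a protocol reporting an asynchronous
 * error to the socket owner. Setting sk_err before calling sk_error_report()
 * is the usual pattern; ECONNRESET is only an example value.
 *
 *	sk->sk_err = ECONNRESET;
 *	sk_error_report(sk);
 */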
354
355 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
356 {
357 struct __kernel_sock_timeval tv;
358
359 if (timeo == MAX_SCHEDULE_TIMEOUT) {
360 tv.tv_sec = 0;
361 tv.tv_usec = 0;
362 } else {
363 tv.tv_sec = timeo / HZ;
364 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
365 }
366
367 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
368 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
369 *(struct old_timeval32 *)optval = tv32;
370 return sizeof(tv32);
371 }
372
373 if (old_timeval) {
374 struct __kernel_old_timeval old_tv;
375 old_tv.tv_sec = tv.tv_sec;
376 old_tv.tv_usec = tv.tv_usec;
377 *(struct __kernel_old_timeval *)optval = old_tv;
378 return sizeof(old_tv);
379 }
380
381 *(struct __kernel_sock_timeval *)optval = tv;
382 return sizeof(tv);
383 }
384
385 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
386 bool old_timeval)
387 {
388 struct __kernel_sock_timeval tv;
389
390 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
391 struct old_timeval32 tv32;
392
393 if (optlen < sizeof(tv32))
394 return -EINVAL;
395
396 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
397 return -EFAULT;
398 tv.tv_sec = tv32.tv_sec;
399 tv.tv_usec = tv32.tv_usec;
400 } else if (old_timeval) {
401 struct __kernel_old_timeval old_tv;
402
403 if (optlen < sizeof(old_tv))
404 return -EINVAL;
405 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
406 return -EFAULT;
407 tv.tv_sec = old_tv.tv_sec;
408 tv.tv_usec = old_tv.tv_usec;
409 } else {
410 if (optlen < sizeof(tv))
411 return -EINVAL;
412 if (copy_from_sockptr(&tv, optval, sizeof(tv)))
413 return -EFAULT;
414 }
415 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
416 return -EDOM;
417
418 if (tv.tv_sec < 0) {
419 static int warned __read_mostly;
420
421 *timeo_p = 0;
422 if (warned < 10 && net_ratelimit()) {
423 warned++;
424 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
425 __func__, current->comm, task_pid_nr(current));
426 }
427 return 0;
428 }
429 *timeo_p = MAX_SCHEDULE_TIMEOUT;
430 if (tv.tv_sec == 0 && tv.tv_usec == 0)
431 return 0;
432 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
433 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
434 return 0;
435 }
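/*
 * Userspace view (hedged example, not part of this file): SO_RCVTIMEO is one
 * of the options that lands in sock_set_timeout() above. A zero timeval means
 * "wait forever" (MAX_SCHEDULE_TIMEOUT), and a tv_usec outside [0, 1s) is
 * rejected with -EDOM.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */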
436
437 static bool sock_needs_netstamp(const struct sock *sk)
438 {
439 switch (sk->sk_family) {
440 case AF_UNSPEC:
441 case AF_UNIX:
442 return false;
443 default:
444 return true;
445 }
446 }
447
448 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
449 {
450 if (sk->sk_flags & flags) {
451 sk->sk_flags &= ~flags;
452 if (sock_needs_netstamp(sk) &&
453 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
454 net_disable_timestamp();
455 }
456 }
457
458
459 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
460 {
461 unsigned long flags;
462 struct sk_buff_head *list = &sk->sk_receive_queue;
463
464 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
465 atomic_inc(&sk->sk_drops);
466 trace_sock_rcvqueue_full(sk, skb);
467 return -ENOMEM;
468 }
469
470 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
471 atomic_inc(&sk->sk_drops);
472 return -ENOBUFS;
473 }
474
475 skb->dev = NULL;
476 skb_set_owner_r(skb, sk);
477
478 /* we escape from the RCU-protected region, make sure we don't leak
479 * a non-refcounted dst
480 */
481 skb_dst_force(skb);
482
483 spin_lock_irqsave(&list->lock, flags);
484 sock_skb_set_dropcount(sk, skb);
485 __skb_queue_tail(list, skb);
486 spin_unlock_irqrestore(&list->lock, flags);
487
488 if (!sock_flag(sk, SOCK_DEAD))
489 sk->sk_data_ready(sk);
490 return 0;
491 }
492 EXPORT_SYMBOL(__sock_queue_rcv_skb);
493
494 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
495 {
496 int err;
497
498 err = sk_filter(sk, skb);
499 if (err)
500 return err;
501
502 return __sock_queue_rcv_skb(sk, skb);
503 }
504 EXPORT_SYMBOL(sock_queue_rcv_skb);
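/*
 * Usage sketch (illustrative; myproto_rcv is a hypothetical handler and the
 * drop policy is an assumption): queueing an skb to the owning socket and
 * freeing it ourselves when the filter or receive queue rejects it.
 *
 *	static int myproto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */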
505
506 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
507 const int nested, unsigned int trim_cap, bool refcounted)
508 {
509 int rc = NET_RX_SUCCESS;
510
511 if (sk_filter_trim_cap(sk, skb, trim_cap))
512 goto discard_and_relse;
513
514 skb->dev = NULL;
515
516 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
517 atomic_inc(&sk->sk_drops);
518 goto discard_and_relse;
519 }
520 if (nested)
521 bh_lock_sock_nested(sk);
522 else
523 bh_lock_sock(sk);
524 if (!sock_owned_by_user(sk)) {
525 /*
526 * trylock + unlock semantics:
527 */
528 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
529
530 rc = sk_backlog_rcv(sk, skb);
531
532 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
533 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
534 bh_unlock_sock(sk);
535 atomic_inc(&sk->sk_drops);
536 goto discard_and_relse;
537 }
538
539 bh_unlock_sock(sk);
540 out:
541 if (refcounted)
542 sock_put(sk);
543 return rc;
544 discard_and_relse:
545 kfree_skb(skb);
546 goto out;
547 }
548 EXPORT_SYMBOL(__sk_receive_skb);
549
550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
551 u32));
552 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
553 u32));
554 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
555 {
556 struct dst_entry *dst = __sk_dst_get(sk);
557
558 if (dst && dst->obsolete &&
559 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
560 dst, cookie) == NULL) {
561 sk_tx_queue_clear(sk);
562 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
563 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
564 dst_release(dst);
565 return NULL;
566 }
567
568 return dst;
569 }
570 EXPORT_SYMBOL(__sk_dst_check);
571
572 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
573 {
574 struct dst_entry *dst = sk_dst_get(sk);
575
576 if (dst && dst->obsolete &&
577 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
578 dst, cookie) == NULL) {
579 sk_dst_reset(sk);
580 dst_release(dst);
581 return NULL;
582 }
583
584 return dst;
585 }
586 EXPORT_SYMBOL(sk_dst_check);
587
588 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
589 {
590 int ret = -ENOPROTOOPT;
591 #ifdef CONFIG_NETDEVICES
592 struct net *net = sock_net(sk);
593
594 /* Sorry... */
595 ret = -EPERM;
596 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
597 goto out;
598
599 ret = -EINVAL;
600 if (ifindex < 0)
601 goto out;
602
603 sk->sk_bound_dev_if = ifindex;
604 if (sk->sk_prot->rehash)
605 sk->sk_prot->rehash(sk);
606 sk_dst_reset(sk);
607
608 ret = 0;
609
610 out:
611 #endif
612
613 return ret;
614 }
615
616 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
617 {
618 int ret;
619
620 if (lock_sk)
621 lock_sock(sk);
622 ret = sock_bindtoindex_locked(sk, ifindex);
623 if (lock_sk)
624 release_sock(sk);
625
626 return ret;
627 }
628 EXPORT_SYMBOL(sock_bindtoindex);
629
630 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
631 {
632 int ret = -ENOPROTOOPT;
633 #ifdef CONFIG_NETDEVICES
634 struct net *net = sock_net(sk);
635 char devname[IFNAMSIZ];
636 int index;
637
638 ret = -EINVAL;
639 if (optlen < 0)
640 goto out;
641
642 /* Bind this socket to a particular device like "eth0",
643 * as specified in the passed interface name. If the
644 * name is "" or the option length is zero the socket
645 * is not bound.
646 */
647 if (optlen > IFNAMSIZ - 1)
648 optlen = IFNAMSIZ - 1;
649 memset(devname, 0, sizeof(devname));
650
651 ret = -EFAULT;
652 if (copy_from_sockptr(devname, optval, optlen))
653 goto out;
654
655 index = 0;
656 if (devname[0] != '\0') {
657 struct net_device *dev;
658
659 rcu_read_lock();
660 dev = dev_get_by_name_rcu(net, devname);
661 if (dev)
662 index = dev->ifindex;
663 rcu_read_unlock();
664 ret = -ENODEV;
665 if (!dev)
666 goto out;
667 }
668
669 return sock_bindtoindex(sk, index, true);
670 out:
671 #endif
672
673 return ret;
674 }
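/*
 * Userspace view (hedged example): SO_BINDTODEVICE takes an interface name;
 * an empty string (or zero option length) removes the binding, matching the
 * devname handling above. "eth0" is just an example name.
 *
 *	const char *ifname = "eth0";
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname) + 1);
 */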
675
676 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
677 int __user *optlen, int len)
678 {
679 int ret = -ENOPROTOOPT;
680 #ifdef CONFIG_NETDEVICES
681 struct net *net = sock_net(sk);
682 char devname[IFNAMSIZ];
683
684 if (sk->sk_bound_dev_if == 0) {
685 len = 0;
686 goto zero;
687 }
688
689 ret = -EINVAL;
690 if (len < IFNAMSIZ)
691 goto out;
692
693 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
694 if (ret)
695 goto out;
696
697 len = strlen(devname) + 1;
698
699 ret = -EFAULT;
700 if (copy_to_user(optval, devname, len))
701 goto out;
702
703 zero:
704 ret = -EFAULT;
705 if (put_user(len, optlen))
706 goto out;
707
708 ret = 0;
709
710 out:
711 #endif
712
713 return ret;
714 }
715
716 bool sk_mc_loop(struct sock *sk)
717 {
718 if (dev_recursion_level())
719 return false;
720 if (!sk)
721 return true;
722 /* IPV6_ADDRFORM can change sk->sk_family under us. */
723 switch (READ_ONCE(sk->sk_family)) {
724 case AF_INET:
725 return inet_sk(sk)->mc_loop;
726 #if IS_ENABLED(CONFIG_IPV6)
727 case AF_INET6:
728 return inet6_sk(sk)->mc_loop;
729 #endif
730 }
731 WARN_ON_ONCE(1);
732 return true;
733 }
734 EXPORT_SYMBOL(sk_mc_loop);
735
736 void sock_set_reuseaddr(struct sock *sk)
737 {
738 lock_sock(sk);
739 sk->sk_reuse = SK_CAN_REUSE;
740 release_sock(sk);
741 }
742 EXPORT_SYMBOL(sock_set_reuseaddr);
743
744 void sock_set_reuseport(struct sock *sk)
745 {
746 lock_sock(sk);
747 sk->sk_reuseport = true;
748 release_sock(sk);
749 }
750 EXPORT_SYMBOL(sock_set_reuseport);
751
752 void sock_no_linger(struct sock *sk)
753 {
754 lock_sock(sk);
755 sk->sk_lingertime = 0;
756 sock_set_flag(sk, SOCK_LINGER);
757 release_sock(sk);
758 }
759 EXPORT_SYMBOL(sock_no_linger);
760
761 void sock_set_priority(struct sock *sk, u32 priority)
762 {
763 lock_sock(sk);
764 sk->sk_priority = priority;
765 release_sock(sk);
766 }
767 EXPORT_SYMBOL(sock_set_priority);
768
769 void sock_set_sndtimeo(struct sock *sk, s64 secs)
770 {
771 lock_sock(sk);
772 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
773 sk->sk_sndtimeo = secs * HZ;
774 else
775 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
776 release_sock(sk);
777 }
778 EXPORT_SYMBOL(sock_set_sndtimeo);
779
780 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
781 {
782 if (val) {
783 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
784 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
785 sock_set_flag(sk, SOCK_RCVTSTAMP);
786 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
787 } else {
788 sock_reset_flag(sk, SOCK_RCVTSTAMP);
789 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
790 }
791 }
792
793 void sock_enable_timestamps(struct sock *sk)
794 {
795 lock_sock(sk);
796 __sock_set_timestamps(sk, true, false, true);
797 release_sock(sk);
798 }
799 EXPORT_SYMBOL(sock_enable_timestamps);
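/*
 * Usage sketch (illustrative): the exported sock_set_*() helpers above let
 * in-kernel socket users apply common options without open-coding
 * setsockopt(). The creation call and the option choice here are assumptions.
 *
 *	struct socket *sock;
 *
 *	if (!sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) {
 *		sock_set_reuseaddr(sock->sk);
 *		sock_no_linger(sock->sk);
 *		sock_enable_timestamps(sock->sk);
 *	}
 */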
800
801 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
802 {
803 switch (optname) {
804 case SO_TIMESTAMP_OLD:
805 __sock_set_timestamps(sk, valbool, false, false);
806 break;
807 case SO_TIMESTAMP_NEW:
808 __sock_set_timestamps(sk, valbool, true, false);
809 break;
810 case SO_TIMESTAMPNS_OLD:
811 __sock_set_timestamps(sk, valbool, false, true);
812 break;
813 case SO_TIMESTAMPNS_NEW:
814 __sock_set_timestamps(sk, valbool, true, true);
815 break;
816 }
817 }
818
819 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
820 {
821 struct net *net = sock_net(sk);
822 struct net_device *dev = NULL;
823 bool match = false;
824 int *vclock_index;
825 int i, num;
826
827 if (sk->sk_bound_dev_if)
828 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
829
830 if (!dev) {
831 pr_err("%s: sock not bind to device\n", __func__);
832 return -EOPNOTSUPP;
833 }
834
835 num = ethtool_get_phc_vclocks(dev, &vclock_index);
836 dev_put(dev);
837
838 for (i = 0; i < num; i++) {
839 if (*(vclock_index + i) == phc_index) {
840 match = true;
841 break;
842 }
843 }
844
845 if (num > 0)
846 kfree(vclock_index);
847
848 if (!match)
849 return -EINVAL;
850
851 sk->sk_bind_phc = phc_index;
852
853 return 0;
854 }
855
856 int sock_set_timestamping(struct sock *sk, int optname,
857 struct so_timestamping timestamping)
858 {
859 int val = timestamping.flags;
860 int ret;
861
862 if (val & ~SOF_TIMESTAMPING_MASK)
863 return -EINVAL;
864
865 if (val & SOF_TIMESTAMPING_OPT_ID &&
866 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
867 if (sk->sk_protocol == IPPROTO_TCP &&
868 sk->sk_type == SOCK_STREAM) {
869 if ((1 << sk->sk_state) &
870 (TCPF_CLOSE | TCPF_LISTEN))
871 return -EINVAL;
872 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
873 } else {
874 atomic_set(&sk->sk_tskey, 0);
875 }
876 }
877
878 if (val & SOF_TIMESTAMPING_OPT_STATS &&
879 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
880 return -EINVAL;
881
882 if (val & SOF_TIMESTAMPING_BIND_PHC) {
883 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
884 if (ret)
885 return ret;
886 }
887
888 sk->sk_tsflags = val;
889 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
890
891 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
892 sock_enable_timestamp(sk,
893 SOCK_TIMESTAMPING_RX_SOFTWARE);
894 else
895 sock_disable_timestamp(sk,
896 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
897 return 0;
898 }
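/*
 * Userspace view (hedged example): SO_TIMESTAMPING accepts either a plain
 * flags word or a struct so_timestamping (when binding a PHC vclock), which
 * is why sock_setsockopt() below checks optlen before falling back to the
 * int value. The flag combination here is only an example.
 *
 *	int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *		    SOF_TIMESTAMPING_SOFTWARE |
 *		    SOF_TIMESTAMPING_OPT_ID;
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
 */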
899
900 void sock_set_keepalive(struct sock *sk)
901 {
902 lock_sock(sk);
903 if (sk->sk_prot->keepalive)
904 sk->sk_prot->keepalive(sk, true);
905 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
906 release_sock(sk);
907 }
908 EXPORT_SYMBOL(sock_set_keepalive);
909
910 static void __sock_set_rcvbuf(struct sock *sk, int val)
911 {
912 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
913 * as a negative value.
914 */
915 val = min_t(int, val, INT_MAX / 2);
916 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
917
918 /* We double it on the way in to account for "struct sk_buff" etc.
919 * overhead. Applications assume that the SO_RCVBUF setting they make
920 * will allow that much actual data to be received on that socket.
921 *
922 * Applications are unaware that "struct sk_buff" and other overheads
923 * allocate from the receive buffer during socket buffer allocation.
924 *
925 * And after considering the possible alternatives, returning the value
926 * we actually used in getsockopt is the most desirable behavior.
927 */
928 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
929 }
930
931 void sock_set_rcvbuf(struct sock *sk, int val)
932 {
933 lock_sock(sk);
934 __sock_set_rcvbuf(sk, val);
935 release_sock(sk);
936 }
937 EXPORT_SYMBOL(sock_set_rcvbuf);
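/*
 * Userspace view (hedged example): the value passed to SO_RCVBUF is doubled
 * by __sock_set_rcvbuf() to cover struct sk_buff overhead, so getsockopt()
 * typically reports twice the requested size (here 131072 for 65536), subject
 * to the sysctl_rmem_max clamp in sock_setsockopt() and the SOCK_MIN_RCVBUF
 * floor.
 *
 *	int val = 65536, out = 0;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 */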
938
939 static void __sock_set_mark(struct sock *sk, u32 val)
940 {
941 if (val != sk->sk_mark) {
942 sk->sk_mark = val;
943 sk_dst_reset(sk);
944 }
945 }
946
947 void sock_set_mark(struct sock *sk, u32 val)
948 {
949 lock_sock(sk);
950 __sock_set_mark(sk, val);
951 release_sock(sk);
952 }
953 EXPORT_SYMBOL(sock_set_mark);
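/*
 * Userspace view (hedged example): SO_MARK requires CAP_NET_ADMIN in the
 * socket's network namespace (enforced in sock_setsockopt() below); the mark
 * value used here is arbitrary.
 *
 *	unsigned int mark = 0x2a;
 *	setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
 */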
954
955 /*
956 * This is meant for all protocols to use and covers goings on
957 * at the socket level. Everything here is generic.
958 */
959
960 int sock_setsockopt(struct socket *sock, int level, int optname,
961 sockptr_t optval, unsigned int optlen)
962 {
963 struct so_timestamping timestamping;
964 struct sock_txtime sk_txtime;
965 struct sock *sk = sock->sk;
966 int val;
967 int valbool;
968 struct linger ling;
969 int ret = 0;
970
971 /*
972 * Options without arguments
973 */
974
975 if (optname == SO_BINDTODEVICE)
976 return sock_setbindtodevice(sk, optval, optlen);
977
978 if (optlen < sizeof(int))
979 return -EINVAL;
980
981 if (copy_from_sockptr(&val, optval, sizeof(val)))
982 return -EFAULT;
983
984 valbool = val ? 1 : 0;
985
986 lock_sock(sk);
987
988 switch (optname) {
989 case SO_DEBUG:
990 if (val && !capable(CAP_NET_ADMIN))
991 ret = -EACCES;
992 else
993 sock_valbool_flag(sk, SOCK_DBG, valbool);
994 break;
995 case SO_REUSEADDR:
996 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
997 break;
998 case SO_REUSEPORT:
999 sk->sk_reuseport = valbool;
1000 break;
1001 case SO_TYPE:
1002 case SO_PROTOCOL:
1003 case SO_DOMAIN:
1004 case SO_ERROR:
1005 ret = -ENOPROTOOPT;
1006 break;
1007 case SO_DONTROUTE:
1008 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1009 sk_dst_reset(sk);
1010 break;
1011 case SO_BROADCAST:
1012 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1013 break;
1014 case SO_SNDBUF:
1015 /* Don't error on this; BSD doesn't, and if you think
1016 * about it, this is right. Otherwise apps have to
1017 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1018 * are treated in BSD as hints.
1019 */
1020 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1021 set_sndbuf:
1022 /* Ensure val * 2 fits into an int, to prevent max_t()
1023 * from treating it as a negative value.
1024 */
1025 val = min_t(int, val, INT_MAX / 2);
1026 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1027 WRITE_ONCE(sk->sk_sndbuf,
1028 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1029 /* Wake up sending tasks if we upped the value. */
1030 sk->sk_write_space(sk);
1031 break;
1032
1033 case SO_SNDBUFFORCE:
1034 if (!capable(CAP_NET_ADMIN)) {
1035 ret = -EPERM;
1036 break;
1037 }
1038
1039 /* No negative values (to prevent underflow, as val will be
1040 * multiplied by 2).
1041 */
1042 if (val < 0)
1043 val = 0;
1044 goto set_sndbuf;
1045
1046 case SO_RCVBUF:
1047 /* Don't error on this; BSD doesn't, and if you think
1048 * about it, this is right. Otherwise apps have to
1049 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1050 * are treated in BSD as hints.
1051 */
1052 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1053 break;
1054
1055 case SO_RCVBUFFORCE:
1056 if (!capable(CAP_NET_ADMIN)) {
1057 ret = -EPERM;
1058 break;
1059 }
1060
1061 /* No negative values (to prevent underflow, as val will be
1062 * multiplied by 2).
1063 */
1064 __sock_set_rcvbuf(sk, max(val, 0));
1065 break;
1066
1067 case SO_KEEPALIVE:
1068 if (sk->sk_prot->keepalive)
1069 sk->sk_prot->keepalive(sk, valbool);
1070 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1071 break;
1072
1073 case SO_OOBINLINE:
1074 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1075 break;
1076
1077 case SO_NO_CHECK:
1078 sk->sk_no_check_tx = valbool;
1079 break;
1080
1081 case SO_PRIORITY:
1082 if ((val >= 0 && val <= 6) ||
1083 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1084 sk->sk_priority = val;
1085 else
1086 ret = -EPERM;
1087 break;
1088
1089 case SO_LINGER:
1090 if (optlen < sizeof(ling)) {
1091 ret = -EINVAL; /* 1003.1g */
1092 break;
1093 }
1094 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1095 ret = -EFAULT;
1096 break;
1097 }
1098 if (!ling.l_onoff)
1099 sock_reset_flag(sk, SOCK_LINGER);
1100 else {
1101 #if (BITS_PER_LONG == 32)
1102 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1103 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1104 else
1105 #endif
1106 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1107 sock_set_flag(sk, SOCK_LINGER);
1108 }
1109 break;
1110
1111 case SO_BSDCOMPAT:
1112 break;
1113
1114 case SO_PASSCRED:
1115 if (valbool)
1116 set_bit(SOCK_PASSCRED, &sock->flags);
1117 else
1118 clear_bit(SOCK_PASSCRED, &sock->flags);
1119 break;
1120
1121 case SO_TIMESTAMP_OLD:
1122 case SO_TIMESTAMP_NEW:
1123 case SO_TIMESTAMPNS_OLD:
1124 case SO_TIMESTAMPNS_NEW:
1125 sock_set_timestamp(sk, optname, valbool);
1126 break;
1127
1128 case SO_TIMESTAMPING_NEW:
1129 case SO_TIMESTAMPING_OLD:
1130 if (optlen == sizeof(timestamping)) {
1131 if (copy_from_sockptr(&timestamping, optval,
1132 sizeof(timestamping))) {
1133 ret = -EFAULT;
1134 break;
1135 }
1136 } else {
1137 memset(&timestamping, 0, sizeof(timestamping));
1138 timestamping.flags = val;
1139 }
1140 ret = sock_set_timestamping(sk, optname, timestamping);
1141 break;
1142
1143 case SO_RCVLOWAT:
1144 if (val < 0)
1145 val = INT_MAX;
1146 if (sock->ops->set_rcvlowat)
1147 ret = sock->ops->set_rcvlowat(sk, val);
1148 else
1149 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1150 break;
1151
1152 case SO_RCVTIMEO_OLD:
1153 case SO_RCVTIMEO_NEW:
1154 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1155 optlen, optname == SO_RCVTIMEO_OLD);
1156 break;
1157
1158 case SO_SNDTIMEO_OLD:
1159 case SO_SNDTIMEO_NEW:
1160 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1161 optlen, optname == SO_SNDTIMEO_OLD);
1162 break;
1163
1164 case SO_ATTACH_FILTER: {
1165 struct sock_fprog fprog;
1166
1167 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1168 if (!ret)
1169 ret = sk_attach_filter(&fprog, sk);
1170 break;
1171 }
1172 case SO_ATTACH_BPF:
1173 ret = -EINVAL;
1174 if (optlen == sizeof(u32)) {
1175 u32 ufd;
1176
1177 ret = -EFAULT;
1178 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1179 break;
1180
1181 ret = sk_attach_bpf(ufd, sk);
1182 }
1183 break;
1184
1185 case SO_ATTACH_REUSEPORT_CBPF: {
1186 struct sock_fprog fprog;
1187
1188 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1189 if (!ret)
1190 ret = sk_reuseport_attach_filter(&fprog, sk);
1191 break;
1192 }
1193 case SO_ATTACH_REUSEPORT_EBPF:
1194 ret = -EINVAL;
1195 if (optlen == sizeof(u32)) {
1196 u32 ufd;
1197
1198 ret = -EFAULT;
1199 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1200 break;
1201
1202 ret = sk_reuseport_attach_bpf(ufd, sk);
1203 }
1204 break;
1205
1206 case SO_DETACH_REUSEPORT_BPF:
1207 ret = reuseport_detach_prog(sk);
1208 break;
1209
1210 case SO_DETACH_FILTER:
1211 ret = sk_detach_filter(sk);
1212 break;
1213
1214 case SO_LOCK_FILTER:
1215 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1216 ret = -EPERM;
1217 else
1218 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1219 break;
1220
1221 case SO_PASSSEC:
1222 if (valbool)
1223 set_bit(SOCK_PASSSEC, &sock->flags);
1224 else
1225 clear_bit(SOCK_PASSSEC, &sock->flags);
1226 break;
1227 case SO_MARK:
1228 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1229 ret = -EPERM;
1230 break;
1231 }
1232
1233 __sock_set_mark(sk, val);
1234 break;
1235 case SO_RCVMARK:
1236 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1237 break;
1238
1239 case SO_RXQ_OVFL:
1240 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1241 break;
1242
1243 case SO_WIFI_STATUS:
1244 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1245 break;
1246
1247 case SO_PEEK_OFF:
1248 if (sock->ops->set_peek_off)
1249 ret = sock->ops->set_peek_off(sk, val);
1250 else
1251 ret = -EOPNOTSUPP;
1252 break;
1253
1254 case SO_NOFCS:
1255 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1256 break;
1257
1258 case SO_SELECT_ERR_QUEUE:
1259 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1260 break;
1261
1262 #ifdef CONFIG_NET_RX_BUSY_POLL
1263 case SO_BUSY_POLL:
1264 /* allow unprivileged users to decrease the value */
1265 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1266 ret = -EPERM;
1267 else {
1268 if (val < 0)
1269 ret = -EINVAL;
1270 else
1271 WRITE_ONCE(sk->sk_ll_usec, val);
1272 }
1273 break;
1274 case SO_PREFER_BUSY_POLL:
1275 if (valbool && !capable(CAP_NET_ADMIN))
1276 ret = -EPERM;
1277 else
1278 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1279 break;
1280 case SO_BUSY_POLL_BUDGET:
1281 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1282 ret = -EPERM;
1283 } else {
1284 if (val < 0 || val > U16_MAX)
1285 ret = -EINVAL;
1286 else
1287 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1288 }
1289 break;
1290 #endif
1291
1292 case SO_MAX_PACING_RATE:
1293 {
1294 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1295
1296 if (sizeof(ulval) != sizeof(val) &&
1297 optlen >= sizeof(ulval) &&
1298 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1299 ret = -EFAULT;
1300 break;
1301 }
1302 if (ulval != ~0UL)
1303 cmpxchg(&sk->sk_pacing_status,
1304 SK_PACING_NONE,
1305 SK_PACING_NEEDED);
1306 /* Pairs with READ_ONCE() from sk_getsockopt() */
1307 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1308 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1309 break;
1310 }
1311 case SO_INCOMING_CPU:
1312 reuseport_update_incoming_cpu(sk, val);
1313 break;
1314
1315 case SO_CNX_ADVICE:
1316 if (val == 1)
1317 dst_negative_advice(sk);
1318 break;
1319
1320 case SO_ZEROCOPY:
1321 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1322 if (!((sk->sk_type == SOCK_STREAM &&
1323 sk->sk_protocol == IPPROTO_TCP) ||
1324 (sk->sk_type == SOCK_DGRAM &&
1325 sk->sk_protocol == IPPROTO_UDP)))
1326 ret = -ENOTSUPP;
1327 } else if (sk->sk_family != PF_RDS) {
1328 ret = -ENOTSUPP;
1329 }
1330 if (!ret) {
1331 if (val < 0 || val > 1)
1332 ret = -EINVAL;
1333 else
1334 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1335 }
1336 break;
1337
1338 case SO_TXTIME:
1339 if (optlen != sizeof(struct sock_txtime)) {
1340 ret = -EINVAL;
1341 break;
1342 } else if (copy_from_sockptr(&sk_txtime, optval,
1343 sizeof(struct sock_txtime))) {
1344 ret = -EFAULT;
1345 break;
1346 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1347 ret = -EINVAL;
1348 break;
1349 }
1350 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1351 * scheduler has enough safeguards.
1352 */
1353 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1354 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1355 ret = -EPERM;
1356 break;
1357 }
1358 sock_valbool_flag(sk, SOCK_TXTIME, true);
1359 sk->sk_clockid = sk_txtime.clockid;
1360 sk->sk_txtime_deadline_mode =
1361 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1362 sk->sk_txtime_report_errors =
1363 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1364 break;
1365
1366 case SO_BINDTOIFINDEX:
1367 ret = sock_bindtoindex_locked(sk, val);
1368 break;
1369
1370 case SO_BUF_LOCK:
1371 if (val & ~SOCK_BUF_LOCK_MASK) {
1372 ret = -EINVAL;
1373 break;
1374 }
1375 sk->sk_userlocks = val | (sk->sk_userlocks &
1376 ~SOCK_BUF_LOCK_MASK);
1377 break;
1378
1379 default:
1380 ret = -ENOPROTOOPT;
1381 break;
1382 }
1383 release_sock(sk);
1384 return ret;
1385 }
1386 EXPORT_SYMBOL(sock_setsockopt);
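/*
 * Userspace view (hedged example): SO_LINGER is one of the struct-valued
 * options handled above; l_linger is in seconds (converted to jiffies) and
 * l_onoff == 0 clears SOCK_LINGER. The five-second value is arbitrary.
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 */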
1387
1388 static const struct cred *sk_get_peer_cred(struct sock *sk)
1389 {
1390 const struct cred *cred;
1391
1392 spin_lock(&sk->sk_peer_lock);
1393 cred = get_cred(sk->sk_peer_cred);
1394 spin_unlock(&sk->sk_peer_lock);
1395
1396 return cred;
1397 }
1398
1399 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1400 struct ucred *ucred)
1401 {
1402 ucred->pid = pid_vnr(pid);
1403 ucred->uid = ucred->gid = -1;
1404 if (cred) {
1405 struct user_namespace *current_ns = current_user_ns();
1406
1407 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1408 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1409 }
1410 }
1411
1412 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1413 {
1414 struct user_namespace *user_ns = current_user_ns();
1415 int i;
1416
1417 for (i = 0; i < src->ngroups; i++)
1418 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1419 return -EFAULT;
1420
1421 return 0;
1422 }
1423
1424 int sock_getsockopt(struct socket *sock, int level, int optname,
1425 char __user *optval, int __user *optlen)
1426 {
1427 struct sock *sk = sock->sk;
1428
1429 union {
1430 int val;
1431 u64 val64;
1432 unsigned long ulval;
1433 struct linger ling;
1434 struct old_timeval32 tm32;
1435 struct __kernel_old_timeval tm;
1436 struct __kernel_sock_timeval stm;
1437 struct sock_txtime txtime;
1438 struct so_timestamping timestamping;
1439 } v;
1440
1441 int lv = sizeof(int);
1442 int len;
1443
1444 if (get_user(len, optlen))
1445 return -EFAULT;
1446 if (len < 0)
1447 return -EINVAL;
1448
1449 memset(&v, 0, sizeof(v));
1450
1451 switch (optname) {
1452 case SO_DEBUG:
1453 v.val = sock_flag(sk, SOCK_DBG);
1454 break;
1455
1456 case SO_DONTROUTE:
1457 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1458 break;
1459
1460 case SO_BROADCAST:
1461 v.val = sock_flag(sk, SOCK_BROADCAST);
1462 break;
1463
1464 case SO_SNDBUF:
1465 v.val = READ_ONCE(sk->sk_sndbuf);
1466 break;
1467
1468 case SO_RCVBUF:
1469 v.val = READ_ONCE(sk->sk_rcvbuf);
1470 break;
1471
1472 case SO_REUSEADDR:
1473 v.val = sk->sk_reuse;
1474 break;
1475
1476 case SO_REUSEPORT:
1477 v.val = sk->sk_reuseport;
1478 break;
1479
1480 case SO_KEEPALIVE:
1481 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1482 break;
1483
1484 case SO_TYPE:
1485 v.val = sk->sk_type;
1486 break;
1487
1488 case SO_PROTOCOL:
1489 v.val = sk->sk_protocol;
1490 break;
1491
1492 case SO_DOMAIN:
1493 v.val = sk->sk_family;
1494 break;
1495
1496 case SO_ERROR:
1497 v.val = -sock_error(sk);
1498 if (v.val == 0)
1499 v.val = xchg(&sk->sk_err_soft, 0);
1500 break;
1501
1502 case SO_OOBINLINE:
1503 v.val = sock_flag(sk, SOCK_URGINLINE);
1504 break;
1505
1506 case SO_NO_CHECK:
1507 v.val = sk->sk_no_check_tx;
1508 break;
1509
1510 case SO_PRIORITY:
1511 v.val = sk->sk_priority;
1512 break;
1513
1514 case SO_LINGER:
1515 lv = sizeof(v.ling);
1516 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1517 v.ling.l_linger = sk->sk_lingertime / HZ;
1518 break;
1519
1520 case SO_BSDCOMPAT:
1521 break;
1522
1523 case SO_TIMESTAMP_OLD:
1524 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1525 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1526 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1527 break;
1528
1529 case SO_TIMESTAMPNS_OLD:
1530 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1531 break;
1532
1533 case SO_TIMESTAMP_NEW:
1534 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1535 break;
1536
1537 case SO_TIMESTAMPNS_NEW:
1538 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1539 break;
1540
1541 case SO_TIMESTAMPING_OLD:
1542 case SO_TIMESTAMPING_NEW:
1543 lv = sizeof(v.timestamping);
1544 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1545 * returning the flags when they were set through the same option.
1546 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1547 */
1548 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1549 v.timestamping.flags = sk->sk_tsflags;
1550 v.timestamping.bind_phc = sk->sk_bind_phc;
1551 }
1552 break;
1553
1554 case SO_RCVTIMEO_OLD:
1555 case SO_RCVTIMEO_NEW:
1556 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1557 break;
1558
1559 case SO_SNDTIMEO_OLD:
1560 case SO_SNDTIMEO_NEW:
1561 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1562 break;
1563
1564 case SO_RCVLOWAT:
1565 v.val = READ_ONCE(sk->sk_rcvlowat);
1566 break;
1567
1568 case SO_SNDLOWAT:
1569 v.val = 1;
1570 break;
1571
1572 case SO_PASSCRED:
1573 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1574 break;
1575
1576 case SO_PEERCRED:
1577 {
1578 struct ucred peercred;
1579 if (len > sizeof(peercred))
1580 len = sizeof(peercred);
1581
1582 spin_lock(&sk->sk_peer_lock);
1583 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1584 spin_unlock(&sk->sk_peer_lock);
1585
1586 if (copy_to_user(optval, &peercred, len))
1587 return -EFAULT;
1588 goto lenout;
1589 }
1590
1591 case SO_PEERGROUPS:
1592 {
1593 const struct cred *cred;
1594 int ret, n;
1595
1596 cred = sk_get_peer_cred(sk);
1597 if (!cred)
1598 return -ENODATA;
1599
1600 n = cred->group_info->ngroups;
1601 if (len < n * sizeof(gid_t)) {
1602 len = n * sizeof(gid_t);
1603 put_cred(cred);
1604 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1605 }
1606 len = n * sizeof(gid_t);
1607
1608 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1609 put_cred(cred);
1610 if (ret)
1611 return ret;
1612 goto lenout;
1613 }
1614
1615 case SO_PEERNAME:
1616 {
1617 char address[128];
1618
1619 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1620 if (lv < 0)
1621 return -ENOTCONN;
1622 if (lv < len)
1623 return -EINVAL;
1624 if (copy_to_user(optval, address, len))
1625 return -EFAULT;
1626 goto lenout;
1627 }
1628
1629 /* Dubious BSD thing... Probably nobody even uses it, but
1630 * the UNIX standard wants it for whatever reason... -DaveM
1631 */
1632 case SO_ACCEPTCONN:
1633 v.val = sk->sk_state == TCP_LISTEN;
1634 break;
1635
1636 case SO_PASSSEC:
1637 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1638 break;
1639
1640 case SO_PEERSEC:
1641 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1642
1643 case SO_MARK:
1644 v.val = sk->sk_mark;
1645 break;
1646
1647 case SO_RCVMARK:
1648 v.val = sock_flag(sk, SOCK_RCVMARK);
1649 break;
1650
1651 case SO_RXQ_OVFL:
1652 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1653 break;
1654
1655 case SO_WIFI_STATUS:
1656 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1657 break;
1658
1659 case SO_PEEK_OFF:
1660 if (!sock->ops->set_peek_off)
1661 return -EOPNOTSUPP;
1662
1663 v.val = READ_ONCE(sk->sk_peek_off);
1664 break;
1665 case SO_NOFCS:
1666 v.val = sock_flag(sk, SOCK_NOFCS);
1667 break;
1668
1669 case SO_BINDTODEVICE:
1670 return sock_getbindtodevice(sk, optval, optlen, len);
1671
1672 case SO_GET_FILTER:
1673 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1674 if (len < 0)
1675 return len;
1676
1677 goto lenout;
1678
1679 case SO_LOCK_FILTER:
1680 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1681 break;
1682
1683 case SO_BPF_EXTENSIONS:
1684 v.val = bpf_tell_extensions();
1685 break;
1686
1687 case SO_SELECT_ERR_QUEUE:
1688 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1689 break;
1690
1691 #ifdef CONFIG_NET_RX_BUSY_POLL
1692 case SO_BUSY_POLL:
1693 v.val = READ_ONCE(sk->sk_ll_usec);
1694 break;
1695 case SO_PREFER_BUSY_POLL:
1696 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1697 break;
1698 #endif
1699
1700 case SO_MAX_PACING_RATE:
1701 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1702 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1703 lv = sizeof(v.ulval);
1704 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1705 } else {
1706 /* 32bit version */
1707 v.val = min_t(unsigned long, ~0U,
1708 READ_ONCE(sk->sk_max_pacing_rate));
1709 }
1710 break;
1711
1712 case SO_INCOMING_CPU:
1713 v.val = READ_ONCE(sk->sk_incoming_cpu);
1714 break;
1715
1716 case SO_MEMINFO:
1717 {
1718 u32 meminfo[SK_MEMINFO_VARS];
1719
1720 sk_get_meminfo(sk, meminfo);
1721
1722 len = min_t(unsigned int, len, sizeof(meminfo));
1723 if (copy_to_user(optval, &meminfo, len))
1724 return -EFAULT;
1725
1726 goto lenout;
1727 }
1728
1729 #ifdef CONFIG_NET_RX_BUSY_POLL
1730 case SO_INCOMING_NAPI_ID:
1731 v.val = READ_ONCE(sk->sk_napi_id);
1732
1733 /* aggregate non-NAPI IDs down to 0 */
1734 if (v.val < MIN_NAPI_ID)
1735 v.val = 0;
1736
1737 break;
1738 #endif
1739
1740 case SO_COOKIE:
1741 lv = sizeof(u64);
1742 if (len < lv)
1743 return -EINVAL;
1744 v.val64 = sock_gen_cookie(sk);
1745 break;
1746
1747 case SO_ZEROCOPY:
1748 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1749 break;
1750
1751 case SO_TXTIME:
1752 lv = sizeof(v.txtime);
1753 v.txtime.clockid = sk->sk_clockid;
1754 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1755 SOF_TXTIME_DEADLINE_MODE : 0;
1756 v.txtime.flags |= sk->sk_txtime_report_errors ?
1757 SOF_TXTIME_REPORT_ERRORS : 0;
1758 break;
1759
1760 case SO_BINDTOIFINDEX:
1761 v.val = sk->sk_bound_dev_if;
1762 break;
1763
1764 case SO_NETNS_COOKIE:
1765 lv = sizeof(u64);
1766 if (len != lv)
1767 return -EINVAL;
1768 v.val64 = sock_net(sk)->net_cookie;
1769 break;
1770
1771 case SO_BUF_LOCK:
1772 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1773 break;
1774
1775 default:
1776 /* We implement the SO_SNDLOWAT etc to not be settable
1777 * (1003.1g 7).
1778 */
1779 return -ENOPROTOOPT;
1780 }
1781
1782 if (len > lv)
1783 len = lv;
1784 if (copy_to_user(optval, &v, len))
1785 return -EFAULT;
1786 lenout:
1787 if (put_user(len, optlen))
1788 return -EFAULT;
1789 return 0;
1790 }
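/*
 * Userspace view (hedged example): SO_PEERCRED returns the credentials of the
 * peer of a connected AF_UNIX socket, translated by cred_to_ucred() above.
 * On glibc, struct ucred needs _GNU_SOURCE.
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%u gid=%u\n", peer.pid, peer.uid, peer.gid);
 */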
1791
1792 /*
1793 * Initialize an sk_lock.
1794 *
1795 * (We also register the sk_lock with the lock validator.)
1796 */
1797 static inline void sock_lock_init(struct sock *sk)
1798 {
1799 if (sk->sk_kern_sock)
1800 sock_lock_init_class_and_name(
1801 sk,
1802 af_family_kern_slock_key_strings[sk->sk_family],
1803 af_family_kern_slock_keys + sk->sk_family,
1804 af_family_kern_key_strings[sk->sk_family],
1805 af_family_kern_keys + sk->sk_family);
1806 else
1807 sock_lock_init_class_and_name(
1808 sk,
1809 af_family_slock_key_strings[sk->sk_family],
1810 af_family_slock_keys + sk->sk_family,
1811 af_family_key_strings[sk->sk_family],
1812 af_family_keys + sk->sk_family);
1813 }
1814
1815 /*
1816 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1817 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1818 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1819 */
1820 static void sock_copy(struct sock *nsk, const struct sock *osk)
1821 {
1822 const struct proto *prot = READ_ONCE(osk->sk_prot);
1823 #ifdef CONFIG_SECURITY_NETWORK
1824 void *sptr = nsk->sk_security;
1825 #endif
1826
1827 /* If we move sk_tx_queue_mapping out of the private section,
1828 * we must check if sk_tx_queue_clear() is called after
1829 * sock_copy() in sk_clone_lock().
1830 */
1831 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1832 offsetof(struct sock, sk_dontcopy_begin) ||
1833 offsetof(struct sock, sk_tx_queue_mapping) >=
1834 offsetof(struct sock, sk_dontcopy_end));
1835
1836 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1837
1838 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1839 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1840
1841 #ifdef CONFIG_SECURITY_NETWORK
1842 nsk->sk_security = sptr;
1843 security_sk_clone(osk, nsk);
1844 #endif
1845 }
1846
1847 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1848 int family)
1849 {
1850 struct sock *sk;
1851 struct kmem_cache *slab;
1852
1853 slab = prot->slab;
1854 if (slab != NULL) {
1855 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1856 if (!sk)
1857 return sk;
1858 if (want_init_on_alloc(priority))
1859 sk_prot_clear_nulls(sk, prot->obj_size);
1860 } else
1861 sk = kmalloc(prot->obj_size, priority);
1862
1863 if (sk != NULL) {
1864 if (security_sk_alloc(sk, family, priority))
1865 goto out_free;
1866
1867 trace_android_rvh_sk_alloc(sk);
1868
1869 if (!try_module_get(prot->owner))
1870 goto out_free_sec;
1871 }
1872
1873 return sk;
1874
1875 out_free_sec:
1876 security_sk_free(sk);
1877 trace_android_rvh_sk_free(sk);
1878 out_free:
1879 if (slab != NULL)
1880 kmem_cache_free(slab, sk);
1881 else
1882 kfree(sk);
1883 return NULL;
1884 }
1885
1886 static void sk_prot_free(struct proto *prot, struct sock *sk)
1887 {
1888 struct kmem_cache *slab;
1889 struct module *owner;
1890
1891 owner = prot->owner;
1892 slab = prot->slab;
1893
1894 cgroup_sk_free(&sk->sk_cgrp_data);
1895 mem_cgroup_sk_free(sk);
1896 security_sk_free(sk);
1897 trace_android_rvh_sk_free(sk);
1898 if (slab != NULL)
1899 kmem_cache_free(slab, sk);
1900 else
1901 kfree(sk);
1902 module_put(owner);
1903 }
1904
1905 /**
1906 * sk_alloc - All socket objects are allocated here
1907 * @net: the applicable net namespace
1908 * @family: protocol family
1909 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1910 * @prot: struct proto associated with this new sock instance
1911 * @kern: is this to be a kernel socket?
1912 */
1913 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1914 struct proto *prot, int kern)
1915 {
1916 struct sock *sk;
1917
1918 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1919 if (sk) {
1920 sk->sk_family = family;
1921 /*
1922 * See comment in struct sock definition to understand
1923 * why we need sk_prot_creator -acme
1924 */
1925 sk->sk_prot = sk->sk_prot_creator = prot;
1926 sk->sk_kern_sock = kern;
1927 sock_lock_init(sk);
1928 sk->sk_net_refcnt = kern ? 0 : 1;
1929 if (likely(sk->sk_net_refcnt)) {
1930 get_net(net);
1931 sock_inuse_add(net, 1);
1932 }
1933
1934 sock_net_set(sk, net);
1935 refcount_set(&sk->sk_wmem_alloc, 1);
1936
1937 mem_cgroup_sk_alloc(sk);
1938 cgroup_sk_alloc(&sk->sk_cgrp_data);
1939 sock_update_classid(&sk->sk_cgrp_data);
1940 sock_update_netprioidx(&sk->sk_cgrp_data);
1941 sk_tx_queue_clear(sk);
1942 }
1943
1944 return sk;
1945 }
1946 EXPORT_SYMBOL(sk_alloc);
1947
1948 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1949 * grace period. This is the case for UDP sockets and TCP listeners.
1950 */
1951 static void __sk_destruct(struct rcu_head *head)
1952 {
1953 struct sock *sk = container_of(head, struct sock, sk_rcu);
1954 struct sk_filter *filter;
1955
1956 if (sk->sk_destruct)
1957 sk->sk_destruct(sk);
1958
1959 filter = rcu_dereference_check(sk->sk_filter,
1960 refcount_read(&sk->sk_wmem_alloc) == 0);
1961 if (filter) {
1962 sk_filter_uncharge(sk, filter);
1963 RCU_INIT_POINTER(sk->sk_filter, NULL);
1964 }
1965
1966 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1967
1968 #ifdef CONFIG_BPF_SYSCALL
1969 bpf_sk_storage_free(sk);
1970 #endif
1971
1972 if (atomic_read(&sk->sk_omem_alloc))
1973 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1974 __func__, atomic_read(&sk->sk_omem_alloc));
1975
1976 if (sk->sk_frag.page) {
1977 put_page(sk->sk_frag.page);
1978 sk->sk_frag.page = NULL;
1979 }
1980
1981 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1982 put_cred(sk->sk_peer_cred);
1983 put_pid(sk->sk_peer_pid);
1984
1985 if (likely(sk->sk_net_refcnt))
1986 put_net(sock_net(sk));
1987 sk_prot_free(sk->sk_prot_creator, sk);
1988 }
1989
1990 void sk_destruct(struct sock *sk)
1991 {
1992 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1993
1994 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1995 reuseport_detach_sock(sk);
1996 use_call_rcu = true;
1997 }
1998
1999 if (use_call_rcu)
2000 call_rcu(&sk->sk_rcu, __sk_destruct);
2001 else
2002 __sk_destruct(&sk->sk_rcu);
2003 }
2004
2005 static void __sk_free(struct sock *sk)
2006 {
2007 if (likely(sk->sk_net_refcnt))
2008 sock_inuse_add(sock_net(sk), -1);
2009
2010 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2011 sock_diag_broadcast_destroy(sk);
2012 else
2013 sk_destruct(sk);
2014 }
2015
2016 void sk_free(struct sock *sk)
2017 {
2018 /*
2019 * We subtract one from sk_wmem_alloc so we can tell whether
2020 * some packets are still sitting in a tx queue.
2021 * If the count is not yet zero, sock_wfree() will call __sk_free(sk) later.
2022 */
2023 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2024 __sk_free(sk);
2025 }
2026 EXPORT_SYMBOL(sk_free);
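/* Note: sk_alloc() seeds sk_wmem_alloc with 1, so the refcount_dec_and_test()
 * above only reaches zero once every transmitted skb charged against this
 * socket has been freed (see sock_wfree() below).
 */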
2027
2028 static void sk_init_common(struct sock *sk)
2029 {
2030 skb_queue_head_init(&sk->sk_receive_queue);
2031 skb_queue_head_init(&sk->sk_write_queue);
2032 skb_queue_head_init(&sk->sk_error_queue);
2033
2034 rwlock_init(&sk->sk_callback_lock);
2035 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2036 af_rlock_keys + sk->sk_family,
2037 af_family_rlock_key_strings[sk->sk_family]);
2038 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2039 af_wlock_keys + sk->sk_family,
2040 af_family_wlock_key_strings[sk->sk_family]);
2041 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2042 af_elock_keys + sk->sk_family,
2043 af_family_elock_key_strings[sk->sk_family]);
2044 lockdep_set_class_and_name(&sk->sk_callback_lock,
2045 af_callback_keys + sk->sk_family,
2046 af_family_clock_key_strings[sk->sk_family]);
2047 }
2048
2049 /**
2050 * sk_clone_lock - clone a socket, and lock its clone
2051 * @sk: the socket to clone
2052 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2053 *
2054 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2055 */
2056 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2057 {
2058 struct proto *prot = READ_ONCE(sk->sk_prot);
2059 struct sk_filter *filter;
2060 bool is_charged = true;
2061 struct sock *newsk;
2062
2063 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2064 if (!newsk)
2065 goto out;
2066
2067 sock_copy(newsk, sk);
2068
2069 newsk->sk_prot_creator = prot;
2070
2071 /* SANITY */
2072 if (likely(newsk->sk_net_refcnt)) {
2073 get_net(sock_net(newsk));
2074 sock_inuse_add(sock_net(newsk), 1);
2075 }
2076 sk_node_init(&newsk->sk_node);
2077 sock_lock_init(newsk);
2078 bh_lock_sock(newsk);
2079 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2080 newsk->sk_backlog.len = 0;
2081
2082 atomic_set(&newsk->sk_rmem_alloc, 0);
2083
2084 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2085 refcount_set(&newsk->sk_wmem_alloc, 1);
2086
2087 atomic_set(&newsk->sk_omem_alloc, 0);
2088 sk_init_common(newsk);
2089
2090 newsk->sk_dst_cache = NULL;
2091 newsk->sk_dst_pending_confirm = 0;
2092 newsk->sk_wmem_queued = 0;
2093 newsk->sk_forward_alloc = 0;
2094 atomic_set(&newsk->sk_drops, 0);
2095 newsk->sk_send_head = NULL;
2096 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2097 atomic_set(&newsk->sk_zckey, 0);
2098
2099 sock_reset_flag(newsk, SOCK_DONE);
2100
2101 /* sk->sk_memcg will be populated at accept() time */
2102 newsk->sk_memcg = NULL;
2103
2104 cgroup_sk_clone(&newsk->sk_cgrp_data);
2105
2106 rcu_read_lock();
2107 filter = rcu_dereference(sk->sk_filter);
2108 if (filter != NULL)
2109 /* though it's an empty new sock, the charging may fail
2110 * if sysctl_optmem_max was changed between creation of
2111 * original socket and cloning
2112 */
2113 is_charged = sk_filter_charge(newsk, filter);
2114 RCU_INIT_POINTER(newsk->sk_filter, filter);
2115 rcu_read_unlock();
2116
2117 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2118 /* We need to make sure that we don't uncharge the new
2119 * socket if we couldn't charge it in the first place
2120 * as otherwise we uncharge the parent's filter.
2121 */
2122 if (!is_charged)
2123 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2124 sk_free_unlock_clone(newsk);
2125 newsk = NULL;
2126 goto out;
2127 }
2128 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2129
2130 if (bpf_sk_storage_clone(sk, newsk)) {
2131 sk_free_unlock_clone(newsk);
2132 newsk = NULL;
2133 goto out;
2134 }
2135
2136 /* Clear sk_user_data if parent had the pointer tagged
2137 * as not suitable for copying when cloning.
2138 */
2139 if (sk_user_data_is_nocopy(newsk))
2140 newsk->sk_user_data = NULL;
2141
2142 newsk->sk_err = 0;
2143 newsk->sk_err_soft = 0;
2144 newsk->sk_priority = 0;
2145 newsk->sk_incoming_cpu = raw_smp_processor_id();
2146
2147 /* Before updating sk_refcnt, we must commit prior changes to memory
2148 * (Documentation/RCU/rculist_nulls.rst for details)
2149 */
2150 smp_wmb();
2151 refcount_set(&newsk->sk_refcnt, 2);
2152
2153 /* Increment the counter in the same struct proto as the master
2154 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2155 * is the same as sk->sk_prot->socks, as this field was copied
2156 * with memcpy).
2157 *
2158 * This _changes_ the previous behaviour, where
2159 * tcp_create_openreq_child always was incrementing the
2160 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
2161 * to be taken into account in all callers. -acme
2162 */
2163 sk_refcnt_debug_inc(newsk);
2164 sk_set_socket(newsk, NULL);
2165 sk_tx_queue_clear(newsk);
2166 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2167
2168 if (newsk->sk_prot->sockets_allocated)
2169 sk_sockets_allocated_inc(newsk);
2170
2171 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2172 net_enable_timestamp();
2173 out:
2174 return newsk;
2175 }
2176 EXPORT_SYMBOL_GPL(sk_clone_lock);
2177
2178 void sk_free_unlock_clone(struct sock *sk)
2179 {
2180 /* It is still a raw copy of the parent, so invalidate
2181 * the destructor and do a plain sk_free() */
2182 sk->sk_destruct = NULL;
2183 bh_unlock_sock(sk);
2184 sk_free(sk);
2185 }
2186 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2187
2188 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2189 {
2190 u32 max_segs = 1;
2191
2192 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2193 if (sk->sk_route_caps & NETIF_F_GSO)
2194 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2195 sk->sk_route_caps &= ~sk->sk_route_nocaps;
2196 if (sk_can_gso(sk)) {
2197 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2198 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2199 } else {
2200 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2201 sk->sk_gso_max_size = dst->dev->gso_max_size;
2202 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2203 }
2204 }
2205 sk->sk_gso_max_segs = max_segs;
2206 sk_dst_set(sk, dst);
2207 }
2208 EXPORT_SYMBOL_GPL(sk_setup_caps);
2209
2210 /*
2211 * Simple resource managers for sockets.
2212 */
2213
2214
2215 /*
2216 * Write buffer destructor automatically called from kfree_skb.
2217 */
2218 void sock_wfree(struct sk_buff *skb)
2219 {
2220 struct sock *sk = skb->sk;
2221 unsigned int len = skb->truesize;
2222
2223 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2224 /*
2225 * Keep a reference on sk_wmem_alloc, this will be released
2226 * after sk_write_space() call
2227 */
2228 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2229 sk->sk_write_space(sk);
2230 len = 1;
2231 }
2232 /*
2233 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2234 * could not do because of in-flight packets
2235 */
2236 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2237 __sk_free(sk);
2238 }
2239 EXPORT_SYMBOL(sock_wfree);
2240
2241 /* This variant of sock_wfree() is used by TCP,
2242 * since it sets SOCK_USE_WRITE_QUEUE.
2243 */
2244 void __sock_wfree(struct sk_buff *skb)
2245 {
2246 struct sock *sk = skb->sk;
2247
2248 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2249 __sk_free(sk);
2250 }
2251
2252 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2253 {
2254 skb_orphan(skb);
2255 skb->sk = sk;
2256 #ifdef CONFIG_INET
2257 if (unlikely(!sk_fullsock(sk))) {
2258 skb->destructor = sock_edemux;
2259 sock_hold(sk);
2260 return;
2261 }
2262 #endif
2263 skb->destructor = sock_wfree;
2264 skb_set_hash_from_sk(skb, sk);
2265 /*
2266 * We used to take a refcount on sk, but the following operation
2267 * is enough to guarantee sk_free() won't free this sock until
2268 * all in-flight packets have completed.
2269 */
2270 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2271 }
2272 EXPORT_SYMBOL(skb_set_owner_w);
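/* Note: the truesize charged to sk_wmem_alloc here is what sock_wfree()
 * releases again; sock_wmalloc() below is a convenience wrapper that
 * allocates an skb and assigns write ownership in one step.
 */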
2273
2274 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2275 {
2276 #ifdef CONFIG_TLS_DEVICE
2277 /* Drivers depend on in-order delivery for crypto offload,
2278 * partial orphan breaks out-of-order-OK logic.
2279 */
2280 if (skb->decrypted)
2281 return false;
2282 #endif
2283 return (skb->destructor == sock_wfree ||
2284 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2285 }
2286
2287 /* This helper is used by netem, as it can hold packets in its
2288 * delay queue. We want to allow the owner socket to send more
2289 * packets, as if they were already TX completed by a typical driver.
2290 * But we also want to keep skb->sk set because some packet schedulers
2291 * rely on it (sch_fq for example).
2292 */
2293 void skb_orphan_partial(struct sk_buff *skb)
2294 {
2295 if (skb_is_tcp_pure_ack(skb))
2296 return;
2297
2298 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2299 return;
2300
2301 skb_orphan(skb);
2302 }
2303 EXPORT_SYMBOL(skb_orphan_partial);
2304
2305 /*
2306 * Read buffer destructor automatically called from kfree_skb.
2307 */
2308 void sock_rfree(struct sk_buff *skb)
2309 {
2310 struct sock *sk = skb->sk;
2311 unsigned int len = skb->truesize;
2312
2313 atomic_sub(len, &sk->sk_rmem_alloc);
2314 sk_mem_uncharge(sk, len);
2315 }
2316 EXPORT_SYMBOL(sock_rfree);
2317
2318 /*
2319 * Buffer destructor for skbs that are not used directly in read or write
2320 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2321 */
2322 void sock_efree(struct sk_buff *skb)
2323 {
2324 sock_put(skb->sk);
2325 }
2326 EXPORT_SYMBOL(sock_efree);
2327
2328 /* Buffer destructor for prefetch/receive path where reference count may
2329 * not be held, e.g. for listen sockets.
2330 */
2331 #ifdef CONFIG_INET
2332 void sock_pfree(struct sk_buff *skb)
2333 {
2334 if (sk_is_refcounted(skb->sk))
2335 sock_gen_put(skb->sk);
2336 }
2337 EXPORT_SYMBOL(sock_pfree);
2338 #endif /* CONFIG_INET */
2339
2340 kuid_t sock_i_uid(struct sock *sk)
2341 {
2342 kuid_t uid;
2343
2344 read_lock_bh(&sk->sk_callback_lock);
2345 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2346 read_unlock_bh(&sk->sk_callback_lock);
2347 return uid;
2348 }
2349 EXPORT_SYMBOL(sock_i_uid);
2350
2351 unsigned long __sock_i_ino(struct sock *sk)
2352 {
2353 unsigned long ino;
2354
2355 read_lock(&sk->sk_callback_lock);
2356 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2357 read_unlock(&sk->sk_callback_lock);
2358 return ino;
2359 }
2360 EXPORT_SYMBOL(__sock_i_ino);
2361
2362 unsigned long sock_i_ino(struct sock *sk)
2363 {
2364 unsigned long ino;
2365
2366 local_bh_disable();
2367 ino = __sock_i_ino(sk);
2368 local_bh_enable();
2369 return ino;
2370 }
2371 EXPORT_SYMBOL(sock_i_ino);
2372
2373 /*
2374 * Allocate a skb from the socket's send buffer.
2375 */
2376 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2377 gfp_t priority)
2378 {
2379 if (force ||
2380 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2381 struct sk_buff *skb = alloc_skb(size, priority);
2382
2383 if (skb) {
2384 skb_set_owner_w(skb, sk);
2385 return skb;
2386 }
2387 }
2388 return NULL;
2389 }
2390 EXPORT_SYMBOL(sock_wmalloc);
2391
2392 static void sock_ofree(struct sk_buff *skb)
2393 {
2394 struct sock *sk = skb->sk;
2395
2396 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2397 }
2398
2399 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2400 gfp_t priority)
2401 {
2402 struct sk_buff *skb;
2403
2404 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2405 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2406 READ_ONCE(sysctl_optmem_max))
2407 return NULL;
2408
2409 skb = alloc_skb(size, priority);
2410 if (!skb)
2411 return NULL;
2412
2413 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2414 skb->sk = sk;
2415 skb->destructor = sock_ofree;
2416 return skb;
2417 }
2418
2419 /*
2420 * Allocate a memory block from the socket's option memory buffer.
2421 */
2422 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2423 {
2424 int optmem_max = READ_ONCE(sysctl_optmem_max);
2425
2426 if ((unsigned int)size <= optmem_max &&
2427 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2428 void *mem;
2429 /* First do the add, to avoid the race if kmalloc
2430 * might sleep.
2431 */
2432 atomic_add(size, &sk->sk_omem_alloc);
2433 mem = kmalloc(size, priority);
2434 if (mem)
2435 return mem;
2436 atomic_sub(size, &sk->sk_omem_alloc);
2437 }
2438 return NULL;
2439 }
2440 EXPORT_SYMBOL(sock_kmalloc);
2441
2442 /* Free an option memory block. Note, we actually want the inline
2443 * here as this allows gcc to detect the nullify and fold away the
2444 * condition entirely.
2445 */
2446 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2447 const bool nullify)
2448 {
2449 if (WARN_ON_ONCE(!mem))
2450 return;
2451 if (nullify)
2452 kfree_sensitive(mem);
2453 else
2454 kfree(mem);
2455 atomic_sub(size, &sk->sk_omem_alloc);
2456 }
2457
2458 void sock_kfree_s(struct sock *sk, void *mem, int size)
2459 {
2460 __sock_kfree_s(sk, mem, size, false);
2461 }
2462 EXPORT_SYMBOL(sock_kfree_s);
2463
2464 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2465 {
2466 __sock_kfree_s(sk, mem, size, true);
2467 }
2468 EXPORT_SYMBOL(sock_kzfree_s);
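/* Illustrative usage (not part of this file): option memory is charged
 * against sk_omem_alloc, so allocation and free must be paired with the
 * same size, e.g.:
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);	// or sock_kzfree_s() for key material
 */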
2469
2470 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2471 I think these locks should be removed for datagram sockets.
2472 */
2473 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2474 {
2475 DEFINE_WAIT(wait);
2476
2477 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2478 for (;;) {
2479 if (!timeo)
2480 break;
2481 if (signal_pending(current))
2482 break;
2483 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2484 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2485 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2486 break;
2487 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2488 break;
2489 if (READ_ONCE(sk->sk_err))
2490 break;
2491 timeo = schedule_timeout(timeo);
2492 }
2493 finish_wait(sk_sleep(sk), &wait);
2494 return timeo;
2495 }
2496
2497
2498 /*
2499 * Generic send/receive buffer handlers
2500 */
2501
2502 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2503 unsigned long data_len, int noblock,
2504 int *errcode, int max_page_order)
2505 {
2506 struct sk_buff *skb;
2507 long timeo;
2508 int err;
2509
2510 timeo = sock_sndtimeo(sk, noblock);
2511 for (;;) {
2512 err = sock_error(sk);
2513 if (err != 0)
2514 goto failure;
2515
2516 err = -EPIPE;
2517 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2518 goto failure;
2519
2520 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2521 break;
2522
2523 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2524 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2525 err = -EAGAIN;
2526 if (!timeo)
2527 goto failure;
2528 if (signal_pending(current))
2529 goto interrupted;
2530 timeo = sock_wait_for_wmem(sk, timeo);
2531 }
2532 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2533 errcode, sk->sk_allocation);
2534 if (skb)
2535 skb_set_owner_w(skb, sk);
2536 return skb;
2537
2538 interrupted:
2539 err = sock_intr_errno(timeo);
2540 failure:
2541 *errcode = err;
2542 return NULL;
2543 }
2544 EXPORT_SYMBOL(sock_alloc_send_pskb);
2545
2546 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2547 int noblock, int *errcode)
2548 {
2549 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2550 }
2551 EXPORT_SYMBOL(sock_alloc_send_skb);
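/* Illustrative usage (not part of this file): a datagram sendmsg()
 * implementation can let this helper honour the send timeout and
 * MSG_DONTWAIT for it:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out_err;	// err already holds a negative errno
 */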
2552
2553 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2554 struct sockcm_cookie *sockc)
2555 {
2556 u32 tsflags;
2557
2558 switch (cmsg->cmsg_type) {
2559 case SO_MARK:
2560 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2561 return -EPERM;
2562 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2563 return -EINVAL;
2564 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2565 break;
2566 case SO_TIMESTAMPING_OLD:
2567 case SO_TIMESTAMPING_NEW:
2568 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2569 return -EINVAL;
2570
2571 tsflags = *(u32 *)CMSG_DATA(cmsg);
2572 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2573 return -EINVAL;
2574
2575 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2576 sockc->tsflags |= tsflags;
2577 break;
2578 case SCM_TXTIME:
2579 if (!sock_flag(sk, SOCK_TXTIME))
2580 return -EINVAL;
2581 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2582 return -EINVAL;
2583 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2584 break;
2585 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2586 case SCM_RIGHTS:
2587 case SCM_CREDENTIALS:
2588 break;
2589 default:
2590 return -EINVAL;
2591 }
2592 return 0;
2593 }
2594 EXPORT_SYMBOL(__sock_cmsg_send);
2595
2596 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2597 struct sockcm_cookie *sockc)
2598 {
2599 struct cmsghdr *cmsg;
2600 int ret;
2601
2602 for_each_cmsghdr(cmsg, msg) {
2603 if (!CMSG_OK(msg, cmsg))
2604 return -EINVAL;
2605 if (cmsg->cmsg_level != SOL_SOCKET)
2606 continue;
2607 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2608 if (ret)
2609 return ret;
2610 }
2611 return 0;
2612 }
2613 EXPORT_SYMBOL(sock_cmsg_send);
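/* Illustrative usage (not part of this file, assuming the sockcm_init()
 * helper used by several protocols elsewhere in the tree): a sendmsg()
 * path seeds the cookie from socket defaults, then lets control messages
 * override it:
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (err)
 *			return err;
 *	}
 */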
2614
2615 static void sk_enter_memory_pressure(struct sock *sk)
2616 {
2617 if (!sk->sk_prot->enter_memory_pressure)
2618 return;
2619
2620 sk->sk_prot->enter_memory_pressure(sk);
2621 }
2622
2623 static void sk_leave_memory_pressure(struct sock *sk)
2624 {
2625 if (sk->sk_prot->leave_memory_pressure) {
2626 sk->sk_prot->leave_memory_pressure(sk);
2627 } else {
2628 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2629
2630 if (memory_pressure && READ_ONCE(*memory_pressure))
2631 WRITE_ONCE(*memory_pressure, 0);
2632 }
2633 }
2634
2635 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2636
2637 /**
2638 * skb_page_frag_refill - check that a page_frag contains enough room
2639 * @sz: minimum size of the fragment we want to get
2640 * @pfrag: pointer to page_frag
2641 * @gfp: priority for memory allocation
2642 *
2643 * Note: While this allocator tries to use high order pages, there is
2644 * no guarantee that allocations succeed. Therefore, @sz MUST be
2645 * less than or equal to PAGE_SIZE.
2646 */
2647 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2648 {
2649 if (pfrag->page) {
2650 if (page_ref_count(pfrag->page) == 1) {
2651 pfrag->offset = 0;
2652 return true;
2653 }
2654 if (pfrag->offset + sz <= pfrag->size)
2655 return true;
2656 put_page(pfrag->page);
2657 }
2658
2659 pfrag->offset = 0;
2660 if (SKB_FRAG_PAGE_ORDER &&
2661 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2662 /* Avoid direct reclaim but allow kswapd to wake */
2663 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2664 __GFP_COMP | __GFP_NOWARN |
2665 __GFP_NORETRY,
2666 SKB_FRAG_PAGE_ORDER);
2667 if (likely(pfrag->page)) {
2668 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2669 return true;
2670 }
2671 }
2672 pfrag->page = alloc_page(gfp);
2673 if (likely(pfrag->page)) {
2674 pfrag->size = PAGE_SIZE;
2675 return true;
2676 }
2677 return false;
2678 }
2679 EXPORT_SYMBOL(skb_page_frag_refill);
2680
2681 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2682 {
2683 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2684 return true;
2685
2686 sk_enter_memory_pressure(sk);
2687 sk_stream_moderate_sndbuf(sk);
2688 return false;
2689 }
2690 EXPORT_SYMBOL(sk_page_frag_refill);
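/* Illustrative usage (not part of this file): senders that copy user data
 * into per-socket page fragments follow the refill/copy/advance pattern:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	// ... copy into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */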
2691
2692 void __lock_sock(struct sock *sk)
2693 __releases(&sk->sk_lock.slock)
2694 __acquires(&sk->sk_lock.slock)
2695 {
2696 DEFINE_WAIT(wait);
2697
2698 for (;;) {
2699 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2700 TASK_UNINTERRUPTIBLE);
2701 spin_unlock_bh(&sk->sk_lock.slock);
2702 schedule();
2703 spin_lock_bh(&sk->sk_lock.slock);
2704 if (!sock_owned_by_user(sk))
2705 break;
2706 }
2707 finish_wait(&sk->sk_lock.wq, &wait);
2708 }
2709
2710 void __release_sock(struct sock *sk)
2711 __releases(&sk->sk_lock.slock)
2712 __acquires(&sk->sk_lock.slock)
2713 {
2714 struct sk_buff *skb, *next;
2715
2716 while ((skb = sk->sk_backlog.head) != NULL) {
2717 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2718
2719 spin_unlock_bh(&sk->sk_lock.slock);
2720
2721 do {
2722 next = skb->next;
2723 prefetch(next);
2724 WARN_ON_ONCE(skb_dst_is_noref(skb));
2725 skb_mark_not_on_list(skb);
2726 sk_backlog_rcv(sk, skb);
2727
2728 cond_resched();
2729
2730 skb = next;
2731 } while (skb != NULL);
2732
2733 spin_lock_bh(&sk->sk_lock.slock);
2734 }
2735
2736 /*
2737 * Doing the zeroing here guarantees we cannot loop forever
2738 * while a wild producer attempts to flood us.
2739 */
2740 sk->sk_backlog.len = 0;
2741 }
2742
2743 void __sk_flush_backlog(struct sock *sk)
2744 {
2745 spin_lock_bh(&sk->sk_lock.slock);
2746 __release_sock(sk);
2747 spin_unlock_bh(&sk->sk_lock.slock);
2748 }
2749
2750 /**
2751 * sk_wait_data - wait for data to arrive at sk_receive_queue
2752 * @sk: sock to wait on
2753 * @timeo: for how long
2754 * @skb: last skb seen on sk_receive_queue
2755 *
2756 * Now socket state including sk->sk_err is changed only under lock,
2757 * hence we may omit checks after joining wait queue.
2758 * We check the receive queue before schedule() only as an optimization;
2759 * it is very likely that release_sock() added new data.
2760 */
2761 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2762 {
2763 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2764 int rc;
2765
2766 add_wait_queue(sk_sleep(sk), &wait);
2767 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2768 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2769 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2770 remove_wait_queue(sk_sleep(sk), &wait);
2771 return rc;
2772 }
2773 EXPORT_SYMBOL(sk_wait_data);
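/* Illustrative usage (not part of this file): a blocking recvmsg()
 * typically loops on the receive queue with the socket lock held; the
 * wait helper drops and retakes the lock around schedule_timeout():
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */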
2774
2775 /**
2776 * __sk_mem_raise_allocated - increase memory_allocated
2777 * @sk: socket
2778 * @size: memory size to allocate
2779 * @amt: pages to allocate
2780 * @kind: allocation type
2781 *
2782 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2783 */
2784 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2785 {
2786 struct proto *prot = sk->sk_prot;
2787 long allocated = sk_memory_allocated_add(sk, amt);
2788 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2789 bool charged = true;
2790
2791 if (memcg_charge &&
2792 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2793 gfp_memcg_charge())))
2794 goto suppress_allocation;
2795
2796 /* Under limit. */
2797 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2798 sk_leave_memory_pressure(sk);
2799 return 1;
2800 }
2801
2802 /* Under pressure. */
2803 if (allocated > sk_prot_mem_limits(sk, 1))
2804 sk_enter_memory_pressure(sk);
2805
2806 /* Over hard limit. */
2807 if (allocated > sk_prot_mem_limits(sk, 2))
2808 goto suppress_allocation;
2809
2810 /* guarantee minimum buffer size under pressure */
2811 if (kind == SK_MEM_RECV) {
2812 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2813 return 1;
2814
2815 } else { /* SK_MEM_SEND */
2816 int wmem0 = sk_get_wmem0(sk, prot);
2817
2818 if (sk->sk_type == SOCK_STREAM) {
2819 if (sk->sk_wmem_queued < wmem0)
2820 return 1;
2821 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2822 return 1;
2823 }
2824 }
2825
2826 if (sk_has_memory_pressure(sk)) {
2827 u64 alloc;
2828
2829 if (!sk_under_memory_pressure(sk))
2830 return 1;
2831 alloc = sk_sockets_allocated_read_positive(sk);
2832 if (sk_prot_mem_limits(sk, 2) > alloc *
2833 sk_mem_pages(sk->sk_wmem_queued +
2834 atomic_read(&sk->sk_rmem_alloc) +
2835 sk->sk_forward_alloc))
2836 return 1;
2837 }
2838
2839 suppress_allocation:
2840
2841 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2842 sk_stream_moderate_sndbuf(sk);
2843
2844 /* Fail only if socket is _under_ its sndbuf.
2845 * In this case we cannot block, so that we have to fail.
2846 */
2847 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2848 /* Force charge with __GFP_NOFAIL */
2849 if (memcg_charge && !charged) {
2850 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2851 gfp_memcg_charge() | __GFP_NOFAIL);
2852 }
2853 return 1;
2854 }
2855 }
2856
2857 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2858 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2859
2860 sk_memory_allocated_sub(sk, amt);
2861
2862 if (memcg_charge && charged)
2863 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2864
2865 return 0;
2866 }
2867 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2868
2869 /**
2870 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2871 * @sk: socket
2872 * @size: memory size to allocate
2873 * @kind: allocation type
2874 *
2875 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2876 * rmem allocation. This function assumes that protocols which have
2877 * memory_pressure use sk_wmem_queued as write buffer accounting.
2878 */
2879 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2880 {
2881 int ret, amt = sk_mem_pages(size);
2882
2883 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2884 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2885 if (!ret)
2886 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2887 return ret;
2888 }
2889 EXPORT_SYMBOL(__sk_mem_schedule);
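/* Note: accounting here is done in whole SK_MEM_QUANTUM units (one page).
 * For example, with 4 KiB pages, scheduling size = 3000 bytes charges
 * sk_mem_pages(3000) = 1 quantum, i.e. sk_forward_alloc grows by a full
 * SK_MEM_QUANTUM even though only 3000 bytes are consumed right away;
 * the remainder stays available as forward allocation for later charges.
 */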
2890
2891 /**
2892 * __sk_mem_reduce_allocated - reclaim memory_allocated
2893 * @sk: socket
2894 * @amount: number of quanta
2895 *
2896 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2897 */
2898 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2899 {
2900 sk_memory_allocated_sub(sk, amount);
2901
2902 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2903 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2904
2905 if (sk_under_global_memory_pressure(sk) &&
2906 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2907 sk_leave_memory_pressure(sk);
2908 }
2909 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2910
2911 /**
2912 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2913 * @sk: socket
2914 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2915 */
2916 void __sk_mem_reclaim(struct sock *sk, int amount)
2917 {
2918 amount >>= SK_MEM_QUANTUM_SHIFT;
2919 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2920 __sk_mem_reduce_allocated(sk, amount);
2921 }
2922 EXPORT_SYMBOL(__sk_mem_reclaim);
2923
2924 int sk_set_peek_off(struct sock *sk, int val)
2925 {
2926 WRITE_ONCE(sk->sk_peek_off, val);
2927 return 0;
2928 }
2929 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2930
2931 /*
2932 * Set of default routines for initialising struct proto_ops when
2933 * the protocol does not support a particular function. In certain
2934 * cases where it makes no sense for a protocol to have a "do nothing"
2935 * function, some default processing is provided.
2936 */
2937
2938 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2939 {
2940 return -EOPNOTSUPP;
2941 }
2942 EXPORT_SYMBOL(sock_no_bind);
2943
2944 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2945 int len, int flags)
2946 {
2947 return -EOPNOTSUPP;
2948 }
2949 EXPORT_SYMBOL(sock_no_connect);
2950
2951 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2952 {
2953 return -EOPNOTSUPP;
2954 }
2955 EXPORT_SYMBOL(sock_no_socketpair);
2956
2957 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2958 bool kern)
2959 {
2960 return -EOPNOTSUPP;
2961 }
2962 EXPORT_SYMBOL(sock_no_accept);
2963
2964 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2965 int peer)
2966 {
2967 return -EOPNOTSUPP;
2968 }
2969 EXPORT_SYMBOL(sock_no_getname);
2970
2971 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2972 {
2973 return -EOPNOTSUPP;
2974 }
2975 EXPORT_SYMBOL(sock_no_ioctl);
2976
2977 int sock_no_listen(struct socket *sock, int backlog)
2978 {
2979 return -EOPNOTSUPP;
2980 }
2981 EXPORT_SYMBOL(sock_no_listen);
2982
2983 int sock_no_shutdown(struct socket *sock, int how)
2984 {
2985 return -EOPNOTSUPP;
2986 }
2987 EXPORT_SYMBOL(sock_no_shutdown);
2988
2989 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2990 {
2991 return -EOPNOTSUPP;
2992 }
2993 EXPORT_SYMBOL(sock_no_sendmsg);
2994
2995 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2996 {
2997 return -EOPNOTSUPP;
2998 }
2999 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3000
3001 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3002 int flags)
3003 {
3004 return -EOPNOTSUPP;
3005 }
3006 EXPORT_SYMBOL(sock_no_recvmsg);
3007
3008 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3009 {
3010 /* Mirror missing mmap method error code */
3011 return -ENODEV;
3012 }
3013 EXPORT_SYMBOL(sock_no_mmap);
3014
3015 /*
3016 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3017 * various sock-based usage counts.
3018 */
3019 void __receive_sock(struct file *file)
3020 {
3021 struct socket *sock;
3022
3023 sock = sock_from_file(file);
3024 if (sock) {
3025 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3026 sock_update_classid(&sock->sk->sk_cgrp_data);
3027 }
3028 }
3029
3030 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3031 {
3032 ssize_t res;
3033 struct msghdr msg = {.msg_flags = flags};
3034 struct kvec iov;
3035 char *kaddr = kmap(page);
3036 iov.iov_base = kaddr + offset;
3037 iov.iov_len = size;
3038 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3039 kunmap(page);
3040 return res;
3041 }
3042 EXPORT_SYMBOL(sock_no_sendpage);
3043
3044 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3045 int offset, size_t size, int flags)
3046 {
3047 ssize_t res;
3048 struct msghdr msg = {.msg_flags = flags};
3049 struct kvec iov;
3050 char *kaddr = kmap(page);
3051
3052 iov.iov_base = kaddr + offset;
3053 iov.iov_len = size;
3054 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3055 kunmap(page);
3056 return res;
3057 }
3058 EXPORT_SYMBOL(sock_no_sendpage_locked);
3059
3060 /*
3061 * Default Socket Callbacks
3062 */
3063
3064 static void sock_def_wakeup(struct sock *sk)
3065 {
3066 struct socket_wq *wq;
3067
3068 rcu_read_lock();
3069 wq = rcu_dereference(sk->sk_wq);
3070 if (skwq_has_sleeper(wq))
3071 wake_up_interruptible_all(&wq->wait);
3072 rcu_read_unlock();
3073 }
3074
3075 static void sock_def_error_report(struct sock *sk)
3076 {
3077 struct socket_wq *wq;
3078
3079 rcu_read_lock();
3080 wq = rcu_dereference(sk->sk_wq);
3081 if (skwq_has_sleeper(wq))
3082 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3083 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3084 rcu_read_unlock();
3085 }
3086
3087 void sock_def_readable(struct sock *sk)
3088 {
3089 struct socket_wq *wq;
3090
3091 rcu_read_lock();
3092 wq = rcu_dereference(sk->sk_wq);
3093
3094 if (skwq_has_sleeper(wq)) {
3095 int done = 0;
3096
3097 trace_android_vh_do_wake_up_sync(&wq->wait, &done);
3098 if (done)
3099 goto out;
3100
3101 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3102 EPOLLRDNORM | EPOLLRDBAND);
3103 }
3104
3105 out:
3106 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3107 rcu_read_unlock();
3108 }
3109
3110 static void sock_def_write_space(struct sock *sk)
3111 {
3112 struct socket_wq *wq;
3113
3114 rcu_read_lock();
3115
3116 /* Do not wake up a writer until he can make "significant"
3117 * progress. --DaveM
3118 */
3119 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3120 wq = rcu_dereference(sk->sk_wq);
3121 if (skwq_has_sleeper(wq))
3122 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3123 EPOLLWRNORM | EPOLLWRBAND);
3124
3125 /* Should agree with poll, otherwise some programs break */
3126 if (sock_writeable(sk))
3127 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3128 }
3129
3130 rcu_read_unlock();
3131 }
3132
3133 static void sock_def_destruct(struct sock *sk)
3134 {
3135 }
3136
3137 void sk_send_sigurg(struct sock *sk)
3138 {
3139 if (sk->sk_socket && sk->sk_socket->file)
3140 if (send_sigurg(&sk->sk_socket->file->f_owner))
3141 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3142 }
3143 EXPORT_SYMBOL(sk_send_sigurg);
3144
3145 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3146 unsigned long expires)
3147 {
3148 if (!mod_timer(timer, expires))
3149 sock_hold(sk);
3150 }
3151 EXPORT_SYMBOL(sk_reset_timer);
3152
3153 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3154 {
3155 if (del_timer(timer))
3156 __sock_put(sk);
3157 }
3158 EXPORT_SYMBOL(sk_stop_timer);
3159
3160 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3161 {
3162 if (del_timer_sync(timer))
3163 __sock_put(sk);
3164 }
3165 EXPORT_SYMBOL(sk_stop_timer_sync);
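/* Note: these helpers keep the socket reference count balanced with the
 * timer's pending state: sk_reset_timer() takes a reference only when it
 * arms a timer that was not already pending, and sk_stop_timer{,_sync}()
 * drops one only when a pending timer was actually deleted.
 */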
3166
3167 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3168 {
3169 sk_init_common(sk);
3170 sk->sk_send_head = NULL;
3171
3172 timer_setup(&sk->sk_timer, NULL, 0);
3173
3174 sk->sk_allocation = GFP_KERNEL;
3175 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3176 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3177 sk->sk_state = TCP_CLOSE;
3178 sk_set_socket(sk, sock);
3179
3180 sock_set_flag(sk, SOCK_ZAPPED);
3181
3182 if (sock) {
3183 sk->sk_type = sock->type;
3184 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3185 sock->sk = sk;
3186 } else {
3187 RCU_INIT_POINTER(sk->sk_wq, NULL);
3188 }
3189 sk->sk_uid = uid;
3190
3191 rwlock_init(&sk->sk_callback_lock);
3192 if (sk->sk_kern_sock)
3193 lockdep_set_class_and_name(
3194 &sk->sk_callback_lock,
3195 af_kern_callback_keys + sk->sk_family,
3196 af_family_kern_clock_key_strings[sk->sk_family]);
3197 else
3198 lockdep_set_class_and_name(
3199 &sk->sk_callback_lock,
3200 af_callback_keys + sk->sk_family,
3201 af_family_clock_key_strings[sk->sk_family]);
3202
3203 sk->sk_state_change = sock_def_wakeup;
3204 sk->sk_data_ready = sock_def_readable;
3205 sk->sk_write_space = sock_def_write_space;
3206 sk->sk_error_report = sock_def_error_report;
3207 sk->sk_destruct = sock_def_destruct;
3208
3209 sk->sk_frag.page = NULL;
3210 sk->sk_frag.offset = 0;
3211 sk->sk_peek_off = -1;
3212
3213 sk->sk_peer_pid = NULL;
3214 sk->sk_peer_cred = NULL;
3215 spin_lock_init(&sk->sk_peer_lock);
3216
3217 sk->sk_write_pending = 0;
3218 sk->sk_rcvlowat = 1;
3219 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3220 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3221
3222 sk->sk_stamp = SK_DEFAULT_STAMP;
3223 #if BITS_PER_LONG==32
3224 seqlock_init(&sk->sk_stamp_seq);
3225 #endif
3226 atomic_set(&sk->sk_zckey, 0);
3227
3228 #ifdef CONFIG_NET_RX_BUSY_POLL
3229 sk->sk_napi_id = 0;
3230 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3231 #endif
3232
3233 sk->sk_max_pacing_rate = ~0UL;
3234 sk->sk_pacing_rate = ~0UL;
3235 WRITE_ONCE(sk->sk_pacing_shift, 10);
3236 sk->sk_incoming_cpu = -1;
3237
3238 sk_rx_queue_clear(sk);
3239 /*
3240 * Before updating sk_refcnt, we must commit prior changes to memory
3241 * (Documentation/RCU/rculist_nulls.rst for details)
3242 */
3243 smp_wmb();
3244 refcount_set(&sk->sk_refcnt, 1);
3245 atomic_set(&sk->sk_drops, 0);
3246 }
3247 EXPORT_SYMBOL(sock_init_data_uid);
3248
3249 void sock_init_data(struct socket *sock, struct sock *sk)
3250 {
3251 kuid_t uid = sock ?
3252 SOCK_INODE(sock)->i_uid :
3253 make_kuid(sock_net(sk)->user_ns, 0);
3254
3255 sock_init_data_uid(sock, sk, uid);
3256 }
3257 EXPORT_SYMBOL(sock_init_data);
3258
3259 void lock_sock_nested(struct sock *sk, int subclass)
3260 {
3261 /* The sk_lock has mutex_lock() semantics here. */
3262 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3263
3264 might_sleep();
3265 spin_lock_bh(&sk->sk_lock.slock);
3266 if (sk->sk_lock.owned)
3267 __lock_sock(sk);
3268 sk->sk_lock.owned = 1;
3269 spin_unlock_bh(&sk->sk_lock.slock);
3270 }
3271 EXPORT_SYMBOL(lock_sock_nested);
3272
3273 void release_sock(struct sock *sk)
3274 {
3275 spin_lock_bh(&sk->sk_lock.slock);
3276 if (sk->sk_backlog.tail)
3277 __release_sock(sk);
3278
3279 /* Warning : release_cb() might need to release sk ownership,
3280 * ie call sock_release_ownership(sk) before us.
3281 */
3282 if (sk->sk_prot->release_cb)
3283 sk->sk_prot->release_cb(sk);
3284
3285 sock_release_ownership(sk);
3286 if (waitqueue_active(&sk->sk_lock.wq))
3287 wake_up(&sk->sk_lock.wq);
3288 spin_unlock_bh(&sk->sk_lock.slock);
3289 }
3290 EXPORT_SYMBOL(release_sock);
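/* Illustrative usage (not part of this file): process-context code uses the
 * lock_sock()/release_sock() pair (lock_sock() is a thin wrapper around
 * lock_sock_nested()); while owned, softirq input lands on the backlog and
 * is replayed by __release_sock() above:
 *
 *	lock_sock(sk);
 *	// ... modify socket state ...
 *	release_sock(sk);
 */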
3291
3292 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3293 {
3294 might_sleep();
3295 spin_lock_bh(&sk->sk_lock.slock);
3296
3297 if (!sk->sk_lock.owned) {
3298 /*
3299 * Fast path return with bottom halves disabled and
3300 * sock::sk_lock.slock held.
3301 *
3302 * The 'mutex' is not contended and holding
3303 * sock::sk_lock.slock prevents all other lockers to
3304 * proceed so the corresponding unlock_sock_fast() can
3305 * avoid the slow path of release_sock() completely and
3306 * just release slock.
3307 *
3308 * From a semantical POV this is equivalent to 'acquiring'
3309 * the 'mutex', hence the corresponding lockdep
3310 * mutex_release() has to happen in the fast path of
3311 * unlock_sock_fast().
3312 */
3313 return false;
3314 }
3315
3316 __lock_sock(sk);
3317 sk->sk_lock.owned = 1;
3318 __acquire(&sk->sk_lock.slock);
3319 spin_unlock_bh(&sk->sk_lock.slock);
3320 return true;
3321 }
3322 EXPORT_SYMBOL(__lock_sock_fast);
3323
3324 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3325 bool timeval, bool time32)
3326 {
3327 struct sock *sk = sock->sk;
3328 struct timespec64 ts;
3329
3330 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3331 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3332 if (ts.tv_sec == -1)
3333 return -ENOENT;
3334 if (ts.tv_sec == 0) {
3335 ktime_t kt = ktime_get_real();
3336 sock_write_timestamp(sk, kt);
3337 ts = ktime_to_timespec64(kt);
3338 }
3339
3340 if (timeval)
3341 ts.tv_nsec /= 1000;
3342
3343 #ifdef CONFIG_COMPAT_32BIT_TIME
3344 if (time32)
3345 return put_old_timespec32(&ts, userstamp);
3346 #endif
3347 #ifdef CONFIG_SPARC64
3348 /* beware of padding in sparc64 timeval */
3349 if (timeval && !in_compat_syscall()) {
3350 struct __kernel_old_timeval __user tv = {
3351 .tv_sec = ts.tv_sec,
3352 .tv_usec = ts.tv_nsec,
3353 };
3354 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3355 return -EFAULT;
3356 return 0;
3357 }
3358 #endif
3359 return put_timespec64(&ts, userstamp);
3360 }
3361 EXPORT_SYMBOL(sock_gettstamp);
3362
3363 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3364 {
3365 if (!sock_flag(sk, flag)) {
3366 unsigned long previous_flags = sk->sk_flags;
3367
3368 sock_set_flag(sk, flag);
3369 /*
3370 * we just set one of the two flags which require net
3371 * time stamping, but time stamping might have been on
3372 * already because of the other one
3373 */
3374 if (sock_needs_netstamp(sk) &&
3375 !(previous_flags & SK_FLAGS_TIMESTAMP))
3376 net_enable_timestamp();
3377 }
3378 }
3379
3380 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3381 int level, int type)
3382 {
3383 struct sock_exterr_skb *serr;
3384 struct sk_buff *skb;
3385 int copied, err;
3386
3387 err = -EAGAIN;
3388 skb = sock_dequeue_err_skb(sk);
3389 if (skb == NULL)
3390 goto out;
3391
3392 copied = skb->len;
3393 if (copied > len) {
3394 msg->msg_flags |= MSG_TRUNC;
3395 copied = len;
3396 }
3397 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3398 if (err)
3399 goto out_free_skb;
3400
3401 sock_recv_timestamp(msg, sk, skb);
3402
3403 serr = SKB_EXT_ERR(skb);
3404 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3405
3406 msg->msg_flags |= MSG_ERRQUEUE;
3407 err = copied;
3408
3409 out_free_skb:
3410 kfree_skb(skb);
3411 out:
3412 return err;
3413 }
3414 EXPORT_SYMBOL(sock_recv_errqueue);
3415
3416 /*
3417 * Get a socket option on a socket.
3418 *
3419 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3420 * asynchronous errors should be reported by getsockopt. We assume
3421 * this means if you specify SO_ERROR (otherwise what's the point of it).
3422 */
3423 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3424 char __user *optval, int __user *optlen)
3425 {
3426 struct sock *sk = sock->sk;
3427
3428 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3429 }
3430 EXPORT_SYMBOL(sock_common_getsockopt);
3431
3432 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3433 int flags)
3434 {
3435 struct sock *sk = sock->sk;
3436 int addr_len = 0;
3437 int err;
3438
3439 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3440 flags & ~MSG_DONTWAIT, &addr_len);
3441 if (err >= 0)
3442 msg->msg_namelen = addr_len;
3443 return err;
3444 }
3445 EXPORT_SYMBOL(sock_common_recvmsg);
3446
3447 /*
3448 * Set socket options on an inet socket.
3449 */
3450 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3451 sockptr_t optval, unsigned int optlen)
3452 {
3453 struct sock *sk = sock->sk;
3454
3455 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3456 }
3457 EXPORT_SYMBOL(sock_common_setsockopt);
3458
3459 void sk_common_release(struct sock *sk)
3460 {
3461 if (sk->sk_prot->destroy)
3462 sk->sk_prot->destroy(sk);
3463
3464 /*
3465 * Observation: when sk_common_release() is called, processes no longer
3466 * have access to the socket, but the network stack still does.
3467 * Step one, detach it from networking:
3468 *
3469 * A. Remove from hash tables.
3470 */
3471
3472 sk->sk_prot->unhash(sk);
3473
3474 /*
3475 * At this point the socket cannot receive new packets, but it is possible
3476 * that some packets are still in flight because another CPU ran the receiver
3477 * and did its hash table lookup before we unhashed the socket. They will
3478 * reach the receive queue and be purged by the socket destructor.
3479 *
3480 * Also, we still have packets pending on the receive queue and probably
3481 * our own packets waiting in device queues. sock_destroy will drain the
3482 * receive queue, but transmitted packets will delay socket destruction
3483 * until the last reference is released.
3484 */
3485
3486 sock_orphan(sk);
3487
3488 xfrm_sk_free_policy(sk);
3489
3490 sk_refcnt_debug_release(sk);
3491
3492 sock_put(sk);
3493 }
3494 EXPORT_SYMBOL(sk_common_release);
3495
3496 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3497 {
3498 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3499
3500 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3501 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3502 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3503 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3504 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3505 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3506 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3507 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3508 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3509 }
3510
3511 #ifdef CONFIG_PROC_FS
3512 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3513 struct prot_inuse {
3514 int val[PROTO_INUSE_NR];
3515 };
3516
3517 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3518
3519 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3520 {
3521 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3522 }
3523 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3524
3525 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3526 {
3527 int cpu, idx = prot->inuse_idx;
3528 int res = 0;
3529
3530 for_each_possible_cpu(cpu)
3531 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3532
3533 return res >= 0 ? res : 0;
3534 }
3535 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3536
3537 static void sock_inuse_add(struct net *net, int val)
3538 {
3539 this_cpu_add(*net->core.sock_inuse, val);
3540 }
3541
3542 int sock_inuse_get(struct net *net)
3543 {
3544 int cpu, res = 0;
3545
3546 for_each_possible_cpu(cpu)
3547 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3548
3549 return res;
3550 }
3551
3552 EXPORT_SYMBOL_GPL(sock_inuse_get);
3553
3554 static int __net_init sock_inuse_init_net(struct net *net)
3555 {
3556 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3557 if (net->core.prot_inuse == NULL)
3558 return -ENOMEM;
3559
3560 net->core.sock_inuse = alloc_percpu(int);
3561 if (net->core.sock_inuse == NULL)
3562 goto out;
3563
3564 return 0;
3565
3566 out:
3567 free_percpu(net->core.prot_inuse);
3568 return -ENOMEM;
3569 }
3570
3571 static void __net_exit sock_inuse_exit_net(struct net *net)
3572 {
3573 free_percpu(net->core.prot_inuse);
3574 free_percpu(net->core.sock_inuse);
3575 }
3576
3577 static struct pernet_operations net_inuse_ops = {
3578 .init = sock_inuse_init_net,
3579 .exit = sock_inuse_exit_net,
3580 };
3581
3582 static __init int net_inuse_init(void)
3583 {
3584 if (register_pernet_subsys(&net_inuse_ops))
3585 panic("Cannot initialize net inuse counters");
3586
3587 return 0;
3588 }
3589
3590 core_initcall(net_inuse_init);
3591
3592 static int assign_proto_idx(struct proto *prot)
3593 {
3594 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3595
3596 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3597 pr_err("PROTO_INUSE_NR exhausted\n");
3598 return -ENOSPC;
3599 }
3600
3601 set_bit(prot->inuse_idx, proto_inuse_idx);
3602 return 0;
3603 }
3604
3605 static void release_proto_idx(struct proto *prot)
3606 {
3607 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3608 clear_bit(prot->inuse_idx, proto_inuse_idx);
3609 }
3610 #else
3611 static inline int assign_proto_idx(struct proto *prot)
3612 {
3613 return 0;
3614 }
3615
3616 static inline void release_proto_idx(struct proto *prot)
3617 {
3618 }
3619
3620 static void sock_inuse_add(struct net *net, int val)
3621 {
3622 }
3623 #endif
3624
3625 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3626 {
3627 if (!twsk_prot)
3628 return;
3629 kfree(twsk_prot->twsk_slab_name);
3630 twsk_prot->twsk_slab_name = NULL;
3631 kmem_cache_destroy(twsk_prot->twsk_slab);
3632 twsk_prot->twsk_slab = NULL;
3633 }
3634
3635 static int tw_prot_init(const struct proto *prot)
3636 {
3637 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3638
3639 if (!twsk_prot)
3640 return 0;
3641
3642 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3643 prot->name);
3644 if (!twsk_prot->twsk_slab_name)
3645 return -ENOMEM;
3646
3647 twsk_prot->twsk_slab =
3648 kmem_cache_create(twsk_prot->twsk_slab_name,
3649 twsk_prot->twsk_obj_size, 0,
3650 SLAB_ACCOUNT | prot->slab_flags,
3651 NULL);
3652 if (!twsk_prot->twsk_slab) {
3653 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3654 prot->name);
3655 return -ENOMEM;
3656 }
3657
3658 return 0;
3659 }
3660
3661 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3662 {
3663 if (!rsk_prot)
3664 return;
3665 kfree(rsk_prot->slab_name);
3666 rsk_prot->slab_name = NULL;
3667 kmem_cache_destroy(rsk_prot->slab);
3668 rsk_prot->slab = NULL;
3669 }
3670
3671 static int req_prot_init(const struct proto *prot)
3672 {
3673 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3674
3675 if (!rsk_prot)
3676 return 0;
3677
3678 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3679 prot->name);
3680 if (!rsk_prot->slab_name)
3681 return -ENOMEM;
3682
3683 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3684 rsk_prot->obj_size, 0,
3685 SLAB_ACCOUNT | prot->slab_flags,
3686 NULL);
3687
3688 if (!rsk_prot->slab) {
3689 pr_crit("%s: Can't create request sock SLAB cache!\n",
3690 prot->name);
3691 return -ENOMEM;
3692 }
3693 return 0;
3694 }
3695
3696 int proto_register(struct proto *prot, int alloc_slab)
3697 {
3698 int ret = -ENOBUFS;
3699
3700 if (alloc_slab) {
3701 prot->slab = kmem_cache_create_usercopy(prot->name,
3702 prot->obj_size, 0,
3703 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3704 prot->slab_flags,
3705 prot->useroffset, prot->usersize,
3706 NULL);
3707
3708 if (prot->slab == NULL) {
3709 pr_crit("%s: Can't create sock SLAB cache!\n",
3710 prot->name);
3711 goto out;
3712 }
3713
3714 if (req_prot_init(prot))
3715 goto out_free_request_sock_slab;
3716
3717 if (tw_prot_init(prot))
3718 goto out_free_timewait_sock_slab;
3719 }
3720
3721 mutex_lock(&proto_list_mutex);
3722 ret = assign_proto_idx(prot);
3723 if (ret) {
3724 mutex_unlock(&proto_list_mutex);
3725 goto out_free_timewait_sock_slab;
3726 }
3727 list_add(&prot->node, &proto_list);
3728 mutex_unlock(&proto_list_mutex);
3729 return ret;
3730
3731 out_free_timewait_sock_slab:
3732 if (alloc_slab)
3733 tw_prot_cleanup(prot->twsk_prot);
3734 out_free_request_sock_slab:
3735 if (alloc_slab) {
3736 req_prot_cleanup(prot->rsk_prot);
3737
3738 kmem_cache_destroy(prot->slab);
3739 prot->slab = NULL;
3740 }
3741 out:
3742 return ret;
3743 }
3744 EXPORT_SYMBOL(proto_register);
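/* Illustrative usage (not part of this file): a protocol module registers
 * its struct proto once at init time and unregisters it on exit, e.g.:
 *
 *	static struct proto foo_proto = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	err = proto_register(&foo_proto, 1);	// 1 => create a slab cache
 *	...
 *	proto_unregister(&foo_proto);
 *
 * "foo_proto" and "struct foo_sock" are placeholders.
 */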
3745
3746 void proto_unregister(struct proto *prot)
3747 {
3748 mutex_lock(&proto_list_mutex);
3749 release_proto_idx(prot);
3750 list_del(&prot->node);
3751 mutex_unlock(&proto_list_mutex);
3752
3753 kmem_cache_destroy(prot->slab);
3754 prot->slab = NULL;
3755
3756 req_prot_cleanup(prot->rsk_prot);
3757 tw_prot_cleanup(prot->twsk_prot);
3758 }
3759 EXPORT_SYMBOL(proto_unregister);
3760
3761 int sock_load_diag_module(int family, int protocol)
3762 {
3763 if (!protocol) {
3764 if (!sock_is_registered(family))
3765 return -ENOENT;
3766
3767 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3768 NETLINK_SOCK_DIAG, family);
3769 }
3770
3771 #ifdef CONFIG_INET
3772 if (family == AF_INET &&
3773 protocol != IPPROTO_RAW &&
3774 protocol < MAX_INET_PROTOS &&
3775 !rcu_access_pointer(inet_protos[protocol]))
3776 return -ENOENT;
3777 #endif
3778
3779 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3780 NETLINK_SOCK_DIAG, family, protocol);
3781 }
3782 EXPORT_SYMBOL(sock_load_diag_module);
3783
3784 #ifdef CONFIG_PROC_FS
3785 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3786 __acquires(proto_list_mutex)
3787 {
3788 mutex_lock(&proto_list_mutex);
3789 return seq_list_start_head(&proto_list, *pos);
3790 }
3791
3792 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3793 {
3794 return seq_list_next(v, &proto_list, pos);
3795 }
3796
3797 static void proto_seq_stop(struct seq_file *seq, void *v)
3798 __releases(proto_list_mutex)
3799 {
3800 mutex_unlock(&proto_list_mutex);
3801 }
3802
3803 static char proto_method_implemented(const void *method)
3804 {
3805 return method == NULL ? 'n' : 'y';
3806 }
3807 static long sock_prot_memory_allocated(struct proto *proto)
3808 {
3809 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3810 }
3811
3812 static const char *sock_prot_memory_pressure(struct proto *proto)
3813 {
3814 return proto->memory_pressure != NULL ?
3815 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3816 }
3817
3818 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3819 {
3820
3821 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3822 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3823 proto->name,
3824 proto->obj_size,
3825 sock_prot_inuse_get(seq_file_net(seq), proto),
3826 sock_prot_memory_allocated(proto),
3827 sock_prot_memory_pressure(proto),
3828 proto->max_header,
3829 proto->slab == NULL ? "no" : "yes",
3830 module_name(proto->owner),
3831 proto_method_implemented(proto->close),
3832 proto_method_implemented(proto->connect),
3833 proto_method_implemented(proto->disconnect),
3834 proto_method_implemented(proto->accept),
3835 proto_method_implemented(proto->ioctl),
3836 proto_method_implemented(proto->init),
3837 proto_method_implemented(proto->destroy),
3838 proto_method_implemented(proto->shutdown),
3839 proto_method_implemented(proto->setsockopt),
3840 proto_method_implemented(proto->getsockopt),
3841 proto_method_implemented(proto->sendmsg),
3842 proto_method_implemented(proto->recvmsg),
3843 proto_method_implemented(proto->sendpage),
3844 proto_method_implemented(proto->bind),
3845 proto_method_implemented(proto->backlog_rcv),
3846 proto_method_implemented(proto->hash),
3847 proto_method_implemented(proto->unhash),
3848 proto_method_implemented(proto->get_port),
3849 proto_method_implemented(proto->enter_memory_pressure));
3850 }
3851
3852 static int proto_seq_show(struct seq_file *seq, void *v)
3853 {
3854 if (v == &proto_list)
3855 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3856 "protocol",
3857 "size",
3858 "sockets",
3859 "memory",
3860 "press",
3861 "maxhdr",
3862 "slab",
3863 "module",
3864 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3865 else
3866 proto_seq_printf(seq, list_entry(v, struct proto, node));
3867 return 0;
3868 }
3869
3870 static const struct seq_operations proto_seq_ops = {
3871 .start = proto_seq_start,
3872 .next = proto_seq_next,
3873 .stop = proto_seq_stop,
3874 .show = proto_seq_show,
3875 };
3876
3877 static __net_init int proto_init_net(struct net *net)
3878 {
3879 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3880 sizeof(struct seq_net_private)))
3881 return -ENOMEM;
3882
3883 return 0;
3884 }
3885
3886 static __net_exit void proto_exit_net(struct net *net)
3887 {
3888 remove_proc_entry("protocols", net->proc_net);
3889 }
3890
3891
3892 static __net_initdata struct pernet_operations proto_net_ops = {
3893 .init = proto_init_net,
3894 .exit = proto_exit_net,
3895 };
3896
3897 static int __init proto_init(void)
3898 {
3899 return register_pernet_subsys(&proto_net_ops);
3900 }
3901
3902 subsys_initcall(proto_init);
3903
3904 #endif /* PROC_FS */
3905
3906 #ifdef CONFIG_NET_RX_BUSY_POLL
3907 bool sk_busy_loop_end(void *p, unsigned long start_time)
3908 {
3909 struct sock *sk = p;
3910
3911 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3912 sk_busy_loop_timeout(sk, start_time);
3913 }
3914 EXPORT_SYMBOL(sk_busy_loop_end);
3915 #endif /* CONFIG_NET_RX_BUSY_POLL */
3916
3917 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3918 {
3919 if (!sk->sk_prot->bind_add)
3920 return -EOPNOTSUPP;
3921 return sk->sk_prot->bind_add(sk, addr, addr_len);
3922 }
3923 EXPORT_SYMBOL(sock_bind_add);
3924