1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117
118 #include <linux/uaccess.h>
119
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136
137 #include <trace/events/sock.h>
138 #include <trace/hooks/sched.h>
139 #include <trace/hooks/net.h>
140
141 #include <net/tcp.h>
142 #include <net/busy_poll.h>
143
144 #include <linux/ethtool.h>
145
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148
149 static void sock_inuse_add(struct net *net, int val);
150
151 /**
152 * sk_ns_capable - General socket capability test
153 * @sk: Socket to use a capability on or through
154 * @user_ns: The user namespace of the capability to use
155 * @cap: The capability to use
156 *
157 * Test to see if the opener of the socket had the capability @cap in the
158 * user namespace @user_ns when the socket was created, and that the
159 * current process has it as well.
160 */
161 bool sk_ns_capable(const struct sock *sk,
162 struct user_namespace *user_ns, int cap)
163 {
164 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 ns_capable(user_ns, cap);
166 }
167 EXPORT_SYMBOL(sk_ns_capable);
168
169 /**
170 * sk_capable - Socket global capability test
171 * @sk: Socket to use a capability on or through
172 * @cap: The global capability to use
173 *
174 * Test to see if the opener of the socket had the capability @cap in all
175 * user namespaces when the socket was created, and that the current
176 * process has it as well.
177 */
178 bool sk_capable(const struct sock *sk, int cap)
179 {
180 return sk_ns_capable(sk, &init_user_ns, cap);
181 }
182 EXPORT_SYMBOL(sk_capable);
183
184 /**
185 * sk_net_capable - Network namespace socket capability test
186 * @sk: Socket to use a capability on or through
187 * @cap: The capability to use
188 *
189 * Test to see if the opener of the socket had the capability @cap over the
190 * network namespace the socket is a member of when the socket was created,
191 * and that the current process has it as well.
192 */
193 bool sk_net_capable(const struct sock *sk, int cap)
194 {
195 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 }
197 EXPORT_SYMBOL(sk_net_capable);
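
/*
 * Usage sketch (illustrative, not an in-tree caller): a protocol's
 * privileged setsockopt branch would typically gate itself like this.
 * The capability chosen here is only an example.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *	(both the socket opener and the current task hold CAP_NET_ADMIN
 *	 in the socket's network namespace, so the change may proceed)
 */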
198
199 /*
200 * Each address family might have different locking rules, so we have
201 * one slock key per address family and separate keys for internal and
202 * userspace sockets.
203 */
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
208
209 /*
210 * Make lock validator output more readable. (we pre-construct these
211 * strings at build time, so that runtime initialization of socket
212 * locks is fast):
213 */
214
215 #define _sock_locks(x) \
216 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
217 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
218 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
219 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
220 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
221 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
222 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
223 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
224 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
225 x "27" , x "28" , x "AF_CAN" , \
226 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
227 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
228 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
229 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
230 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
231 x "AF_MCTP" , \
232 x "AF_MAX"
233
234 static const char *const af_family_key_strings[AF_MAX+1] = {
235 _sock_locks("sk_lock-")
236 };
237 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
238 _sock_locks("slock-")
239 };
240 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
241 _sock_locks("clock-")
242 };
243
244 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
245 _sock_locks("k-sk_lock-")
246 };
247 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
248 _sock_locks("k-slock-")
249 };
250 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
251 _sock_locks("k-clock-")
252 };
253 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
254 _sock_locks("rlock-")
255 };
256 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
257 _sock_locks("wlock-")
258 };
259 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
260 _sock_locks("elock-")
261 };
262
263 /*
264 * sk_callback_lock and sk queues locking rules are per-address-family,
265 * so split the lock classes by using a per-AF key:
266 */
267 static struct lock_class_key af_callback_keys[AF_MAX];
268 static struct lock_class_key af_rlock_keys[AF_MAX];
269 static struct lock_class_key af_wlock_keys[AF_MAX];
270 static struct lock_class_key af_elock_keys[AF_MAX];
271 static struct lock_class_key af_kern_callback_keys[AF_MAX];
272
273 /* Run time adjustable parameters. */
274 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
275 EXPORT_SYMBOL(sysctl_wmem_max);
276 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
277 EXPORT_SYMBOL(sysctl_rmem_max);
278 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
279 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
280
281 /* Maximal space eaten by iovec or ancillary data plus some space */
282 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
283 EXPORT_SYMBOL(sysctl_optmem_max);
284
285 int sysctl_tstamp_allow_data __read_mostly = 1;
286
287 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
288 EXPORT_SYMBOL_GPL(memalloc_socks_key);
289
290 /**
291 * sk_set_memalloc - sets %SOCK_MEMALLOC
292 * @sk: socket to set it on
293 *
294 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
295 * It's the responsibility of the admin to adjust min_free_kbytes
296 * to meet the requirements
297 */
298 void sk_set_memalloc(struct sock *sk)
299 {
300 sock_set_flag(sk, SOCK_MEMALLOC);
301 sk->sk_allocation |= __GFP_MEMALLOC;
302 static_branch_inc(&memalloc_socks_key);
303 }
304 EXPORT_SYMBOL_GPL(sk_set_memalloc);
305
306 void sk_clear_memalloc(struct sock *sk)
307 {
308 sock_reset_flag(sk, SOCK_MEMALLOC);
309 sk->sk_allocation &= ~__GFP_MEMALLOC;
310 static_branch_dec(&memalloc_socks_key);
311
312 /*
313 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
314 * progress of swapping. SOCK_MEMALLOC may be cleared while
315 * it has rmem allocations due to the last swapfile being deactivated
316 * but there is a risk that the socket is unusable due to exceeding
317 * the rmem limits. Reclaim the reserves and obey rmem limits again.
318 */
319 sk_mem_reclaim(sk);
320 }
321 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
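
/*
 * Usage sketch (illustrative): a swap-over-network style transport can
 * mark its kernel socket so traffic needed for swap progress may dip
 * into the PFMEMALLOC reserves, and clears the flag again on teardown.
 * "xprt" is a hypothetical transport object, not a structure defined here.
 *
 *	sk_set_memalloc(xprt->sock->sk);
 *	...
 *	sk_clear_memalloc(xprt->sock->sk);
 */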
322
323 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
324 {
325 int ret;
326 unsigned int noreclaim_flag;
327
328 /* these should have been dropped before queueing */
329 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
330
331 noreclaim_flag = memalloc_noreclaim_save();
332 ret = sk->sk_backlog_rcv(sk, skb);
333 memalloc_noreclaim_restore(noreclaim_flag);
334
335 return ret;
336 }
337 EXPORT_SYMBOL(__sk_backlog_rcv);
338
339 void sk_error_report(struct sock *sk)
340 {
341 sk->sk_error_report(sk);
342
343 switch (sk->sk_family) {
344 case AF_INET:
345 fallthrough;
346 case AF_INET6:
347 trace_inet_sk_error_report(sk);
348 break;
349 default:
350 break;
351 }
352 }
353 EXPORT_SYMBOL(sk_error_report);
354
355 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
356 {
357 struct __kernel_sock_timeval tv;
358
359 if (timeo == MAX_SCHEDULE_TIMEOUT) {
360 tv.tv_sec = 0;
361 tv.tv_usec = 0;
362 } else {
363 tv.tv_sec = timeo / HZ;
364 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
365 }
366
367 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
368 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
369 *(struct old_timeval32 *)optval = tv32;
370 return sizeof(tv32);
371 }
372
373 if (old_timeval) {
374 struct __kernel_old_timeval old_tv;
375 old_tv.tv_sec = tv.tv_sec;
376 old_tv.tv_usec = tv.tv_usec;
377 *(struct __kernel_old_timeval *)optval = old_tv;
378 return sizeof(old_tv);
379 }
380
381 *(struct __kernel_sock_timeval *)optval = tv;
382 return sizeof(tv);
383 }
384
385 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
386 bool old_timeval)
387 {
388 struct __kernel_sock_timeval tv;
389
390 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
391 struct old_timeval32 tv32;
392
393 if (optlen < sizeof(tv32))
394 return -EINVAL;
395
396 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
397 return -EFAULT;
398 tv.tv_sec = tv32.tv_sec;
399 tv.tv_usec = tv32.tv_usec;
400 } else if (old_timeval) {
401 struct __kernel_old_timeval old_tv;
402
403 if (optlen < sizeof(old_tv))
404 return -EINVAL;
405 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
406 return -EFAULT;
407 tv.tv_sec = old_tv.tv_sec;
408 tv.tv_usec = old_tv.tv_usec;
409 } else {
410 if (optlen < sizeof(tv))
411 return -EINVAL;
412 if (copy_from_sockptr(&tv, optval, sizeof(tv)))
413 return -EFAULT;
414 }
415 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
416 return -EDOM;
417
418 if (tv.tv_sec < 0) {
419 static int warned __read_mostly;
420
421 *timeo_p = 0;
422 if (warned < 10 && net_ratelimit()) {
423 warned++;
424 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
425 __func__, current->comm, task_pid_nr(current));
426 }
427 return 0;
428 }
429 *timeo_p = MAX_SCHEDULE_TIMEOUT;
430 if (tv.tv_sec == 0 && tv.tv_usec == 0)
431 return 0;
432 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
433 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
434 return 0;
435 }
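
/*
 * Worked example for the conversion above (values are illustrative):
 * with HZ=100, a userspace timeout of { .tv_sec = 2, .tv_usec = 500000 }
 * becomes 2 * 100 + DIV_ROUND_UP(500000, 10000) = 250 jiffies.  From
 * userspace the value arrives via
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */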
436
437 static bool sock_needs_netstamp(const struct sock *sk)
438 {
439 switch (sk->sk_family) {
440 case AF_UNSPEC:
441 case AF_UNIX:
442 return false;
443 default:
444 return true;
445 }
446 }
447
448 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
449 {
450 if (sk->sk_flags & flags) {
451 sk->sk_flags &= ~flags;
452 if (sock_needs_netstamp(sk) &&
453 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
454 net_disable_timestamp();
455 }
456 }
457
458
459 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
460 {
461 unsigned long flags;
462 struct sk_buff_head *list = &sk->sk_receive_queue;
463
464 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
465 atomic_inc(&sk->sk_drops);
466 trace_sock_rcvqueue_full(sk, skb);
467 return -ENOMEM;
468 }
469
470 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
471 atomic_inc(&sk->sk_drops);
472 return -ENOBUFS;
473 }
474
475 skb->dev = NULL;
476 skb_set_owner_r(skb, sk);
477
478 /* We escape from the RCU-protected region, so make sure we don't leak
479 * a non-refcounted dst.
480 */
481 skb_dst_force(skb);
482
483 spin_lock_irqsave(&list->lock, flags);
484 sock_skb_set_dropcount(sk, skb);
485 __skb_queue_tail(list, skb);
486 spin_unlock_irqrestore(&list->lock, flags);
487
488 if (!sock_flag(sk, SOCK_DEAD))
489 sk->sk_data_ready(sk);
490 return 0;
491 }
492 EXPORT_SYMBOL(__sock_queue_rcv_skb);
493
494 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
495 {
496 int err;
497
498 err = sk_filter(sk, skb);
499 if (err)
500 return err;
501
502 return __sock_queue_rcv_skb(sk, skb);
503 }
504 EXPORT_SYMBOL(sock_queue_rcv_skb);
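
/*
 * Usage sketch (illustrative): a datagram protocol's receive handler
 * hands the skb to the owning socket and drops it when queueing fails.
 * The surrounding socket lookup is assumed to have happened already.
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 */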
505
506 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
507 const int nested, unsigned int trim_cap, bool refcounted)
508 {
509 int rc = NET_RX_SUCCESS;
510
511 if (sk_filter_trim_cap(sk, skb, trim_cap))
512 goto discard_and_relse;
513
514 skb->dev = NULL;
515
516 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
517 atomic_inc(&sk->sk_drops);
518 goto discard_and_relse;
519 }
520 if (nested)
521 bh_lock_sock_nested(sk);
522 else
523 bh_lock_sock(sk);
524 if (!sock_owned_by_user(sk)) {
525 /*
526 * trylock + unlock semantics:
527 */
528 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
529
530 rc = sk_backlog_rcv(sk, skb);
531
532 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
533 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
534 bh_unlock_sock(sk);
535 atomic_inc(&sk->sk_drops);
536 goto discard_and_relse;
537 }
538
539 bh_unlock_sock(sk);
540 out:
541 if (refcounted)
542 sock_put(sk);
543 return rc;
544 discard_and_relse:
545 kfree_skb(skb);
546 goto out;
547 }
548 EXPORT_SYMBOL(__sk_receive_skb);
549
550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
551 u32));
552 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
553 u32));
554 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
555 {
556 struct dst_entry *dst = __sk_dst_get(sk);
557
558 if (dst && dst->obsolete &&
559 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
560 dst, cookie) == NULL) {
561 sk_tx_queue_clear(sk);
562 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
563 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
564 dst_release(dst);
565 return NULL;
566 }
567
568 return dst;
569 }
570 EXPORT_SYMBOL(__sk_dst_check);
571
572 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
573 {
574 struct dst_entry *dst = sk_dst_get(sk);
575
576 if (dst && dst->obsolete &&
577 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
578 dst, cookie) == NULL) {
579 sk_dst_reset(sk);
580 dst_release(dst);
581 return NULL;
582 }
583
584 return dst;
585 }
586 EXPORT_SYMBOL(sk_dst_check);
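
/*
 * Usage sketch (illustrative): an output path revalidates its cached
 * route and falls back to a fresh lookup when the cache went stale;
 * my_route_output() stands in for the protocol's routing call and is
 * not a real function.
 *
 *	dst = sk_dst_check(sk, 0);
 *	if (!dst)
 *		dst = my_route_output(sk);
 */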
587
588 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
589 {
590 int ret = -ENOPROTOOPT;
591 #ifdef CONFIG_NETDEVICES
592 struct net *net = sock_net(sk);
593
594 /* Sorry... */
595 ret = -EPERM;
596 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
597 goto out;
598
599 ret = -EINVAL;
600 if (ifindex < 0)
601 goto out;
602
603 sk->sk_bound_dev_if = ifindex;
604 if (sk->sk_prot->rehash)
605 sk->sk_prot->rehash(sk);
606 sk_dst_reset(sk);
607
608 ret = 0;
609
610 out:
611 #endif
612
613 return ret;
614 }
615
616 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
617 {
618 int ret;
619
620 if (lock_sk)
621 lock_sock(sk);
622 ret = sock_bindtoindex_locked(sk, ifindex);
623 if (lock_sk)
624 release_sock(sk);
625
626 return ret;
627 }
628 EXPORT_SYMBOL(sock_bindtoindex);
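
/*
 * Usage sketch (illustrative): an in-kernel user binding its socket to
 * a specific interface by ifindex; passing lock_sk=true lets the helper
 * take and release the socket lock itself.
 *
 *	err = sock_bindtoindex(sock->sk, ifindex, true);
 *	if (err)
 *		goto out;
 */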
629
630 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
631 {
632 int ret = -ENOPROTOOPT;
633 #ifdef CONFIG_NETDEVICES
634 struct net *net = sock_net(sk);
635 char devname[IFNAMSIZ];
636 int index;
637
638 ret = -EINVAL;
639 if (optlen < 0)
640 goto out;
641
642 /* Bind this socket to a particular device like "eth0",
643 * as specified in the passed interface name. If the
644 * name is "" or the option length is zero the socket
645 * is not bound.
646 */
647 if (optlen > IFNAMSIZ - 1)
648 optlen = IFNAMSIZ - 1;
649 memset(devname, 0, sizeof(devname));
650
651 ret = -EFAULT;
652 if (copy_from_sockptr(devname, optval, optlen))
653 goto out;
654
655 index = 0;
656 if (devname[0] != '\0') {
657 struct net_device *dev;
658
659 rcu_read_lock();
660 dev = dev_get_by_name_rcu(net, devname);
661 if (dev)
662 index = dev->ifindex;
663 rcu_read_unlock();
664 ret = -ENODEV;
665 if (!dev)
666 goto out;
667 }
668
669 return sock_bindtoindex(sk, index, true);
670 out:
671 #endif
672
673 return ret;
674 }
675
676 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
677 int __user *optlen, int len)
678 {
679 int ret = -ENOPROTOOPT;
680 #ifdef CONFIG_NETDEVICES
681 struct net *net = sock_net(sk);
682 char devname[IFNAMSIZ];
683
684 if (sk->sk_bound_dev_if == 0) {
685 len = 0;
686 goto zero;
687 }
688
689 ret = -EINVAL;
690 if (len < IFNAMSIZ)
691 goto out;
692
693 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
694 if (ret)
695 goto out;
696
697 len = strlen(devname) + 1;
698
699 ret = -EFAULT;
700 if (copy_to_user(optval, devname, len))
701 goto out;
702
703 zero:
704 ret = -EFAULT;
705 if (put_user(len, optlen))
706 goto out;
707
708 ret = 0;
709
710 out:
711 #endif
712
713 return ret;
714 }
715
716 bool sk_mc_loop(struct sock *sk)
717 {
718 if (dev_recursion_level())
719 return false;
720 if (!sk)
721 return true;
722 /* IPV6_ADDRFORM can change sk->sk_family under us. */
723 switch (READ_ONCE(sk->sk_family)) {
724 case AF_INET:
725 return inet_sk(sk)->mc_loop;
726 #if IS_ENABLED(CONFIG_IPV6)
727 case AF_INET6:
728 return inet6_sk(sk)->mc_loop;
729 #endif
730 }
731 WARN_ON_ONCE(1);
732 return true;
733 }
734 EXPORT_SYMBOL(sk_mc_loop);
735
736 void sock_set_reuseaddr(struct sock *sk)
737 {
738 lock_sock(sk);
739 sk->sk_reuse = SK_CAN_REUSE;
740 release_sock(sk);
741 }
742 EXPORT_SYMBOL(sock_set_reuseaddr);
743
744 void sock_set_reuseport(struct sock *sk)
745 {
746 lock_sock(sk);
747 sk->sk_reuseport = true;
748 release_sock(sk);
749 }
750 EXPORT_SYMBOL(sock_set_reuseport);
751
752 void sock_no_linger(struct sock *sk)
753 {
754 lock_sock(sk);
755 sk->sk_lingertime = 0;
756 sock_set_flag(sk, SOCK_LINGER);
757 release_sock(sk);
758 }
759 EXPORT_SYMBOL(sock_no_linger);
760
761 void sock_set_priority(struct sock *sk, u32 priority)
762 {
763 lock_sock(sk);
764 sk->sk_priority = priority;
765 release_sock(sk);
766 }
767 EXPORT_SYMBOL(sock_set_priority);
768
769 void sock_set_sndtimeo(struct sock *sk, s64 secs)
770 {
771 lock_sock(sk);
772 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
773 sk->sk_sndtimeo = secs * HZ;
774 else
775 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
776 release_sock(sk);
777 }
778 EXPORT_SYMBOL(sock_set_sndtimeo);
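
/*
 * Usage sketch (illustrative): in-kernel consumers (e.g. RPC or storage
 * transports) typically chain these helpers right after sock_create_kern().
 * The variable names are examples only.
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_priority(sock->sk, prio);
 *	sock_set_sndtimeo(sock->sk, timeout_secs);
 */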
779
780 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
781 {
782 if (val) {
783 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
784 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
785 sock_set_flag(sk, SOCK_RCVTSTAMP);
786 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
787 } else {
788 sock_reset_flag(sk, SOCK_RCVTSTAMP);
789 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
790 }
791 }
792
793 void sock_enable_timestamps(struct sock *sk)
794 {
795 lock_sock(sk);
796 __sock_set_timestamps(sk, true, false, true);
797 release_sock(sk);
798 }
799 EXPORT_SYMBOL(sock_enable_timestamps);
800
801 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
802 {
803 switch (optname) {
804 case SO_TIMESTAMP_OLD:
805 __sock_set_timestamps(sk, valbool, false, false);
806 break;
807 case SO_TIMESTAMP_NEW:
808 __sock_set_timestamps(sk, valbool, true, false);
809 break;
810 case SO_TIMESTAMPNS_OLD:
811 __sock_set_timestamps(sk, valbool, false, true);
812 break;
813 case SO_TIMESTAMPNS_NEW:
814 __sock_set_timestamps(sk, valbool, true, true);
815 break;
816 }
817 }
818
819 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
820 {
821 struct net *net = sock_net(sk);
822 struct net_device *dev = NULL;
823 bool match = false;
824 int *vclock_index;
825 int i, num;
826
827 if (sk->sk_bound_dev_if)
828 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
829
830 if (!dev) {
831 pr_err("%s: socket is not bound to a device\n", __func__);
832 return -EOPNOTSUPP;
833 }
834
835 num = ethtool_get_phc_vclocks(dev, &vclock_index);
836 dev_put(dev);
837
838 for (i = 0; i < num; i++) {
839 if (*(vclock_index + i) == phc_index) {
840 match = true;
841 break;
842 }
843 }
844
845 if (num > 0)
846 kfree(vclock_index);
847
848 if (!match)
849 return -EINVAL;
850
851 sk->sk_bind_phc = phc_index;
852
853 return 0;
854 }
855
856 int sock_set_timestamping(struct sock *sk, int optname,
857 struct so_timestamping timestamping)
858 {
859 int val = timestamping.flags;
860 int ret;
861
862 if (val & ~SOF_TIMESTAMPING_MASK)
863 return -EINVAL;
864
865 if (val & SOF_TIMESTAMPING_OPT_ID &&
866 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
867 if (sk->sk_protocol == IPPROTO_TCP &&
868 sk->sk_type == SOCK_STREAM) {
869 if ((1 << sk->sk_state) &
870 (TCPF_CLOSE | TCPF_LISTEN))
871 return -EINVAL;
872 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
873 } else {
874 atomic_set(&sk->sk_tskey, 0);
875 }
876 }
877
878 if (val & SOF_TIMESTAMPING_OPT_STATS &&
879 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
880 return -EINVAL;
881
882 if (val & SOF_TIMESTAMPING_BIND_PHC) {
883 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
884 if (ret)
885 return ret;
886 }
887
888 sk->sk_tsflags = val;
889 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
890
891 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
892 sock_enable_timestamp(sk,
893 SOCK_TIMESTAMPING_RX_SOFTWARE);
894 else
895 sock_disable_timestamp(sk,
896 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
897 return 0;
898 }
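
/*
 * Userspace sketch (illustrative): requesting hardware timestamps bound
 * to PTP virtual clock 1 through the extended structure handled above.
 * The socket must already be bound to a device for the
 * SOF_TIMESTAMPING_BIND_PHC branch to succeed; the values are examples.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_HARDWARE |
 *			 SOF_TIMESTAMPING_RX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 1,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 */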
899
900 void sock_set_keepalive(struct sock *sk)
901 {
902 lock_sock(sk);
903 if (sk->sk_prot->keepalive)
904 sk->sk_prot->keepalive(sk, true);
905 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
906 release_sock(sk);
907 }
908 EXPORT_SYMBOL(sock_set_keepalive);
909
910 static void __sock_set_rcvbuf(struct sock *sk, int val)
911 {
912 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
913 * as a negative value.
914 */
915 val = min_t(int, val, INT_MAX / 2);
916 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
917
918 /* We double it on the way in to account for "struct sk_buff" etc.
919 * overhead. Applications assume that the SO_RCVBUF setting they make
920 * will allow that much actual data to be received on that socket.
921 *
922 * Applications are unaware that "struct sk_buff" and other overheads
923 * allocate from the receive buffer during socket buffer allocation.
924 *
925 * And after considering the possible alternatives, returning the value
926 * we actually used in getsockopt is the most desirable behavior.
927 */
928 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
929 }
930
931 void sock_set_rcvbuf(struct sock *sk, int val)
932 {
933 lock_sock(sk);
934 __sock_set_rcvbuf(sk, val);
935 release_sock(sk);
936 }
937 EXPORT_SYMBOL(sock_set_rcvbuf);
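
/*
 * Worked example (illustrative): because of the doubling described in
 * __sock_set_rcvbuf(), a request for 64 KiB stores 128 KiB, and that is
 * what getsockopt() reports back (assuming the request is within
 * sysctl_rmem_max).
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	(val now reads 131072)
 */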
938
939 static void __sock_set_mark(struct sock *sk, u32 val)
940 {
941 if (val != sk->sk_mark) {
942 sk->sk_mark = val;
943 sk_dst_reset(sk);
944 }
945 }
946
947 void sock_set_mark(struct sock *sk, u32 val)
948 {
949 lock_sock(sk);
950 __sock_set_mark(sk, val);
951 release_sock(sk);
952 }
953 EXPORT_SYMBOL(sock_set_mark);
954
955 /*
956 * This is meant for all protocols to use and covers goings on
957 * at the socket level. Everything here is generic.
958 */
959
960 int sock_setsockopt(struct socket *sock, int level, int optname,
961 sockptr_t optval, unsigned int optlen)
962 {
963 struct so_timestamping timestamping;
964 struct sock_txtime sk_txtime;
965 struct sock *sk = sock->sk;
966 int val;
967 int valbool;
968 struct linger ling;
969 int ret = 0;
970
971 /*
972 * Options without arguments
973 */
974
975 if (optname == SO_BINDTODEVICE)
976 return sock_setbindtodevice(sk, optval, optlen);
977
978 if (optlen < sizeof(int))
979 return -EINVAL;
980
981 if (copy_from_sockptr(&val, optval, sizeof(val)))
982 return -EFAULT;
983
984 valbool = val ? 1 : 0;
985
986 lock_sock(sk);
987
988 switch (optname) {
989 case SO_DEBUG:
990 if (val && !capable(CAP_NET_ADMIN))
991 ret = -EACCES;
992 else
993 sock_valbool_flag(sk, SOCK_DBG, valbool);
994 break;
995 case SO_REUSEADDR:
996 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
997 break;
998 case SO_REUSEPORT:
999 sk->sk_reuseport = valbool;
1000 break;
1001 case SO_TYPE:
1002 case SO_PROTOCOL:
1003 case SO_DOMAIN:
1004 case SO_ERROR:
1005 ret = -ENOPROTOOPT;
1006 break;
1007 case SO_DONTROUTE:
1008 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1009 sk_dst_reset(sk);
1010 break;
1011 case SO_BROADCAST:
1012 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1013 break;
1014 case SO_SNDBUF:
1015 /* Don't error on this; BSD doesn't, and if you think
1016 * about it this is right. Otherwise apps have to
1017 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1018 * are treated in BSD as hints.
1019 */
1020 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1021 set_sndbuf:
1022 /* Ensure val * 2 fits into an int, to prevent max_t()
1023 * from treating it as a negative value.
1024 */
1025 val = min_t(int, val, INT_MAX / 2);
1026 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1027 WRITE_ONCE(sk->sk_sndbuf,
1028 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1029 /* Wake up sending tasks if we upped the value. */
1030 sk->sk_write_space(sk);
1031 break;
1032
1033 case SO_SNDBUFFORCE:
1034 if (!capable(CAP_NET_ADMIN)) {
1035 ret = -EPERM;
1036 break;
1037 }
1038
1039 /* No negative values (to prevent underflow, as val will be
1040 * multiplied by 2).
1041 */
1042 if (val < 0)
1043 val = 0;
1044 goto set_sndbuf;
1045
1046 case SO_RCVBUF:
1047 /* Don't error on this; BSD doesn't, and if you think
1048 * about it this is right. Otherwise apps have to
1049 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1050 * are treated in BSD as hints.
1051 */
1052 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1053 break;
1054
1055 case SO_RCVBUFFORCE:
1056 if (!capable(CAP_NET_ADMIN)) {
1057 ret = -EPERM;
1058 break;
1059 }
1060
1061 /* No negative values (to prevent underflow, as val will be
1062 * multiplied by 2).
1063 */
1064 __sock_set_rcvbuf(sk, max(val, 0));
1065 break;
1066
1067 case SO_KEEPALIVE:
1068 if (sk->sk_prot->keepalive)
1069 sk->sk_prot->keepalive(sk, valbool);
1070 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1071 break;
1072
1073 case SO_OOBINLINE:
1074 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1075 break;
1076
1077 case SO_NO_CHECK:
1078 sk->sk_no_check_tx = valbool;
1079 break;
1080
1081 case SO_PRIORITY:
1082 if ((val >= 0 && val <= 6) ||
1083 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1084 sk->sk_priority = val;
1085 else
1086 ret = -EPERM;
1087 break;
1088
1089 case SO_LINGER:
1090 if (optlen < sizeof(ling)) {
1091 ret = -EINVAL; /* 1003.1g */
1092 break;
1093 }
1094 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1095 ret = -EFAULT;
1096 break;
1097 }
1098 if (!ling.l_onoff)
1099 sock_reset_flag(sk, SOCK_LINGER);
1100 else {
1101 #if (BITS_PER_LONG == 32)
1102 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1103 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1104 else
1105 #endif
1106 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1107 sock_set_flag(sk, SOCK_LINGER);
1108 }
1109 break;
1110
1111 case SO_BSDCOMPAT:
1112 break;
1113
1114 case SO_PASSCRED:
1115 if (valbool)
1116 set_bit(SOCK_PASSCRED, &sock->flags);
1117 else
1118 clear_bit(SOCK_PASSCRED, &sock->flags);
1119 break;
1120
1121 case SO_TIMESTAMP_OLD:
1122 case SO_TIMESTAMP_NEW:
1123 case SO_TIMESTAMPNS_OLD:
1124 case SO_TIMESTAMPNS_NEW:
1125 sock_set_timestamp(sk, optname, valbool);
1126 break;
1127
1128 case SO_TIMESTAMPING_NEW:
1129 case SO_TIMESTAMPING_OLD:
1130 if (optlen == sizeof(timestamping)) {
1131 if (copy_from_sockptr(&timestamping, optval,
1132 sizeof(timestamping))) {
1133 ret = -EFAULT;
1134 break;
1135 }
1136 } else {
1137 memset(&timestamping, 0, sizeof(timestamping));
1138 timestamping.flags = val;
1139 }
1140 ret = sock_set_timestamping(sk, optname, timestamping);
1141 break;
1142
1143 case SO_RCVLOWAT:
1144 if (val < 0)
1145 val = INT_MAX;
1146 if (sock->ops->set_rcvlowat)
1147 ret = sock->ops->set_rcvlowat(sk, val);
1148 else
1149 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1150 break;
1151
1152 case SO_RCVTIMEO_OLD:
1153 case SO_RCVTIMEO_NEW:
1154 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1155 optlen, optname == SO_RCVTIMEO_OLD);
1156 break;
1157
1158 case SO_SNDTIMEO_OLD:
1159 case SO_SNDTIMEO_NEW:
1160 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1161 optlen, optname == SO_SNDTIMEO_OLD);
1162 break;
1163
1164 case SO_ATTACH_FILTER: {
1165 struct sock_fprog fprog;
1166
1167 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1168 if (!ret)
1169 ret = sk_attach_filter(&fprog, sk);
1170 break;
1171 }
1172 case SO_ATTACH_BPF:
1173 ret = -EINVAL;
1174 if (optlen == sizeof(u32)) {
1175 u32 ufd;
1176
1177 ret = -EFAULT;
1178 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1179 break;
1180
1181 ret = sk_attach_bpf(ufd, sk);
1182 }
1183 break;
1184
1185 case SO_ATTACH_REUSEPORT_CBPF: {
1186 struct sock_fprog fprog;
1187
1188 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1189 if (!ret)
1190 ret = sk_reuseport_attach_filter(&fprog, sk);
1191 break;
1192 }
1193 case SO_ATTACH_REUSEPORT_EBPF:
1194 ret = -EINVAL;
1195 if (optlen == sizeof(u32)) {
1196 u32 ufd;
1197
1198 ret = -EFAULT;
1199 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1200 break;
1201
1202 ret = sk_reuseport_attach_bpf(ufd, sk);
1203 }
1204 break;
1205
1206 case SO_DETACH_REUSEPORT_BPF:
1207 ret = reuseport_detach_prog(sk);
1208 break;
1209
1210 case SO_DETACH_FILTER:
1211 ret = sk_detach_filter(sk);
1212 break;
1213
1214 case SO_LOCK_FILTER:
1215 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1216 ret = -EPERM;
1217 else
1218 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1219 break;
1220
1221 case SO_PASSSEC:
1222 if (valbool)
1223 set_bit(SOCK_PASSSEC, &sock->flags);
1224 else
1225 clear_bit(SOCK_PASSSEC, &sock->flags);
1226 break;
1227 case SO_MARK:
1228 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1229 ret = -EPERM;
1230 break;
1231 }
1232
1233 __sock_set_mark(sk, val);
1234 break;
1235
1236 case SO_RXQ_OVFL:
1237 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1238 break;
1239
1240 case SO_WIFI_STATUS:
1241 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1242 break;
1243
1244 case SO_PEEK_OFF:
1245 if (sock->ops->set_peek_off)
1246 ret = sock->ops->set_peek_off(sk, val);
1247 else
1248 ret = -EOPNOTSUPP;
1249 break;
1250
1251 case SO_NOFCS:
1252 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1253 break;
1254
1255 case SO_SELECT_ERR_QUEUE:
1256 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1257 break;
1258
1259 #ifdef CONFIG_NET_RX_BUSY_POLL
1260 case SO_BUSY_POLL:
1261 /* allow unprivileged users to decrease the value */
1262 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1263 ret = -EPERM;
1264 else {
1265 if (val < 0)
1266 ret = -EINVAL;
1267 else
1268 WRITE_ONCE(sk->sk_ll_usec, val);
1269 }
1270 break;
1271 case SO_PREFER_BUSY_POLL:
1272 if (valbool && !capable(CAP_NET_ADMIN))
1273 ret = -EPERM;
1274 else
1275 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1276 break;
1277 case SO_BUSY_POLL_BUDGET:
1278 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1279 ret = -EPERM;
1280 } else {
1281 if (val < 0 || val > U16_MAX)
1282 ret = -EINVAL;
1283 else
1284 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1285 }
1286 break;
1287 #endif
1288
1289 case SO_MAX_PACING_RATE:
1290 {
1291 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1292
1293 if (sizeof(ulval) != sizeof(val) &&
1294 optlen >= sizeof(ulval) &&
1295 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1296 ret = -EFAULT;
1297 break;
1298 }
1299 if (ulval != ~0UL)
1300 cmpxchg(&sk->sk_pacing_status,
1301 SK_PACING_NONE,
1302 SK_PACING_NEEDED);
1303 /* Pairs with READ_ONCE() from sk_getsockopt() */
1304 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1305 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1306 break;
1307 }
1308 case SO_INCOMING_CPU:
1309 WRITE_ONCE(sk->sk_incoming_cpu, val);
1310 break;
1311
1312 case SO_CNX_ADVICE:
1313 if (val == 1)
1314 dst_negative_advice(sk);
1315 break;
1316
1317 case SO_ZEROCOPY:
1318 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1319 if (!((sk->sk_type == SOCK_STREAM &&
1320 sk->sk_protocol == IPPROTO_TCP) ||
1321 (sk->sk_type == SOCK_DGRAM &&
1322 sk->sk_protocol == IPPROTO_UDP)))
1323 ret = -ENOTSUPP;
1324 } else if (sk->sk_family != PF_RDS) {
1325 ret = -ENOTSUPP;
1326 }
1327 if (!ret) {
1328 if (val < 0 || val > 1)
1329 ret = -EINVAL;
1330 else
1331 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1332 }
1333 break;
1334
1335 case SO_TXTIME:
1336 if (optlen != sizeof(struct sock_txtime)) {
1337 ret = -EINVAL;
1338 break;
1339 } else if (copy_from_sockptr(&sk_txtime, optval,
1340 sizeof(struct sock_txtime))) {
1341 ret = -EFAULT;
1342 break;
1343 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1344 ret = -EINVAL;
1345 break;
1346 }
1347 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1348 * scheduler has enough safeguards.
1349 */
1350 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1351 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1352 ret = -EPERM;
1353 break;
1354 }
1355 sock_valbool_flag(sk, SOCK_TXTIME, true);
1356 sk->sk_clockid = sk_txtime.clockid;
1357 sk->sk_txtime_deadline_mode =
1358 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1359 sk->sk_txtime_report_errors =
1360 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1361 break;
1362
1363 case SO_BINDTOIFINDEX:
1364 ret = sock_bindtoindex_locked(sk, val);
1365 break;
1366
1367 case SO_BUF_LOCK:
1368 if (val & ~SOCK_BUF_LOCK_MASK) {
1369 ret = -EINVAL;
1370 break;
1371 }
1372 sk->sk_userlocks = val | (sk->sk_userlocks &
1373 ~SOCK_BUF_LOCK_MASK);
1374 break;
1375
1376 default:
1377 ret = -ENOPROTOOPT;
1378 break;
1379 }
1380 release_sock(sk);
1381 return ret;
1382 }
1383 EXPORT_SYMBOL(sock_setsockopt);
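
/*
 * Userspace sketch (illustrative): exercising the struct-valued SO_LINGER
 * branch above, asking close() to linger for up to 5 seconds.
 *
 *	struct linger ling = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 */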
1384
1385 static const struct cred *sk_get_peer_cred(struct sock *sk)
1386 {
1387 const struct cred *cred;
1388
1389 spin_lock(&sk->sk_peer_lock);
1390 cred = get_cred(sk->sk_peer_cred);
1391 spin_unlock(&sk->sk_peer_lock);
1392
1393 return cred;
1394 }
1395
1396 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1397 struct ucred *ucred)
1398 {
1399 ucred->pid = pid_vnr(pid);
1400 ucred->uid = ucred->gid = -1;
1401 if (cred) {
1402 struct user_namespace *current_ns = current_user_ns();
1403
1404 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1405 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1406 }
1407 }
1408
1409 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1410 {
1411 struct user_namespace *user_ns = current_user_ns();
1412 int i;
1413
1414 for (i = 0; i < src->ngroups; i++)
1415 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1416 return -EFAULT;
1417
1418 return 0;
1419 }
1420
1421 int sock_getsockopt(struct socket *sock, int level, int optname,
1422 char __user *optval, int __user *optlen)
1423 {
1424 struct sock *sk = sock->sk;
1425
1426 union {
1427 int val;
1428 u64 val64;
1429 unsigned long ulval;
1430 struct linger ling;
1431 struct old_timeval32 tm32;
1432 struct __kernel_old_timeval tm;
1433 struct __kernel_sock_timeval stm;
1434 struct sock_txtime txtime;
1435 struct so_timestamping timestamping;
1436 } v;
1437
1438 int lv = sizeof(int);
1439 int len;
1440
1441 if (get_user(len, optlen))
1442 return -EFAULT;
1443 if (len < 0)
1444 return -EINVAL;
1445
1446 memset(&v, 0, sizeof(v));
1447
1448 switch (optname) {
1449 case SO_DEBUG:
1450 v.val = sock_flag(sk, SOCK_DBG);
1451 break;
1452
1453 case SO_DONTROUTE:
1454 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1455 break;
1456
1457 case SO_BROADCAST:
1458 v.val = sock_flag(sk, SOCK_BROADCAST);
1459 break;
1460
1461 case SO_SNDBUF:
1462 v.val = READ_ONCE(sk->sk_sndbuf);
1463 break;
1464
1465 case SO_RCVBUF:
1466 v.val = READ_ONCE(sk->sk_rcvbuf);
1467 break;
1468
1469 case SO_REUSEADDR:
1470 v.val = sk->sk_reuse;
1471 break;
1472
1473 case SO_REUSEPORT:
1474 v.val = sk->sk_reuseport;
1475 break;
1476
1477 case SO_KEEPALIVE:
1478 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1479 break;
1480
1481 case SO_TYPE:
1482 v.val = sk->sk_type;
1483 break;
1484
1485 case SO_PROTOCOL:
1486 v.val = sk->sk_protocol;
1487 break;
1488
1489 case SO_DOMAIN:
1490 v.val = sk->sk_family;
1491 break;
1492
1493 case SO_ERROR:
1494 v.val = -sock_error(sk);
1495 if (v.val == 0)
1496 v.val = xchg(&sk->sk_err_soft, 0);
1497 break;
1498
1499 case SO_OOBINLINE:
1500 v.val = sock_flag(sk, SOCK_URGINLINE);
1501 break;
1502
1503 case SO_NO_CHECK:
1504 v.val = sk->sk_no_check_tx;
1505 break;
1506
1507 case SO_PRIORITY:
1508 v.val = sk->sk_priority;
1509 break;
1510
1511 case SO_LINGER:
1512 lv = sizeof(v.ling);
1513 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1514 v.ling.l_linger = sk->sk_lingertime / HZ;
1515 break;
1516
1517 case SO_BSDCOMPAT:
1518 break;
1519
1520 case SO_TIMESTAMP_OLD:
1521 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1522 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1523 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1524 break;
1525
1526 case SO_TIMESTAMPNS_OLD:
1527 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1528 break;
1529
1530 case SO_TIMESTAMP_NEW:
1531 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1532 break;
1533
1534 case SO_TIMESTAMPNS_NEW:
1535 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1536 break;
1537
1538 case SO_TIMESTAMPING_OLD:
1539 case SO_TIMESTAMPING_NEW:
1540 lv = sizeof(v.timestamping);
1541 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1542 * returning the flags when they were set through the same option.
1543 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1544 */
1545 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1546 v.timestamping.flags = sk->sk_tsflags;
1547 v.timestamping.bind_phc = sk->sk_bind_phc;
1548 }
1549 break;
1550
1551 case SO_RCVTIMEO_OLD:
1552 case SO_RCVTIMEO_NEW:
1553 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1554 break;
1555
1556 case SO_SNDTIMEO_OLD:
1557 case SO_SNDTIMEO_NEW:
1558 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1559 break;
1560
1561 case SO_RCVLOWAT:
1562 v.val = READ_ONCE(sk->sk_rcvlowat);
1563 break;
1564
1565 case SO_SNDLOWAT:
1566 v.val = 1;
1567 break;
1568
1569 case SO_PASSCRED:
1570 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1571 break;
1572
1573 case SO_PEERCRED:
1574 {
1575 struct ucred peercred;
1576 if (len > sizeof(peercred))
1577 len = sizeof(peercred);
1578
1579 spin_lock(&sk->sk_peer_lock);
1580 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1581 spin_unlock(&sk->sk_peer_lock);
1582
1583 if (copy_to_user(optval, &peercred, len))
1584 return -EFAULT;
1585 goto lenout;
1586 }
1587
1588 case SO_PEERGROUPS:
1589 {
1590 const struct cred *cred;
1591 int ret, n;
1592
1593 cred = sk_get_peer_cred(sk);
1594 if (!cred)
1595 return -ENODATA;
1596
1597 n = cred->group_info->ngroups;
1598 if (len < n * sizeof(gid_t)) {
1599 len = n * sizeof(gid_t);
1600 put_cred(cred);
1601 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1602 }
1603 len = n * sizeof(gid_t);
1604
1605 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1606 put_cred(cred);
1607 if (ret)
1608 return ret;
1609 goto lenout;
1610 }
1611
1612 case SO_PEERNAME:
1613 {
1614 char address[128];
1615
1616 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1617 if (lv < 0)
1618 return -ENOTCONN;
1619 if (lv < len)
1620 return -EINVAL;
1621 if (copy_to_user(optval, address, len))
1622 return -EFAULT;
1623 goto lenout;
1624 }
1625
1626 /* Dubious BSD thing... Probably nobody even uses it, but
1627 * the UNIX standard wants it for whatever reason... -DaveM
1628 */
1629 case SO_ACCEPTCONN:
1630 v.val = sk->sk_state == TCP_LISTEN;
1631 break;
1632
1633 case SO_PASSSEC:
1634 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1635 break;
1636
1637 case SO_PEERSEC:
1638 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1639
1640 case SO_MARK:
1641 v.val = sk->sk_mark;
1642 break;
1643
1644 case SO_RXQ_OVFL:
1645 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1646 break;
1647
1648 case SO_WIFI_STATUS:
1649 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1650 break;
1651
1652 case SO_PEEK_OFF:
1653 if (!sock->ops->set_peek_off)
1654 return -EOPNOTSUPP;
1655
1656 v.val = READ_ONCE(sk->sk_peek_off);
1657 break;
1658 case SO_NOFCS:
1659 v.val = sock_flag(sk, SOCK_NOFCS);
1660 break;
1661
1662 case SO_BINDTODEVICE:
1663 return sock_getbindtodevice(sk, optval, optlen, len);
1664
1665 case SO_GET_FILTER:
1666 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1667 if (len < 0)
1668 return len;
1669
1670 goto lenout;
1671
1672 case SO_LOCK_FILTER:
1673 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1674 break;
1675
1676 case SO_BPF_EXTENSIONS:
1677 v.val = bpf_tell_extensions();
1678 break;
1679
1680 case SO_SELECT_ERR_QUEUE:
1681 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1682 break;
1683
1684 #ifdef CONFIG_NET_RX_BUSY_POLL
1685 case SO_BUSY_POLL:
1686 v.val = READ_ONCE(sk->sk_ll_usec);
1687 break;
1688 case SO_PREFER_BUSY_POLL:
1689 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1690 break;
1691 #endif
1692
1693 case SO_MAX_PACING_RATE:
1694 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1695 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1696 lv = sizeof(v.ulval);
1697 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1698 } else {
1699 /* 32bit version */
1700 v.val = min_t(unsigned long, ~0U,
1701 READ_ONCE(sk->sk_max_pacing_rate));
1702 }
1703 break;
1704
1705 case SO_INCOMING_CPU:
1706 v.val = READ_ONCE(sk->sk_incoming_cpu);
1707 break;
1708
1709 case SO_MEMINFO:
1710 {
1711 u32 meminfo[SK_MEMINFO_VARS];
1712
1713 sk_get_meminfo(sk, meminfo);
1714
1715 len = min_t(unsigned int, len, sizeof(meminfo));
1716 if (copy_to_user(optval, &meminfo, len))
1717 return -EFAULT;
1718
1719 goto lenout;
1720 }
1721
1722 #ifdef CONFIG_NET_RX_BUSY_POLL
1723 case SO_INCOMING_NAPI_ID:
1724 v.val = READ_ONCE(sk->sk_napi_id);
1725
1726 /* aggregate non-NAPI IDs down to 0 */
1727 if (v.val < MIN_NAPI_ID)
1728 v.val = 0;
1729
1730 break;
1731 #endif
1732
1733 case SO_COOKIE:
1734 lv = sizeof(u64);
1735 if (len < lv)
1736 return -EINVAL;
1737 v.val64 = sock_gen_cookie(sk);
1738 break;
1739
1740 case SO_ZEROCOPY:
1741 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1742 break;
1743
1744 case SO_TXTIME:
1745 lv = sizeof(v.txtime);
1746 v.txtime.clockid = sk->sk_clockid;
1747 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1748 SOF_TXTIME_DEADLINE_MODE : 0;
1749 v.txtime.flags |= sk->sk_txtime_report_errors ?
1750 SOF_TXTIME_REPORT_ERRORS : 0;
1751 break;
1752
1753 case SO_BINDTOIFINDEX:
1754 v.val = sk->sk_bound_dev_if;
1755 break;
1756
1757 case SO_NETNS_COOKIE:
1758 lv = sizeof(u64);
1759 if (len != lv)
1760 return -EINVAL;
1761 v.val64 = sock_net(sk)->net_cookie;
1762 break;
1763
1764 case SO_BUF_LOCK:
1765 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1766 break;
1767
1768 default:
1769 /* We implement the SO_SNDLOWAT etc to not be settable
1770 * (1003.1g 7).
1771 */
1772 return -ENOPROTOOPT;
1773 }
1774
1775 if (len > lv)
1776 len = lv;
1777 if (copy_to_user(optval, &v, len))
1778 return -EFAULT;
1779 lenout:
1780 if (put_user(len, optlen))
1781 return -EFAULT;
1782 return 0;
1783 }
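
/*
 * Userspace sketch (illustrative): reading the peer credentials of a
 * connected AF_UNIX socket through the SO_PEERCRED branch above.
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *	if (!getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len))
 *		printf("pid=%d uid=%u gid=%u\n", peer.pid, peer.uid, peer.gid);
 */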
1784
1785 /*
1786 * Initialize an sk_lock.
1787 *
1788 * (We also register the sk_lock with the lock validator.)
1789 */
1790 static inline void sock_lock_init(struct sock *sk)
1791 {
1792 if (sk->sk_kern_sock)
1793 sock_lock_init_class_and_name(
1794 sk,
1795 af_family_kern_slock_key_strings[sk->sk_family],
1796 af_family_kern_slock_keys + sk->sk_family,
1797 af_family_kern_key_strings[sk->sk_family],
1798 af_family_kern_keys + sk->sk_family);
1799 else
1800 sock_lock_init_class_and_name(
1801 sk,
1802 af_family_slock_key_strings[sk->sk_family],
1803 af_family_slock_keys + sk->sk_family,
1804 af_family_key_strings[sk->sk_family],
1805 af_family_keys + sk->sk_family);
1806 }
1807
1808 /*
1809 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1810 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1811 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1812 */
1813 static void sock_copy(struct sock *nsk, const struct sock *osk)
1814 {
1815 const struct proto *prot = READ_ONCE(osk->sk_prot);
1816 #ifdef CONFIG_SECURITY_NETWORK
1817 void *sptr = nsk->sk_security;
1818 #endif
1819
1820 /* If we move sk_tx_queue_mapping out of the private section,
1821 * we must check if sk_tx_queue_clear() is called after
1822 * sock_copy() in sk_clone_lock().
1823 */
1824 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1825 offsetof(struct sock, sk_dontcopy_begin) ||
1826 offsetof(struct sock, sk_tx_queue_mapping) >=
1827 offsetof(struct sock, sk_dontcopy_end));
1828
1829 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1830
1831 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1832 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1833
1834 #ifdef CONFIG_SECURITY_NETWORK
1835 nsk->sk_security = sptr;
1836 security_sk_clone(osk, nsk);
1837 #endif
1838 }
1839
1840 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1841 int family)
1842 {
1843 struct sock *sk;
1844 struct kmem_cache *slab;
1845
1846 slab = prot->slab;
1847 if (slab != NULL) {
1848 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1849 if (!sk)
1850 return sk;
1851 if (want_init_on_alloc(priority))
1852 sk_prot_clear_nulls(sk, prot->obj_size);
1853 } else
1854 sk = kmalloc(prot->obj_size, priority);
1855
1856 if (sk != NULL) {
1857 if (security_sk_alloc(sk, family, priority))
1858 goto out_free;
1859
1860 trace_android_rvh_sk_alloc(sk);
1861
1862 if (!try_module_get(prot->owner))
1863 goto out_free_sec;
1864 }
1865
1866 return sk;
1867
1868 out_free_sec:
1869 security_sk_free(sk);
1870 trace_android_rvh_sk_free(sk);
1871 out_free:
1872 if (slab != NULL)
1873 kmem_cache_free(slab, sk);
1874 else
1875 kfree(sk);
1876 return NULL;
1877 }
1878
1879 static void sk_prot_free(struct proto *prot, struct sock *sk)
1880 {
1881 struct kmem_cache *slab;
1882 struct module *owner;
1883
1884 owner = prot->owner;
1885 slab = prot->slab;
1886
1887 cgroup_sk_free(&sk->sk_cgrp_data);
1888 mem_cgroup_sk_free(sk);
1889 security_sk_free(sk);
1890 trace_android_rvh_sk_free(sk);
1891 if (slab != NULL)
1892 kmem_cache_free(slab, sk);
1893 else
1894 kfree(sk);
1895 module_put(owner);
1896 }
1897
1898 /**
1899 * sk_alloc - All socket objects are allocated here
1900 * @net: the applicable net namespace
1901 * @family: protocol family
1902 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1903 * @prot: struct proto associated with this new sock instance
1904 * @kern: is this to be a kernel socket?
1905 */
1906 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1907 struct proto *prot, int kern)
1908 {
1909 struct sock *sk;
1910
1911 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1912 if (sk) {
1913 sk->sk_family = family;
1914 /*
1915 * See comment in struct sock definition to understand
1916 * why we need sk_prot_creator -acme
1917 */
1918 sk->sk_prot = sk->sk_prot_creator = prot;
1919 sk->sk_kern_sock = kern;
1920 sock_lock_init(sk);
1921 sk->sk_net_refcnt = kern ? 0 : 1;
1922 if (likely(sk->sk_net_refcnt)) {
1923 get_net(net);
1924 sock_inuse_add(net, 1);
1925 }
1926
1927 sock_net_set(sk, net);
1928 refcount_set(&sk->sk_wmem_alloc, 1);
1929
1930 mem_cgroup_sk_alloc(sk);
1931 cgroup_sk_alloc(&sk->sk_cgrp_data);
1932 sock_update_classid(&sk->sk_cgrp_data);
1933 sock_update_netprioidx(&sk->sk_cgrp_data);
1934 sk_tx_queue_clear(sk);
1935 }
1936
1937 return sk;
1938 }
1939 EXPORT_SYMBOL(sk_alloc);
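
/*
 * Usage sketch (illustrative): a protocol family's ->create() handler
 * allocates its sock with its own struct proto and then initializes the
 * generic fields.  "my_proto" is a hypothetical protocol, not defined here.
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */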
1940
1941 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1942 * grace period. This is the case for UDP sockets and TCP listeners.
1943 */
1944 static void __sk_destruct(struct rcu_head *head)
1945 {
1946 struct sock *sk = container_of(head, struct sock, sk_rcu);
1947 struct sk_filter *filter;
1948
1949 if (sk->sk_destruct)
1950 sk->sk_destruct(sk);
1951
1952 filter = rcu_dereference_check(sk->sk_filter,
1953 refcount_read(&sk->sk_wmem_alloc) == 0);
1954 if (filter) {
1955 sk_filter_uncharge(sk, filter);
1956 RCU_INIT_POINTER(sk->sk_filter, NULL);
1957 }
1958
1959 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1960
1961 #ifdef CONFIG_BPF_SYSCALL
1962 bpf_sk_storage_free(sk);
1963 #endif
1964
1965 if (atomic_read(&sk->sk_omem_alloc))
1966 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1967 __func__, atomic_read(&sk->sk_omem_alloc));
1968
1969 if (sk->sk_frag.page) {
1970 put_page(sk->sk_frag.page);
1971 sk->sk_frag.page = NULL;
1972 }
1973
1974 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1975 put_cred(sk->sk_peer_cred);
1976 put_pid(sk->sk_peer_pid);
1977
1978 if (likely(sk->sk_net_refcnt))
1979 put_net(sock_net(sk));
1980 sk_prot_free(sk->sk_prot_creator, sk);
1981 }
1982
1983 void sk_destruct(struct sock *sk)
1984 {
1985 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1986
1987 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1988 reuseport_detach_sock(sk);
1989 use_call_rcu = true;
1990 }
1991
1992 if (use_call_rcu)
1993 call_rcu(&sk->sk_rcu, __sk_destruct);
1994 else
1995 __sk_destruct(&sk->sk_rcu);
1996 }
1997
1998 static void __sk_free(struct sock *sk)
1999 {
2000 if (likely(sk->sk_net_refcnt))
2001 sock_inuse_add(sock_net(sk), -1);
2002
2003 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2004 sock_diag_broadcast_destroy(sk);
2005 else
2006 sk_destruct(sk);
2007 }
2008
2009 void sk_free(struct sock *sk)
2010 {
2011 /*
2012 * We subtract one from sk_wmem_alloc; if the result is not zero,
2013 * some packets are still in a tx queue and sock_wfree() will
2014 * call __sk_free(sk) later, once they have been freed.
2015 */
2016 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2017 __sk_free(sk);
2018 }
2019 EXPORT_SYMBOL(sk_free);
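
/* Illustrative sketch (not part of the original file): how sk_alloc(),
 * skb_set_owner_w() and sk_free() cooperate through sk_wmem_alloc.  The
 * protocol name below is hypothetical and error handling is omitted:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &hypothetical_prot, 1);
 *	...
 *	skb_set_owner_w(skb, sk);	// sk_wmem_alloc += skb->truesize
 *	sk_free(sk);			// drops only the initial unit set by sk_alloc()
 *	kfree_skb(skb);			// sock_wfree() drops the rest -> __sk_free(sk)
 */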
2020
2021 static void sk_init_common(struct sock *sk)
2022 {
2023 skb_queue_head_init(&sk->sk_receive_queue);
2024 skb_queue_head_init(&sk->sk_write_queue);
2025 skb_queue_head_init(&sk->sk_error_queue);
2026
2027 rwlock_init(&sk->sk_callback_lock);
2028 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2029 af_rlock_keys + sk->sk_family,
2030 af_family_rlock_key_strings[sk->sk_family]);
2031 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2032 af_wlock_keys + sk->sk_family,
2033 af_family_wlock_key_strings[sk->sk_family]);
2034 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2035 af_elock_keys + sk->sk_family,
2036 af_family_elock_key_strings[sk->sk_family]);
2037 lockdep_set_class_and_name(&sk->sk_callback_lock,
2038 af_callback_keys + sk->sk_family,
2039 af_family_clock_key_strings[sk->sk_family]);
2040 }
2041
2042 /**
2043 * sk_clone_lock - clone a socket, and lock its clone
2044 * @sk: the socket to clone
2045 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2046 *
2047 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2048 */
2049 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2050 {
2051 struct proto *prot = READ_ONCE(sk->sk_prot);
2052 struct sk_filter *filter;
2053 bool is_charged = true;
2054 struct sock *newsk;
2055
2056 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2057 if (!newsk)
2058 goto out;
2059
2060 sock_copy(newsk, sk);
2061
2062 newsk->sk_prot_creator = prot;
2063
2064 /* SANITY */
2065 if (likely(newsk->sk_net_refcnt)) {
2066 get_net(sock_net(newsk));
2067 sock_inuse_add(sock_net(newsk), 1);
2068 }
2069 sk_node_init(&newsk->sk_node);
2070 sock_lock_init(newsk);
2071 bh_lock_sock(newsk);
2072 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2073 newsk->sk_backlog.len = 0;
2074
2075 atomic_set(&newsk->sk_rmem_alloc, 0);
2076
2077 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2078 refcount_set(&newsk->sk_wmem_alloc, 1);
2079
2080 atomic_set(&newsk->sk_omem_alloc, 0);
2081 sk_init_common(newsk);
2082
2083 newsk->sk_dst_cache = NULL;
2084 newsk->sk_dst_pending_confirm = 0;
2085 newsk->sk_wmem_queued = 0;
2086 newsk->sk_forward_alloc = 0;
2087 atomic_set(&newsk->sk_drops, 0);
2088 newsk->sk_send_head = NULL;
2089 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2090 atomic_set(&newsk->sk_zckey, 0);
2091
2092 sock_reset_flag(newsk, SOCK_DONE);
2093
2094 /* sk->sk_memcg will be populated at accept() time */
2095 newsk->sk_memcg = NULL;
2096
2097 cgroup_sk_clone(&newsk->sk_cgrp_data);
2098
2099 rcu_read_lock();
2100 filter = rcu_dereference(sk->sk_filter);
2101 if (filter != NULL)
2102 /* though it's an empty new sock, the charging may fail
2103 * if sysctl_optmem_max was changed between the creation of
2104 * the original socket and the cloning
2105 */
2106 is_charged = sk_filter_charge(newsk, filter);
2107 RCU_INIT_POINTER(newsk->sk_filter, filter);
2108 rcu_read_unlock();
2109
2110 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2111 /* We need to make sure that we don't uncharge the new
2112 * socket if we couldn't charge it in the first place
2113 * as otherwise we uncharge the parent's filter.
2114 */
2115 if (!is_charged)
2116 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2117 sk_free_unlock_clone(newsk);
2118 newsk = NULL;
2119 goto out;
2120 }
2121 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2122
2123 if (bpf_sk_storage_clone(sk, newsk)) {
2124 sk_free_unlock_clone(newsk);
2125 newsk = NULL;
2126 goto out;
2127 }
2128
2129 /* Clear sk_user_data if parent had the pointer tagged
2130 * as not suitable for copying when cloning.
2131 */
2132 if (sk_user_data_is_nocopy(newsk))
2133 newsk->sk_user_data = NULL;
2134
2135 newsk->sk_err = 0;
2136 newsk->sk_err_soft = 0;
2137 newsk->sk_priority = 0;
2138 newsk->sk_incoming_cpu = raw_smp_processor_id();
2139
2140 /* Before updating sk_refcnt, we must commit prior changes to memory
2141 * (Documentation/RCU/rculist_nulls.rst for details)
2142 */
2143 smp_wmb();
2144 refcount_set(&newsk->sk_refcnt, 2);
2145
2146 /* Increment the counter in the same struct proto as the master
2147 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2148 * is the same as sk->sk_prot->socks, as this field was copied
2149 * with memcpy).
2150 *
2151 * This _changes_ the previous behaviour, where
2152 * tcp_create_openreq_child always incremented the
2153 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
2154 * to be taken into account in all callers. -acme
2155 */
2156 sk_refcnt_debug_inc(newsk);
2157 sk_set_socket(newsk, NULL);
2158 sk_tx_queue_clear(newsk);
2159 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2160
2161 if (newsk->sk_prot->sockets_allocated)
2162 sk_sockets_allocated_inc(newsk);
2163
2164 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2165 net_enable_timestamp();
2166 out:
2167 return newsk;
2168 }
2169 EXPORT_SYMBOL_GPL(sk_clone_lock);
2170
2171 void sk_free_unlock_clone(struct sock *sk)
2172 {
2173 /* It is still a raw copy of the parent, so invalidate
2174 * the destructor and do a plain sk_free() */
2175 sk->sk_destruct = NULL;
2176 bh_unlock_sock(sk);
2177 sk_free(sk);
2178 }
2179 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2180
2181 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2182 {
2183 u32 max_segs = 1;
2184
2185 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2186 if (sk->sk_route_caps & NETIF_F_GSO)
2187 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2188 sk->sk_route_caps &= ~sk->sk_route_nocaps;
2189 if (sk_can_gso(sk)) {
2190 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2191 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2192 } else {
2193 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2194 sk->sk_gso_max_size = dst->dev->gso_max_size;
2195 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2196 }
2197 }
2198 sk->sk_gso_max_segs = max_segs;
2199 sk_dst_set(sk, dst);
2200 }
2201 EXPORT_SYMBOL_GPL(sk_setup_caps);
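
/* Illustrative sketch only (not from this file): a protocol's connect path
 * typically calls sk_setup_caps() once it has a route, so the socket picks
 * up GSO/SG capabilities from the output device:
 *
 *	// rt obtained from a routing lookup, e.g. ip_route_output_ports()
 *	sk_setup_caps(sk, &rt->dst);
 */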
2202
2203 /*
2204 * Simple resource managers for sockets.
2205 */
2206
2207
2208 /*
2209 * Write buffer destructor automatically called from kfree_skb.
2210 */
2211 void sock_wfree(struct sk_buff *skb)
2212 {
2213 struct sock *sk = skb->sk;
2214 unsigned int len = skb->truesize;
2215
2216 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2217 /*
2218 * Keep a reference on sk_wmem_alloc; it will be released
2219 * after the sk_write_space() call
2220 */
2221 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2222 sk->sk_write_space(sk);
2223 len = 1;
2224 }
2225 /*
2226 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2227 * could not do because of in-flight packets
2228 */
2229 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2230 __sk_free(sk);
2231 }
2232 EXPORT_SYMBOL(sock_wfree);
2233
2234 /* This variant of sock_wfree() is used by TCP,
2235 * since it sets SOCK_USE_WRITE_QUEUE.
2236 */
2237 void __sock_wfree(struct sk_buff *skb)
2238 {
2239 struct sock *sk = skb->sk;
2240
2241 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2242 __sk_free(sk);
2243 }
2244
2245 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2246 {
2247 skb_orphan(skb);
2248 skb->sk = sk;
2249 #ifdef CONFIG_INET
2250 if (unlikely(!sk_fullsock(sk))) {
2251 skb->destructor = sock_edemux;
2252 sock_hold(sk);
2253 return;
2254 }
2255 #endif
2256 skb->destructor = sock_wfree;
2257 skb_set_hash_from_sk(skb, sk);
2258 /*
2259 * We used to take a refcount on sk, but the following operation
2260 * is enough to guarantee sk_free() won't free this sock until
2261 * all in-flight packets are completed
2262 */
2263 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2264 }
2265 EXPORT_SYMBOL(skb_set_owner_w);
2266
2267 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2268 {
2269 #ifdef CONFIG_TLS_DEVICE
2270 /* Drivers depend on in-order delivery for crypto offload,
2271 * partial orphan breaks out-of-order-OK logic.
2272 */
2273 if (skb->decrypted)
2274 return false;
2275 #endif
2276 return (skb->destructor == sock_wfree ||
2277 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2278 }
2279
2280 /* This helper is used by netem, as it can hold packets in its
2281 * delay queue. We want to allow the owner socket to send more
2282 * packets, as if they were already TX completed by a typical driver.
2283 * But we also want to keep skb->sk set because some packet schedulers
2284 * rely on it (sch_fq for example).
2285 */
2286 void skb_orphan_partial(struct sk_buff *skb)
2287 {
2288 if (skb_is_tcp_pure_ack(skb))
2289 return;
2290
2291 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2292 return;
2293
2294 skb_orphan(skb);
2295 }
2296 EXPORT_SYMBOL(skb_orphan_partial);
2297
2298 /*
2299 * Read buffer destructor automatically called from kfree_skb.
2300 */
2301 void sock_rfree(struct sk_buff *skb)
2302 {
2303 struct sock *sk = skb->sk;
2304 unsigned int len = skb->truesize;
2305
2306 atomic_sub(len, &sk->sk_rmem_alloc);
2307 sk_mem_uncharge(sk, len);
2308 }
2309 EXPORT_SYMBOL(sock_rfree);
2310
2311 /*
2312 * Buffer destructor for skbs that are not used directly in read or write
2313 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2314 */
2315 void sock_efree(struct sk_buff *skb)
2316 {
2317 sock_put(skb->sk);
2318 }
2319 EXPORT_SYMBOL(sock_efree);
2320
2321 /* Buffer destructor for prefetch/receive path where reference count may
2322 * not be held, e.g. for listen sockets.
2323 */
2324 #ifdef CONFIG_INET
2325 void sock_pfree(struct sk_buff *skb)
2326 {
2327 if (sk_is_refcounted(skb->sk))
2328 sock_gen_put(skb->sk);
2329 }
2330 EXPORT_SYMBOL(sock_pfree);
2331 #endif /* CONFIG_INET */
2332
2333 kuid_t sock_i_uid(struct sock *sk)
2334 {
2335 kuid_t uid;
2336
2337 read_lock_bh(&sk->sk_callback_lock);
2338 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2339 read_unlock_bh(&sk->sk_callback_lock);
2340 return uid;
2341 }
2342 EXPORT_SYMBOL(sock_i_uid);
2343
2344 unsigned long __sock_i_ino(struct sock *sk)
2345 {
2346 unsigned long ino;
2347
2348 read_lock(&sk->sk_callback_lock);
2349 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2350 read_unlock(&sk->sk_callback_lock);
2351 return ino;
2352 }
2353 EXPORT_SYMBOL(__sock_i_ino);
2354
2355 unsigned long sock_i_ino(struct sock *sk)
2356 {
2357 unsigned long ino;
2358
2359 local_bh_disable();
2360 ino = __sock_i_ino(sk);
2361 local_bh_enable();
2362 return ino;
2363 }
2364 EXPORT_SYMBOL(sock_i_ino);
2365
2366 /*
2367 * Allocate a skb from the socket's send buffer.
2368 */
2369 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2370 gfp_t priority)
2371 {
2372 if (force ||
2373 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2374 struct sk_buff *skb = alloc_skb(size, priority);
2375
2376 if (skb) {
2377 skb_set_owner_w(skb, sk);
2378 return skb;
2379 }
2380 }
2381 return NULL;
2382 }
2383 EXPORT_SYMBOL(sock_wmalloc);
2384
2385 static void sock_ofree(struct sk_buff *skb)
2386 {
2387 struct sock *sk = skb->sk;
2388
2389 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2390 }
2391
2392 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2393 gfp_t priority)
2394 {
2395 struct sk_buff *skb;
2396
2397 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2398 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2399 READ_ONCE(sysctl_optmem_max))
2400 return NULL;
2401
2402 skb = alloc_skb(size, priority);
2403 if (!skb)
2404 return NULL;
2405
2406 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2407 skb->sk = sk;
2408 skb->destructor = sock_ofree;
2409 return skb;
2410 }
2411
2412 /*
2413 * Allocate a memory block from the socket's option memory buffer.
2414 */
2415 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2416 {
2417 int optmem_max = READ_ONCE(sysctl_optmem_max);
2418
2419 if ((unsigned int)size <= optmem_max &&
2420 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2421 void *mem;
2422 /* First do the add, to avoid the race if kmalloc
2423 * might sleep.
2424 */
2425 atomic_add(size, &sk->sk_omem_alloc);
2426 mem = kmalloc(size, priority);
2427 if (mem)
2428 return mem;
2429 atomic_sub(size, &sk->sk_omem_alloc);
2430 }
2431 return NULL;
2432 }
2433 EXPORT_SYMBOL(sock_kmalloc);
2434
2435 /* Free an option memory block. Note, we actually want the inline
2436 * here as this allows gcc to detect the nullify and fold away the
2437 * condition entirely.
2438 */
2439 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2440 const bool nullify)
2441 {
2442 if (WARN_ON_ONCE(!mem))
2443 return;
2444 if (nullify)
2445 kfree_sensitive(mem);
2446 else
2447 kfree(mem);
2448 atomic_sub(size, &sk->sk_omem_alloc);
2449 }
2450
2451 void sock_kfree_s(struct sock *sk, void *mem, int size)
2452 {
2453 __sock_kfree_s(sk, mem, size, false);
2454 }
2455 EXPORT_SYMBOL(sock_kfree_s);
2456
2457 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2458 {
2459 __sock_kfree_s(sk, mem, size, true);
2460 }
2461 EXPORT_SYMBOL(sock_kzfree_s);
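
/* Illustrative sketch only: option memory is charged to sk_omem_alloc, so
 * callers must free with the same size they allocated, using
 * sock_kzfree_s() when the buffer may hold sensitive data:
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */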
2462
2463 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2464 I think, these locks should be removed for datagram sockets.
2465 */
2466 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2467 {
2468 DEFINE_WAIT(wait);
2469
2470 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2471 for (;;) {
2472 if (!timeo)
2473 break;
2474 if (signal_pending(current))
2475 break;
2476 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2477 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2478 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2479 break;
2480 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2481 break;
2482 if (READ_ONCE(sk->sk_err))
2483 break;
2484 timeo = schedule_timeout(timeo);
2485 }
2486 finish_wait(sk_sleep(sk), &wait);
2487 return timeo;
2488 }
2489
2490
2491 /*
2492 * Generic send/receive buffer handlers
2493 */
2494
2495 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2496 unsigned long data_len, int noblock,
2497 int *errcode, int max_page_order)
2498 {
2499 struct sk_buff *skb;
2500 long timeo;
2501 int err;
2502
2503 timeo = sock_sndtimeo(sk, noblock);
2504 for (;;) {
2505 err = sock_error(sk);
2506 if (err != 0)
2507 goto failure;
2508
2509 err = -EPIPE;
2510 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2511 goto failure;
2512
2513 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2514 break;
2515
2516 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2517 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2518 err = -EAGAIN;
2519 if (!timeo)
2520 goto failure;
2521 if (signal_pending(current))
2522 goto interrupted;
2523 timeo = sock_wait_for_wmem(sk, timeo);
2524 }
2525 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2526 errcode, sk->sk_allocation);
2527 if (skb)
2528 skb_set_owner_w(skb, sk);
2529 return skb;
2530
2531 interrupted:
2532 err = sock_intr_errno(timeo);
2533 failure:
2534 *errcode = err;
2535 return NULL;
2536 }
2537 EXPORT_SYMBOL(sock_alloc_send_pskb);
2538
2539 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2540 int noblock, int *errcode)
2541 {
2542 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2543 }
2544 EXPORT_SYMBOL(sock_alloc_send_skb);
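
/* Illustrative sketch only, loosely following how datagram sendmsg
 * implementations use this helper; "hlen" and the error label are
 * hypothetical:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;	// err is -EAGAIN, -EPIPE, sock_intr_errno(), ...
 *	skb_reserve(skb, hlen);
 */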
2545
2546 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2547 struct sockcm_cookie *sockc)
2548 {
2549 u32 tsflags;
2550
2551 switch (cmsg->cmsg_type) {
2552 case SO_MARK:
2553 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2554 return -EPERM;
2555 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2556 return -EINVAL;
2557 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2558 break;
2559 case SO_TIMESTAMPING_OLD:
2560 case SO_TIMESTAMPING_NEW:
2561 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2562 return -EINVAL;
2563
2564 tsflags = *(u32 *)CMSG_DATA(cmsg);
2565 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2566 return -EINVAL;
2567
2568 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2569 sockc->tsflags |= tsflags;
2570 break;
2571 case SCM_TXTIME:
2572 if (!sock_flag(sk, SOCK_TXTIME))
2573 return -EINVAL;
2574 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2575 return -EINVAL;
2576 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2577 break;
2578 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2579 case SCM_RIGHTS:
2580 case SCM_CREDENTIALS:
2581 break;
2582 default:
2583 return -EINVAL;
2584 }
2585 return 0;
2586 }
2587 EXPORT_SYMBOL(__sock_cmsg_send);
2588
2589 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2590 struct sockcm_cookie *sockc)
2591 {
2592 struct cmsghdr *cmsg;
2593 int ret;
2594
2595 for_each_cmsghdr(cmsg, msg) {
2596 if (!CMSG_OK(msg, cmsg))
2597 return -EINVAL;
2598 if (cmsg->cmsg_level != SOL_SOCKET)
2599 continue;
2600 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2601 if (ret)
2602 return ret;
2603 }
2604 return 0;
2605 }
2606 EXPORT_SYMBOL(sock_cmsg_send);
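
/* Illustrative sketch only: senders seed a sockcm_cookie from the socket
 * defaults and then let sock_cmsg_send() override it from the control
 * messages supplied by userspace:
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */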
2607
2608 static void sk_enter_memory_pressure(struct sock *sk)
2609 {
2610 if (!sk->sk_prot->enter_memory_pressure)
2611 return;
2612
2613 sk->sk_prot->enter_memory_pressure(sk);
2614 }
2615
2616 static void sk_leave_memory_pressure(struct sock *sk)
2617 {
2618 if (sk->sk_prot->leave_memory_pressure) {
2619 sk->sk_prot->leave_memory_pressure(sk);
2620 } else {
2621 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2622
2623 if (memory_pressure && READ_ONCE(*memory_pressure))
2624 WRITE_ONCE(*memory_pressure, 0);
2625 }
2626 }
2627
2628 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2629
2630 /**
2631 * skb_page_frag_refill - check that a page_frag contains enough room
2632 * @sz: minimum size of the fragment we want to get
2633 * @pfrag: pointer to page_frag
2634 * @gfp: priority for memory allocation
2635 *
2636 * Note: While this allocator tries to use high order pages, there is
2637 * no guarantee that allocations succeed. Therefore, @sz MUST be
2638 * less than or equal to PAGE_SIZE.
2639 */
2640 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2641 {
2642 if (pfrag->page) {
2643 if (page_ref_count(pfrag->page) == 1) {
2644 pfrag->offset = 0;
2645 return true;
2646 }
2647 if (pfrag->offset + sz <= pfrag->size)
2648 return true;
2649 put_page(pfrag->page);
2650 }
2651
2652 pfrag->offset = 0;
2653 if (SKB_FRAG_PAGE_ORDER &&
2654 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2655 /* Avoid direct reclaim but allow kswapd to wake */
2656 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2657 __GFP_COMP | __GFP_NOWARN |
2658 __GFP_NORETRY,
2659 SKB_FRAG_PAGE_ORDER);
2660 if (likely(pfrag->page)) {
2661 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2662 return true;
2663 }
2664 }
2665 pfrag->page = alloc_page(gfp);
2666 if (likely(pfrag->page)) {
2667 pfrag->size = PAGE_SIZE;
2668 return true;
2669 }
2670 return false;
2671 }
2672 EXPORT_SYMBOL(skb_page_frag_refill);
2673
2674 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2675 {
2676 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2677 return true;
2678
2679 sk_enter_memory_pressure(sk);
2680 sk_stream_moderate_sndbuf(sk);
2681 return false;
2682 }
2683 EXPORT_SYMBOL(sk_page_frag_refill);
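
/* Illustrative sketch only, roughly what stream sendmsg paths do with the
 * per-socket page_frag; the "wait_for_memory" label is hypothetical:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	// copy user data into pfrag->page at pfrag->offset, then:
 *	pfrag->offset += copy;
 */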
2684
2685 void __lock_sock(struct sock *sk)
2686 __releases(&sk->sk_lock.slock)
2687 __acquires(&sk->sk_lock.slock)
2688 {
2689 DEFINE_WAIT(wait);
2690
2691 for (;;) {
2692 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2693 TASK_UNINTERRUPTIBLE);
2694 spin_unlock_bh(&sk->sk_lock.slock);
2695 schedule();
2696 spin_lock_bh(&sk->sk_lock.slock);
2697 if (!sock_owned_by_user(sk))
2698 break;
2699 }
2700 finish_wait(&sk->sk_lock.wq, &wait);
2701 }
2702
2703 void __release_sock(struct sock *sk)
2704 __releases(&sk->sk_lock.slock)
2705 __acquires(&sk->sk_lock.slock)
2706 {
2707 struct sk_buff *skb, *next;
2708
2709 while ((skb = sk->sk_backlog.head) != NULL) {
2710 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2711
2712 spin_unlock_bh(&sk->sk_lock.slock);
2713
2714 do {
2715 next = skb->next;
2716 prefetch(next);
2717 WARN_ON_ONCE(skb_dst_is_noref(skb));
2718 skb_mark_not_on_list(skb);
2719 sk_backlog_rcv(sk, skb);
2720
2721 cond_resched();
2722
2723 skb = next;
2724 } while (skb != NULL);
2725
2726 spin_lock_bh(&sk->sk_lock.slock);
2727 }
2728
2729 /*
2730 * Doing the zeroing here guarantees we cannot loop forever
2731 * while a wild producer attempts to flood us.
2732 */
2733 sk->sk_backlog.len = 0;
2734 }
2735
2736 void __sk_flush_backlog(struct sock *sk)
2737 {
2738 spin_lock_bh(&sk->sk_lock.slock);
2739 __release_sock(sk);
2740 spin_unlock_bh(&sk->sk_lock.slock);
2741 }
2742
2743 /**
2744 * sk_wait_data - wait for data to arrive at sk_receive_queue
2745 * @sk: sock to wait on
2746 * @timeo: for how long
2747 * @skb: last skb seen on sk_receive_queue
2748 *
2749 * Now socket state, including sk->sk_err, is changed only under the lock,
2750 * hence we may omit checks after joining the wait queue.
2751 * We check the receive queue before schedule() only as an optimization;
2752 * it is very likely that release_sock() added new data.
2753 */
2754 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2755 {
2756 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2757 int rc;
2758
2759 add_wait_queue(sk_sleep(sk), &wait);
2760 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2761 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2762 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2763 remove_wait_queue(sk_sleep(sk), &wait);
2764 return rc;
2765 }
2766 EXPORT_SYMBOL(sk_wait_data);
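
/* Illustrative sketch only: a recvmsg implementation blocks on
 * sk_wait_data() while holding the socket lock; sk_wait_event() drops and
 * re-takes the lock so the backlog can deliver new skbs:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	int err = 0;
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo) {
 *			err = -EAGAIN;
 *			break;
 *		}
 *		if (signal_pending(current)) {
 *			err = sock_intr_errno(timeo);
 *			break;
 *		}
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */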
2767
2768 /**
2769 * __sk_mem_raise_allocated - increase memory_allocated
2770 * @sk: socket
2771 * @size: memory size to allocate
2772 * @amt: pages to allocate
2773 * @kind: allocation type
2774 *
2775 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2776 */
2777 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2778 {
2779 struct proto *prot = sk->sk_prot;
2780 long allocated = sk_memory_allocated_add(sk, amt);
2781 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2782 bool charged = true;
2783
2784 if (memcg_charge &&
2785 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2786 gfp_memcg_charge())))
2787 goto suppress_allocation;
2788
2789 /* Under limit. */
2790 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2791 sk_leave_memory_pressure(sk);
2792 return 1;
2793 }
2794
2795 /* Under pressure. */
2796 if (allocated > sk_prot_mem_limits(sk, 1))
2797 sk_enter_memory_pressure(sk);
2798
2799 /* Over hard limit. */
2800 if (allocated > sk_prot_mem_limits(sk, 2))
2801 goto suppress_allocation;
2802
2803 /* guarantee minimum buffer size under pressure */
2804 if (kind == SK_MEM_RECV) {
2805 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2806 return 1;
2807
2808 } else { /* SK_MEM_SEND */
2809 int wmem0 = sk_get_wmem0(sk, prot);
2810
2811 if (sk->sk_type == SOCK_STREAM) {
2812 if (sk->sk_wmem_queued < wmem0)
2813 return 1;
2814 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2815 return 1;
2816 }
2817 }
2818
2819 if (sk_has_memory_pressure(sk)) {
2820 u64 alloc;
2821
2822 if (!sk_under_memory_pressure(sk))
2823 return 1;
2824 alloc = sk_sockets_allocated_read_positive(sk);
2825 if (sk_prot_mem_limits(sk, 2) > alloc *
2826 sk_mem_pages(sk->sk_wmem_queued +
2827 atomic_read(&sk->sk_rmem_alloc) +
2828 sk->sk_forward_alloc))
2829 return 1;
2830 }
2831
2832 suppress_allocation:
2833
2834 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2835 sk_stream_moderate_sndbuf(sk);
2836
2837 /* Fail only if socket is _under_ its sndbuf.
2838 * In this case we cannot block, so that we have to fail.
2839 */
2840 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2841 /* Force charge with __GFP_NOFAIL */
2842 if (memcg_charge && !charged) {
2843 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2844 gfp_memcg_charge() | __GFP_NOFAIL);
2845 }
2846 return 1;
2847 }
2848 }
2849
2850 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2851 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2852
2853 sk_memory_allocated_sub(sk, amt);
2854
2855 if (memcg_charge && charged)
2856 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2857
2858 return 0;
2859 }
2860 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2861
2862 /**
2863 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2864 * @sk: socket
2865 * @size: memory size to allocate
2866 * @kind: allocation type
2867 *
2868 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2869 * rmem allocation. This function assumes that protocols which have
2870 * memory_pressure use sk_wmem_queued as write buffer accounting.
2871 */
2872 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2873 {
2874 int ret, amt = sk_mem_pages(size);
2875
2876 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2877 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2878 if (!ret)
2879 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2880 return ret;
2881 }
2882 EXPORT_SYMBOL(__sk_mem_schedule);
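
/* Illustrative sketch only: protocols normally reach __sk_mem_schedule()
 * through the sk_rmem_schedule()/sk_wmem_schedule() wrappers, e.g. when
 * charging a received skb to the socket:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 *		atomic_inc(&sk->sk_drops);
 *		return -ENOBUFS;
 *	}
 *	skb_set_owner_r(skb, sk);
 */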
2883
2884 /**
2885 * __sk_mem_reduce_allocated - reclaim memory_allocated
2886 * @sk: socket
2887 * @amount: number of quanta
2888 *
2889 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2890 */
2891 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2892 {
2893 sk_memory_allocated_sub(sk, amount);
2894
2895 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2896 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2897
2898 if (sk_under_global_memory_pressure(sk) &&
2899 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2900 sk_leave_memory_pressure(sk);
2901 }
2902 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2903
2904 /**
2905 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2906 * @sk: socket
2907 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2908 */
2909 void __sk_mem_reclaim(struct sock *sk, int amount)
2910 {
2911 amount >>= SK_MEM_QUANTUM_SHIFT;
2912 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2913 __sk_mem_reduce_allocated(sk, amount);
2914 }
2915 EXPORT_SYMBOL(__sk_mem_reclaim);
2916
2917 int sk_set_peek_off(struct sock *sk, int val)
2918 {
2919 WRITE_ONCE(sk->sk_peek_off, val);
2920 return 0;
2921 }
2922 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2923
2924 /*
2925 * Set of default routines for initialising struct proto_ops when
2926 * the protocol does not support a particular function. In certain
2927 * cases where it makes no sense for a protocol to have a "do nothing"
2928 * function, some default processing is provided.
2929 */
2930
2931 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2932 {
2933 return -EOPNOTSUPP;
2934 }
2935 EXPORT_SYMBOL(sock_no_bind);
2936
2937 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2938 int len, int flags)
2939 {
2940 return -EOPNOTSUPP;
2941 }
2942 EXPORT_SYMBOL(sock_no_connect);
2943
2944 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2945 {
2946 return -EOPNOTSUPP;
2947 }
2948 EXPORT_SYMBOL(sock_no_socketpair);
2949
2950 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2951 bool kern)
2952 {
2953 return -EOPNOTSUPP;
2954 }
2955 EXPORT_SYMBOL(sock_no_accept);
2956
2957 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2958 int peer)
2959 {
2960 return -EOPNOTSUPP;
2961 }
2962 EXPORT_SYMBOL(sock_no_getname);
2963
2964 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2965 {
2966 return -EOPNOTSUPP;
2967 }
2968 EXPORT_SYMBOL(sock_no_ioctl);
2969
2970 int sock_no_listen(struct socket *sock, int backlog)
2971 {
2972 return -EOPNOTSUPP;
2973 }
2974 EXPORT_SYMBOL(sock_no_listen);
2975
2976 int sock_no_shutdown(struct socket *sock, int how)
2977 {
2978 return -EOPNOTSUPP;
2979 }
2980 EXPORT_SYMBOL(sock_no_shutdown);
2981
2982 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2983 {
2984 return -EOPNOTSUPP;
2985 }
2986 EXPORT_SYMBOL(sock_no_sendmsg);
2987
2988 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2989 {
2990 return -EOPNOTSUPP;
2991 }
2992 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2993
2994 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2995 int flags)
2996 {
2997 return -EOPNOTSUPP;
2998 }
2999 EXPORT_SYMBOL(sock_no_recvmsg);
3000
3001 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3002 {
3003 /* Mirror missing mmap method error code */
3004 return -ENODEV;
3005 }
3006 EXPORT_SYMBOL(sock_no_mmap);
3007
3008 /*
3009 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3010 * various sock-based usage counts.
3011 */
3012 void __receive_sock(struct file *file)
3013 {
3014 struct socket *sock;
3015
3016 sock = sock_from_file(file);
3017 if (sock) {
3018 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3019 sock_update_classid(&sock->sk->sk_cgrp_data);
3020 }
3021 }
3022
3023 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3024 {
3025 ssize_t res;
3026 struct msghdr msg = {.msg_flags = flags};
3027 struct kvec iov;
3028 char *kaddr = kmap(page);
3029 iov.iov_base = kaddr + offset;
3030 iov.iov_len = size;
3031 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3032 kunmap(page);
3033 return res;
3034 }
3035 EXPORT_SYMBOL(sock_no_sendpage);
3036
3037 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3038 int offset, size_t size, int flags)
3039 {
3040 ssize_t res;
3041 struct msghdr msg = {.msg_flags = flags};
3042 struct kvec iov;
3043 char *kaddr = kmap(page);
3044
3045 iov.iov_base = kaddr + offset;
3046 iov.iov_len = size;
3047 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3048 kunmap(page);
3049 return res;
3050 }
3051 EXPORT_SYMBOL(sock_no_sendpage_locked);
3052
3053 /*
3054 * Default Socket Callbacks
3055 */
3056
3057 static void sock_def_wakeup(struct sock *sk)
3058 {
3059 struct socket_wq *wq;
3060
3061 rcu_read_lock();
3062 wq = rcu_dereference(sk->sk_wq);
3063 if (skwq_has_sleeper(wq))
3064 wake_up_interruptible_all(&wq->wait);
3065 rcu_read_unlock();
3066 }
3067
3068 static void sock_def_error_report(struct sock *sk)
3069 {
3070 struct socket_wq *wq;
3071
3072 rcu_read_lock();
3073 wq = rcu_dereference(sk->sk_wq);
3074 if (skwq_has_sleeper(wq))
3075 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3076 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3077 rcu_read_unlock();
3078 }
3079
3080 void sock_def_readable(struct sock *sk)
3081 {
3082 struct socket_wq *wq;
3083
3084 rcu_read_lock();
3085 wq = rcu_dereference(sk->sk_wq);
3086
3087 if (skwq_has_sleeper(wq)) {
3088 int done = 0;
3089
3090 trace_android_vh_do_wake_up_sync(&wq->wait, &done);
3091 if (done)
3092 goto out;
3093
3094 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3095 EPOLLRDNORM | EPOLLRDBAND);
3096 }
3097
3098 out:
3099 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3100 rcu_read_unlock();
3101 }
3102
3103 static void sock_def_write_space(struct sock *sk)
3104 {
3105 struct socket_wq *wq;
3106
3107 rcu_read_lock();
3108
3109 /* Do not wake up a writer until he can make "significant"
3110 * progress. --DaveM
3111 */
3112 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3113 wq = rcu_dereference(sk->sk_wq);
3114 if (skwq_has_sleeper(wq))
3115 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3116 EPOLLWRNORM | EPOLLWRBAND);
3117
3118 /* Should agree with poll, otherwise some programs break */
3119 if (sock_writeable(sk))
3120 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3121 }
3122
3123 rcu_read_unlock();
3124 }
3125
3126 static void sock_def_destruct(struct sock *sk)
3127 {
3128 }
3129
3130 void sk_send_sigurg(struct sock *sk)
3131 {
3132 if (sk->sk_socket && sk->sk_socket->file)
3133 if (send_sigurg(&sk->sk_socket->file->f_owner))
3134 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3135 }
3136 EXPORT_SYMBOL(sk_send_sigurg);
3137
3138 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3139 unsigned long expires)
3140 {
3141 if (!mod_timer(timer, expires))
3142 sock_hold(sk);
3143 }
3144 EXPORT_SYMBOL(sk_reset_timer);
3145
3146 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3147 {
3148 if (del_timer(timer))
3149 __sock_put(sk);
3150 }
3151 EXPORT_SYMBOL(sk_stop_timer);
3152
3153 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3154 {
3155 if (del_timer_sync(timer))
3156 __sock_put(sk);
3157 }
3158 EXPORT_SYMBOL(sk_stop_timer_sync);
3159
3160 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3161 {
3162 sk_init_common(sk);
3163 sk->sk_send_head = NULL;
3164
3165 timer_setup(&sk->sk_timer, NULL, 0);
3166
3167 sk->sk_allocation = GFP_KERNEL;
3168 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3169 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3170 sk->sk_state = TCP_CLOSE;
3171 sk_set_socket(sk, sock);
3172
3173 sock_set_flag(sk, SOCK_ZAPPED);
3174
3175 if (sock) {
3176 sk->sk_type = sock->type;
3177 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3178 sock->sk = sk;
3179 } else {
3180 RCU_INIT_POINTER(sk->sk_wq, NULL);
3181 }
3182 sk->sk_uid = uid;
3183
3184 rwlock_init(&sk->sk_callback_lock);
3185 if (sk->sk_kern_sock)
3186 lockdep_set_class_and_name(
3187 &sk->sk_callback_lock,
3188 af_kern_callback_keys + sk->sk_family,
3189 af_family_kern_clock_key_strings[sk->sk_family]);
3190 else
3191 lockdep_set_class_and_name(
3192 &sk->sk_callback_lock,
3193 af_callback_keys + sk->sk_family,
3194 af_family_clock_key_strings[sk->sk_family]);
3195
3196 sk->sk_state_change = sock_def_wakeup;
3197 sk->sk_data_ready = sock_def_readable;
3198 sk->sk_write_space = sock_def_write_space;
3199 sk->sk_error_report = sock_def_error_report;
3200 sk->sk_destruct = sock_def_destruct;
3201
3202 sk->sk_frag.page = NULL;
3203 sk->sk_frag.offset = 0;
3204 sk->sk_peek_off = -1;
3205
3206 sk->sk_peer_pid = NULL;
3207 sk->sk_peer_cred = NULL;
3208 spin_lock_init(&sk->sk_peer_lock);
3209
3210 sk->sk_write_pending = 0;
3211 sk->sk_rcvlowat = 1;
3212 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3213 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3214
3215 sk->sk_stamp = SK_DEFAULT_STAMP;
3216 #if BITS_PER_LONG==32
3217 seqlock_init(&sk->sk_stamp_seq);
3218 #endif
3219 atomic_set(&sk->sk_zckey, 0);
3220
3221 #ifdef CONFIG_NET_RX_BUSY_POLL
3222 sk->sk_napi_id = 0;
3223 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3224 #endif
3225
3226 sk->sk_max_pacing_rate = ~0UL;
3227 sk->sk_pacing_rate = ~0UL;
3228 WRITE_ONCE(sk->sk_pacing_shift, 10);
3229 sk->sk_incoming_cpu = -1;
3230
3231 sk_rx_queue_clear(sk);
3232 /*
3233 * Before updating sk_refcnt, we must commit prior changes to memory
3234 * (Documentation/RCU/rculist_nulls.rst for details)
3235 */
3236 smp_wmb();
3237 refcount_set(&sk->sk_refcnt, 1);
3238 atomic_set(&sk->sk_drops, 0);
3239 }
3240 EXPORT_SYMBOL(sock_init_data_uid);
3241
3242 void sock_init_data(struct socket *sock, struct sock *sk)
3243 {
3244 kuid_t uid = sock ?
3245 SOCK_INODE(sock)->i_uid :
3246 make_kuid(sock_net(sk)->user_ns, 0);
3247
3248 sock_init_data_uid(sock, sk, uid);
3249 }
3250 EXPORT_SYMBOL(sock_init_data);
3251
3252 void lock_sock_nested(struct sock *sk, int subclass)
3253 {
3254 /* The sk_lock has mutex_lock() semantics here. */
3255 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3256
3257 might_sleep();
3258 spin_lock_bh(&sk->sk_lock.slock);
3259 if (sk->sk_lock.owned)
3260 __lock_sock(sk);
3261 sk->sk_lock.owned = 1;
3262 spin_unlock_bh(&sk->sk_lock.slock);
3263 }
3264 EXPORT_SYMBOL(lock_sock_nested);
3265
3266 void release_sock(struct sock *sk)
3267 {
3268 spin_lock_bh(&sk->sk_lock.slock);
3269 if (sk->sk_backlog.tail)
3270 __release_sock(sk);
3271
3272 /* Warning: release_cb() might need to release sk ownership,
3273 * i.e. call sock_release_ownership(sk) before us.
3274 */
3275 if (sk->sk_prot->release_cb)
3276 sk->sk_prot->release_cb(sk);
3277
3278 sock_release_ownership(sk);
3279 if (waitqueue_active(&sk->sk_lock.wq))
3280 wake_up(&sk->sk_lock.wq);
3281 spin_unlock_bh(&sk->sk_lock.slock);
3282 }
3283 EXPORT_SYMBOL(release_sock);
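
/* Illustrative sketch only: the canonical process-context locking pattern
 * around the routines above.  Packets arriving in softirq context while
 * the lock is owned are queued to the backlog and run by release_sock():
 *
 *	lock_sock(sk);
 *	... modify socket state, walk sk_receive_queue, etc ...
 *	release_sock(sk);
 */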
3284
3285 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3286 {
3287 might_sleep();
3288 spin_lock_bh(&sk->sk_lock.slock);
3289
3290 if (!sk->sk_lock.owned) {
3291 /*
3292 * Fast path return with bottom halves disabled and
3293 * sock::sk_lock.slock held.
3294 *
3295 * The 'mutex' is not contended and holding
3296 * sock::sk_lock.slock prevents all other lockers from
3297 * proceeding, so the corresponding unlock_sock_fast() can
3298 * avoid the slow path of release_sock() completely and
3299 * just release slock.
3300 *
3301 * From a semantic POV this is equivalent to 'acquiring'
3302 * the 'mutex', hence the corresponding lockdep
3303 * mutex_release() has to happen in the fast path of
3304 * unlock_sock_fast().
3305 */
3306 return false;
3307 }
3308
3309 __lock_sock(sk);
3310 sk->sk_lock.owned = 1;
3311 __acquire(&sk->sk_lock.slock);
3312 spin_unlock_bh(&sk->sk_lock.slock);
3313 return true;
3314 }
3315 EXPORT_SYMBOL(__lock_sock_fast);
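
/* Illustrative sketch only: callers pair this fast-path lock with
 * unlock_sock_fast(), passing back the value that tells it whether the
 * slow path (a full release_sock()) is needed:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short, non-sleeping critical section ...
 *	unlock_sock_fast(sk, slow);
 */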
3316
3317 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3318 bool timeval, bool time32)
3319 {
3320 struct sock *sk = sock->sk;
3321 struct timespec64 ts;
3322
3323 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3324 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3325 if (ts.tv_sec == -1)
3326 return -ENOENT;
3327 if (ts.tv_sec == 0) {
3328 ktime_t kt = ktime_get_real();
3329 sock_write_timestamp(sk, kt);
3330 ts = ktime_to_timespec64(kt);
3331 }
3332
3333 if (timeval)
3334 ts.tv_nsec /= 1000;
3335
3336 #ifdef CONFIG_COMPAT_32BIT_TIME
3337 if (time32)
3338 return put_old_timespec32(&ts, userstamp);
3339 #endif
3340 #ifdef CONFIG_SPARC64
3341 /* beware of padding in sparc64 timeval */
3342 if (timeval && !in_compat_syscall()) {
3343 struct __kernel_old_timeval __user tv = {
3344 .tv_sec = ts.tv_sec,
3345 .tv_usec = ts.tv_nsec,
3346 };
3347 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3348 return -EFAULT;
3349 return 0;
3350 }
3351 #endif
3352 return put_timespec64(&ts, userstamp);
3353 }
3354 EXPORT_SYMBOL(sock_gettstamp);
3355
3356 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3357 {
3358 if (!sock_flag(sk, flag)) {
3359 unsigned long previous_flags = sk->sk_flags;
3360
3361 sock_set_flag(sk, flag);
3362 /*
3363 * we just set one of the two flags which require net
3364 * time stamping, but time stamping might have been on
3365 * already because of the other one
3366 */
3367 if (sock_needs_netstamp(sk) &&
3368 !(previous_flags & SK_FLAGS_TIMESTAMP))
3369 net_enable_timestamp();
3370 }
3371 }
3372
3373 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3374 int level, int type)
3375 {
3376 struct sock_exterr_skb *serr;
3377 struct sk_buff *skb;
3378 int copied, err;
3379
3380 err = -EAGAIN;
3381 skb = sock_dequeue_err_skb(sk);
3382 if (skb == NULL)
3383 goto out;
3384
3385 copied = skb->len;
3386 if (copied > len) {
3387 msg->msg_flags |= MSG_TRUNC;
3388 copied = len;
3389 }
3390 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3391 if (err)
3392 goto out_free_skb;
3393
3394 sock_recv_timestamp(msg, sk, skb);
3395
3396 serr = SKB_EXT_ERR(skb);
3397 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3398
3399 msg->msg_flags |= MSG_ERRQUEUE;
3400 err = copied;
3401
3402 out_free_skb:
3403 kfree_skb(skb);
3404 out:
3405 return err;
3406 }
3407 EXPORT_SYMBOL(sock_recv_errqueue);
3408
3409 /*
3410 * Get a socket option on a socket.
3411 *
3412 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3413 * asynchronous errors should be reported by getsockopt. We assume
3414 * this means if you specify SO_ERROR (otherwise what's the point of it).
3415 */
3416 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3417 char __user *optval, int __user *optlen)
3418 {
3419 struct sock *sk = sock->sk;
3420
3421 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3422 }
3423 EXPORT_SYMBOL(sock_common_getsockopt);
3424
3425 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3426 int flags)
3427 {
3428 struct sock *sk = sock->sk;
3429 int addr_len = 0;
3430 int err;
3431
3432 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3433 flags & ~MSG_DONTWAIT, &addr_len);
3434 if (err >= 0)
3435 msg->msg_namelen = addr_len;
3436 return err;
3437 }
3438 EXPORT_SYMBOL(sock_common_recvmsg);
3439
3440 /*
3441 * Set socket options on an inet socket.
3442 */
3443 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3444 sockptr_t optval, unsigned int optlen)
3445 {
3446 struct sock *sk = sock->sk;
3447
3448 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3449 }
3450 EXPORT_SYMBOL(sock_common_setsockopt);
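
/* Illustrative sketch only: address families typically wire the
 * sock_common_*() helpers straight into their proto_ops, so socket-level
 * calls are forwarded to the underlying struct proto.  The ops table
 * below is hypothetical:
 *
 *	static const struct proto_ops hypothetical_stream_ops = {
 *		.family	    = PF_INET,
 *		.owner	    = THIS_MODULE,
 *		.setsockopt = sock_common_setsockopt,
 *		.getsockopt = sock_common_getsockopt,
 *		.recvmsg    = sock_common_recvmsg,
 *		...
 *	};
 */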
3451
3452 void sk_common_release(struct sock *sk)
3453 {
3454 if (sk->sk_prot->destroy)
3455 sk->sk_prot->destroy(sk);
3456
3457 /*
3458 * Observation: when sk_common_release is called, processes have
3459 * no access to the socket, but the network stack still does.
3460 * Step one, detach it from networking:
3461 *
3462 * A. Remove from hash tables.
3463 */
3464
3465 sk->sk_prot->unhash(sk);
3466
3467 /*
3468 * At this point the socket cannot receive new packets, but it is possible
3469 * that some packets are still in flight because some CPU runs the receiver
3470 * and did the hash table lookup before we unhashed the socket. They will
3471 * reach the receive queue and be purged by the socket destructor.
3472 *
3473 * Also, we still have packets pending on the receive queue and probably
3474 * our own packets waiting in device queues. sock_destroy will drain the
3475 * receive queue, but transmitted packets will delay socket destruction
3476 * until the last reference is released.
3477 */
3478
3479 sock_orphan(sk);
3480
3481 xfrm_sk_free_policy(sk);
3482
3483 sk_refcnt_debug_release(sk);
3484
3485 sock_put(sk);
3486 }
3487 EXPORT_SYMBOL(sk_common_release);
3488
3489 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3490 {
3491 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3492
3493 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3494 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3495 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3496 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3497 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3498 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3499 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3500 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3501 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3502 }
3503
3504 #ifdef CONFIG_PROC_FS
3505 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3506 struct prot_inuse {
3507 int val[PROTO_INUSE_NR];
3508 };
3509
3510 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3511
3512 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3513 {
3514 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3515 }
3516 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3517
3518 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3519 {
3520 int cpu, idx = prot->inuse_idx;
3521 int res = 0;
3522
3523 for_each_possible_cpu(cpu)
3524 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3525
3526 return res >= 0 ? res : 0;
3527 }
3528 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3529
3530 static void sock_inuse_add(struct net *net, int val)
3531 {
3532 this_cpu_add(*net->core.sock_inuse, val);
3533 }
3534
3535 int sock_inuse_get(struct net *net)
3536 {
3537 int cpu, res = 0;
3538
3539 for_each_possible_cpu(cpu)
3540 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3541
3542 return res;
3543 }
3544
3545 EXPORT_SYMBOL_GPL(sock_inuse_get);
3546
3547 static int __net_init sock_inuse_init_net(struct net *net)
3548 {
3549 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3550 if (net->core.prot_inuse == NULL)
3551 return -ENOMEM;
3552
3553 net->core.sock_inuse = alloc_percpu(int);
3554 if (net->core.sock_inuse == NULL)
3555 goto out;
3556
3557 return 0;
3558
3559 out:
3560 free_percpu(net->core.prot_inuse);
3561 return -ENOMEM;
3562 }
3563
3564 static void __net_exit sock_inuse_exit_net(struct net *net)
3565 {
3566 free_percpu(net->core.prot_inuse);
3567 free_percpu(net->core.sock_inuse);
3568 }
3569
3570 static struct pernet_operations net_inuse_ops = {
3571 .init = sock_inuse_init_net,
3572 .exit = sock_inuse_exit_net,
3573 };
3574
3575 static __init int net_inuse_init(void)
3576 {
3577 if (register_pernet_subsys(&net_inuse_ops))
3578 panic("Cannot initialize net inuse counters");
3579
3580 return 0;
3581 }
3582
3583 core_initcall(net_inuse_init);
3584
3585 static int assign_proto_idx(struct proto *prot)
3586 {
3587 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3588
3589 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3590 pr_err("PROTO_INUSE_NR exhausted\n");
3591 return -ENOSPC;
3592 }
3593
3594 set_bit(prot->inuse_idx, proto_inuse_idx);
3595 return 0;
3596 }
3597
3598 static void release_proto_idx(struct proto *prot)
3599 {
3600 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3601 clear_bit(prot->inuse_idx, proto_inuse_idx);
3602 }
3603 #else
3604 static inline int assign_proto_idx(struct proto *prot)
3605 {
3606 return 0;
3607 }
3608
3609 static inline void release_proto_idx(struct proto *prot)
3610 {
3611 }
3612
3613 static void sock_inuse_add(struct net *net, int val)
3614 {
3615 }
3616 #endif
3617
3618 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3619 {
3620 if (!twsk_prot)
3621 return;
3622 kfree(twsk_prot->twsk_slab_name);
3623 twsk_prot->twsk_slab_name = NULL;
3624 kmem_cache_destroy(twsk_prot->twsk_slab);
3625 twsk_prot->twsk_slab = NULL;
3626 }
3627
3628 static int tw_prot_init(const struct proto *prot)
3629 {
3630 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3631
3632 if (!twsk_prot)
3633 return 0;
3634
3635 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3636 prot->name);
3637 if (!twsk_prot->twsk_slab_name)
3638 return -ENOMEM;
3639
3640 twsk_prot->twsk_slab =
3641 kmem_cache_create(twsk_prot->twsk_slab_name,
3642 twsk_prot->twsk_obj_size, 0,
3643 SLAB_ACCOUNT | prot->slab_flags,
3644 NULL);
3645 if (!twsk_prot->twsk_slab) {
3646 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3647 prot->name);
3648 return -ENOMEM;
3649 }
3650
3651 return 0;
3652 }
3653
3654 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3655 {
3656 if (!rsk_prot)
3657 return;
3658 kfree(rsk_prot->slab_name);
3659 rsk_prot->slab_name = NULL;
3660 kmem_cache_destroy(rsk_prot->slab);
3661 rsk_prot->slab = NULL;
3662 }
3663
3664 static int req_prot_init(const struct proto *prot)
3665 {
3666 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3667
3668 if (!rsk_prot)
3669 return 0;
3670
3671 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3672 prot->name);
3673 if (!rsk_prot->slab_name)
3674 return -ENOMEM;
3675
3676 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3677 rsk_prot->obj_size, 0,
3678 SLAB_ACCOUNT | prot->slab_flags,
3679 NULL);
3680
3681 if (!rsk_prot->slab) {
3682 pr_crit("%s: Can't create request sock SLAB cache!\n",
3683 prot->name);
3684 return -ENOMEM;
3685 }
3686 return 0;
3687 }
3688
3689 int proto_register(struct proto *prot, int alloc_slab)
3690 {
3691 int ret = -ENOBUFS;
3692
3693 if (alloc_slab) {
3694 prot->slab = kmem_cache_create_usercopy(prot->name,
3695 prot->obj_size, 0,
3696 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3697 prot->slab_flags,
3698 prot->useroffset, prot->usersize,
3699 NULL);
3700
3701 if (prot->slab == NULL) {
3702 pr_crit("%s: Can't create sock SLAB cache!\n",
3703 prot->name);
3704 goto out;
3705 }
3706
3707 if (req_prot_init(prot))
3708 goto out_free_request_sock_slab;
3709
3710 if (tw_prot_init(prot))
3711 goto out_free_timewait_sock_slab;
3712 }
3713
3714 mutex_lock(&proto_list_mutex);
3715 ret = assign_proto_idx(prot);
3716 if (ret) {
3717 mutex_unlock(&proto_list_mutex);
3718 goto out_free_timewait_sock_slab;
3719 }
3720 list_add(&prot->node, &proto_list);
3721 mutex_unlock(&proto_list_mutex);
3722 return ret;
3723
3724 out_free_timewait_sock_slab:
3725 if (alloc_slab)
3726 tw_prot_cleanup(prot->twsk_prot);
3727 out_free_request_sock_slab:
3728 if (alloc_slab) {
3729 req_prot_cleanup(prot->rsk_prot);
3730
3731 kmem_cache_destroy(prot->slab);
3732 prot->slab = NULL;
3733 }
3734 out:
3735 return ret;
3736 }
3737 EXPORT_SYMBOL(proto_register);
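
/* Illustrative sketch only: a protocol module registers its struct proto
 * at init time and unregisters it on exit.  The structure below is
 * hypothetical and heavily trimmed:
 *
 *	static struct proto hypothetical_prot = {
 *		.name	  = "HYPOTHETICAL",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct hypothetical_sock),
 *	};
 *
 *	err = proto_register(&hypothetical_prot, 1);	// 1 => create a slab cache
 *	...
 *	proto_unregister(&hypothetical_prot);
 */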
3738
3739 void proto_unregister(struct proto *prot)
3740 {
3741 mutex_lock(&proto_list_mutex);
3742 release_proto_idx(prot);
3743 list_del(&prot->node);
3744 mutex_unlock(&proto_list_mutex);
3745
3746 kmem_cache_destroy(prot->slab);
3747 prot->slab = NULL;
3748
3749 req_prot_cleanup(prot->rsk_prot);
3750 tw_prot_cleanup(prot->twsk_prot);
3751 }
3752 EXPORT_SYMBOL(proto_unregister);
3753
3754 int sock_load_diag_module(int family, int protocol)
3755 {
3756 if (!protocol) {
3757 if (!sock_is_registered(family))
3758 return -ENOENT;
3759
3760 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3761 NETLINK_SOCK_DIAG, family);
3762 }
3763
3764 #ifdef CONFIG_INET
3765 if (family == AF_INET &&
3766 protocol != IPPROTO_RAW &&
3767 protocol < MAX_INET_PROTOS &&
3768 !rcu_access_pointer(inet_protos[protocol]))
3769 return -ENOENT;
3770 #endif
3771
3772 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3773 NETLINK_SOCK_DIAG, family, protocol);
3774 }
3775 EXPORT_SYMBOL(sock_load_diag_module);
3776
3777 #ifdef CONFIG_PROC_FS
3778 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3779 __acquires(proto_list_mutex)
3780 {
3781 mutex_lock(&proto_list_mutex);
3782 return seq_list_start_head(&proto_list, *pos);
3783 }
3784
3785 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3786 {
3787 return seq_list_next(v, &proto_list, pos);
3788 }
3789
3790 static void proto_seq_stop(struct seq_file *seq, void *v)
3791 __releases(proto_list_mutex)
3792 {
3793 mutex_unlock(&proto_list_mutex);
3794 }
3795
3796 static char proto_method_implemented(const void *method)
3797 {
3798 return method == NULL ? 'n' : 'y';
3799 }
3800 static long sock_prot_memory_allocated(struct proto *proto)
3801 {
3802 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3803 }
3804
3805 static const char *sock_prot_memory_pressure(struct proto *proto)
3806 {
3807 return proto->memory_pressure != NULL ?
3808 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3809 }
3810
3811 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3812 {
3813
3814 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3815 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3816 proto->name,
3817 proto->obj_size,
3818 sock_prot_inuse_get(seq_file_net(seq), proto),
3819 sock_prot_memory_allocated(proto),
3820 sock_prot_memory_pressure(proto),
3821 proto->max_header,
3822 proto->slab == NULL ? "no" : "yes",
3823 module_name(proto->owner),
3824 proto_method_implemented(proto->close),
3825 proto_method_implemented(proto->connect),
3826 proto_method_implemented(proto->disconnect),
3827 proto_method_implemented(proto->accept),
3828 proto_method_implemented(proto->ioctl),
3829 proto_method_implemented(proto->init),
3830 proto_method_implemented(proto->destroy),
3831 proto_method_implemented(proto->shutdown),
3832 proto_method_implemented(proto->setsockopt),
3833 proto_method_implemented(proto->getsockopt),
3834 proto_method_implemented(proto->sendmsg),
3835 proto_method_implemented(proto->recvmsg),
3836 proto_method_implemented(proto->sendpage),
3837 proto_method_implemented(proto->bind),
3838 proto_method_implemented(proto->backlog_rcv),
3839 proto_method_implemented(proto->hash),
3840 proto_method_implemented(proto->unhash),
3841 proto_method_implemented(proto->get_port),
3842 proto_method_implemented(proto->enter_memory_pressure));
3843 }
3844
3845 static int proto_seq_show(struct seq_file *seq, void *v)
3846 {
3847 if (v == &proto_list)
3848 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3849 "protocol",
3850 "size",
3851 "sockets",
3852 "memory",
3853 "press",
3854 "maxhdr",
3855 "slab",
3856 "module",
3857 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3858 else
3859 proto_seq_printf(seq, list_entry(v, struct proto, node));
3860 return 0;
3861 }
3862
3863 static const struct seq_operations proto_seq_ops = {
3864 .start = proto_seq_start,
3865 .next = proto_seq_next,
3866 .stop = proto_seq_stop,
3867 .show = proto_seq_show,
3868 };
3869
3870 static __net_init int proto_init_net(struct net *net)
3871 {
3872 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3873 sizeof(struct seq_net_private)))
3874 return -ENOMEM;
3875
3876 return 0;
3877 }
3878
3879 static __net_exit void proto_exit_net(struct net *net)
3880 {
3881 remove_proc_entry("protocols", net->proc_net);
3882 }
3883
3884
3885 static __net_initdata struct pernet_operations proto_net_ops = {
3886 .init = proto_init_net,
3887 .exit = proto_exit_net,
3888 };
3889
3890 static int __init proto_init(void)
3891 {
3892 return register_pernet_subsys(&proto_net_ops);
3893 }
3894
3895 subsys_initcall(proto_init);
3896
3897 #endif /* PROC_FS */
3898
3899 #ifdef CONFIG_NET_RX_BUSY_POLL
3900 bool sk_busy_loop_end(void *p, unsigned long start_time)
3901 {
3902 struct sock *sk = p;
3903
3904 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3905 sk_busy_loop_timeout(sk, start_time);
3906 }
3907 EXPORT_SYMBOL(sk_busy_loop_end);
3908 #endif /* CONFIG_NET_RX_BUSY_POLL */
3909
3910 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3911 {
3912 if (!sk->sk_prot->bind_add)
3913 return -EOPNOTSUPP;
3914 return sk->sk_prot->bind_add(sk, addr, addr_len);
3915 }
3916 EXPORT_SYMBOL(sock_bind_add);
3917