1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121
122 #include <linux/uaccess.h>
123
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142
143 #include <trace/events/sock.h>
144 #include <trace/hooks/sched.h>
145 #include <trace/hooks/net.h>
146
147 #include <net/tcp.h>
148 #include <net/busy_poll.h>
149 #include <net/phonet/phonet.h>
150
151 #include <linux/ethtool.h>
152
153 #include "dev.h"
154
155 static DEFINE_MUTEX(proto_list_mutex);
156 static LIST_HEAD(proto_list);
157
158 static void sock_def_write_space_wfree(struct sock *sk);
159 static void sock_def_write_space(struct sock *sk);
160
161 /**
162 * sk_ns_capable - General socket capability test
163 * @sk: Socket to use a capability on or through
164 * @user_ns: The user namespace of the capability to use
165 * @cap: The capability to use
166 *
167 * Test to see if the opener of the socket had the capability @cap when
168 * the socket was created and the current process has the capability @cap
169 * in the user namespace @user_ns.
170 */
171 bool sk_ns_capable(const struct sock *sk,
172 struct user_namespace *user_ns, int cap)
173 {
174 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 ns_capable(user_ns, cap);
176 }
177 EXPORT_SYMBOL(sk_ns_capable);
178
179 /**
180 * sk_capable - Socket global capability test
181 * @sk: Socket to use a capability on or through
182 * @cap: The global capability to use
183 *
184 * Test to see if the opener of the socket had the capability @cap when
185 * the socket was created and the current process has the capability @cap
186 * in all user namespaces.
187 */
188 bool sk_capable(const struct sock *sk, int cap)
189 {
190 return sk_ns_capable(sk, &init_user_ns, cap);
191 }
192 EXPORT_SYMBOL(sk_capable);
193
194 /**
195 * sk_net_capable - Network namespace socket capability test
196 * @sk: Socket to use a capability on or through
197 * @cap: The capability to use
198 *
199 * Test to see if the opener of the socket had the capability @cap when the
200 * socket was created and the current process has the capability @cap over
201 * the network namespace the socket is a member of.
202 */
203 bool sk_net_capable(const struct sock *sk, int cap)
204 {
205 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206 }
207 EXPORT_SYMBOL(sk_net_capable);
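
/*
 * Illustrative sketch (not part of the original file): a protocol handler
 * gating a privileged operation on the socket's own network namespace would
 * typically use one of the helpers above instead of a bare ns_capable(), e.g.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * so that both the opener's file credentials and the current task are checked.
 */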
208
209 /*
210 * Each address family might have different locking rules, so we have
211 * one slock key per address family and separate keys for internal and
212 * userspace sockets.
213 */
214 static struct lock_class_key af_family_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_keys[AF_MAX];
216 static struct lock_class_key af_family_slock_keys[AF_MAX];
217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218
219 /*
220 * Make lock validator output more readable. (we pre-construct these
221 * strings build-time, so that runtime initialization of socket
222 * locks is fast):
223 */
224
225 #define _sock_locks(x) \
226 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
227 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
228 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
229 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
230 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
231 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
232 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
233 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
234 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
235 x "27" , x "28" , x "AF_CAN" , \
236 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
237 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
238 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
239 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
240 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
241 x "AF_MCTP" , \
242 x "AF_MAX"
243
244 static const char *const af_family_key_strings[AF_MAX+1] = {
245 _sock_locks("sk_lock-")
246 };
247 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
248 _sock_locks("slock-")
249 };
250 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
251 _sock_locks("clock-")
252 };
253
254 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
255 _sock_locks("k-sk_lock-")
256 };
257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
258 _sock_locks("k-slock-")
259 };
260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
261 _sock_locks("k-clock-")
262 };
263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
264 _sock_locks("rlock-")
265 };
266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
267 _sock_locks("wlock-")
268 };
269 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
270 _sock_locks("elock-")
271 };
272
273 /*
274 * sk_callback_lock and sk queues locking rules are per-address-family,
275 * so split the lock classes by using a per-AF key:
276 */
277 static struct lock_class_key af_callback_keys[AF_MAX];
278 static struct lock_class_key af_rlock_keys[AF_MAX];
279 static struct lock_class_key af_wlock_keys[AF_MAX];
280 static struct lock_class_key af_elock_keys[AF_MAX];
281 static struct lock_class_key af_kern_callback_keys[AF_MAX];
282
283 /* Run time adjustable parameters. */
284 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
285 EXPORT_SYMBOL(sysctl_wmem_max);
286 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
287 EXPORT_SYMBOL(sysctl_rmem_max);
288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
290
291 int sysctl_tstamp_allow_data __read_mostly = 1;
292
293 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
294 EXPORT_SYMBOL_GPL(memalloc_socks_key);
295
296 /**
297 * sk_set_memalloc - sets %SOCK_MEMALLOC
298 * @sk: socket to set it on
299 *
300 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
301 * It's the responsibility of the admin to adjust min_free_kbytes
302 * to meet the requirements
303 */
304 void sk_set_memalloc(struct sock *sk)
305 {
306 sock_set_flag(sk, SOCK_MEMALLOC);
307 sk->sk_allocation |= __GFP_MEMALLOC;
308 static_branch_inc(&memalloc_socks_key);
309 }
310 EXPORT_SYMBOL_GPL(sk_set_memalloc);
311
312 void sk_clear_memalloc(struct sock *sk)
313 {
314 sock_reset_flag(sk, SOCK_MEMALLOC);
315 sk->sk_allocation &= ~__GFP_MEMALLOC;
316 static_branch_dec(&memalloc_socks_key);
317
318 /*
319 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
320 * progress of swapping. SOCK_MEMALLOC may be cleared while
321 * it has rmem allocations due to the last swapfile being deactivated
322 * but there is a risk that the socket is unusable due to exceeding
323 * the rmem limits. Reclaim the reserves and obey rmem limits again.
324 */
325 sk_mem_reclaim(sk);
326 }
327 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
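
/*
 * Minimal usage sketch, assuming a swap-over-network style caller outside
 * this file: the two helpers above bracket the lifetime of a socket that
 * must make forward progress under memory pressure, e.g.
 *
 *	sk_set_memalloc(sk);	// socket may now dip into emergency reserves
 *	...			// socket services swap I/O
 *	sk_clear_memalloc(sk);	// last swapfile gone, rmem limits apply again
 */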
328
329 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
330 {
331 int ret;
332 unsigned int noreclaim_flag;
333
334 /* these should have been dropped before queueing */
335 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
336
337 noreclaim_flag = memalloc_noreclaim_save();
338 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
339 tcp_v6_do_rcv,
340 tcp_v4_do_rcv,
341 sk, skb);
342 memalloc_noreclaim_restore(noreclaim_flag);
343
344 return ret;
345 }
346 EXPORT_SYMBOL(__sk_backlog_rcv);
347
348 void sk_error_report(struct sock *sk)
349 {
350 sk->sk_error_report(sk);
351
352 switch (sk->sk_family) {
353 case AF_INET:
354 fallthrough;
355 case AF_INET6:
356 trace_inet_sk_error_report(sk);
357 break;
358 default:
359 break;
360 }
361 }
362 EXPORT_SYMBOL(sk_error_report);
363
364 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
365 {
366 struct __kernel_sock_timeval tv;
367
368 if (timeo == MAX_SCHEDULE_TIMEOUT) {
369 tv.tv_sec = 0;
370 tv.tv_usec = 0;
371 } else {
372 tv.tv_sec = timeo / HZ;
373 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
374 }
375
376 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
377 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
378 *(struct old_timeval32 *)optval = tv32;
379 return sizeof(tv32);
380 }
381
382 if (old_timeval) {
383 struct __kernel_old_timeval old_tv;
384 old_tv.tv_sec = tv.tv_sec;
385 old_tv.tv_usec = tv.tv_usec;
386 *(struct __kernel_old_timeval *)optval = old_tv;
387 return sizeof(old_tv);
388 }
389
390 *(struct __kernel_sock_timeval *)optval = tv;
391 return sizeof(tv);
392 }
393 EXPORT_SYMBOL(sock_get_timeout);
394
395 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
396 sockptr_t optval, int optlen, bool old_timeval)
397 {
398 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
399 struct old_timeval32 tv32;
400
401 if (optlen < sizeof(tv32))
402 return -EINVAL;
403
404 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
405 return -EFAULT;
406 tv->tv_sec = tv32.tv_sec;
407 tv->tv_usec = tv32.tv_usec;
408 } else if (old_timeval) {
409 struct __kernel_old_timeval old_tv;
410
411 if (optlen < sizeof(old_tv))
412 return -EINVAL;
413 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
414 return -EFAULT;
415 tv->tv_sec = old_tv.tv_sec;
416 tv->tv_usec = old_tv.tv_usec;
417 } else {
418 if (optlen < sizeof(*tv))
419 return -EINVAL;
420 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
421 return -EFAULT;
422 }
423
424 return 0;
425 }
426 EXPORT_SYMBOL(sock_copy_user_timeval);
427
428 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
429 bool old_timeval)
430 {
431 struct __kernel_sock_timeval tv;
432 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
433 long val;
434
435 if (err)
436 return err;
437
438 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
439 return -EDOM;
440
441 if (tv.tv_sec < 0) {
442 static int warned __read_mostly;
443
444 WRITE_ONCE(*timeo_p, 0);
445 if (warned < 10 && net_ratelimit()) {
446 warned++;
447 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
448 __func__, current->comm, task_pid_nr(current));
449 }
450 return 0;
451 }
452 val = MAX_SCHEDULE_TIMEOUT;
453 if ((tv.tv_sec || tv.tv_usec) &&
454 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
455 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
456 USEC_PER_SEC / HZ);
457 WRITE_ONCE(*timeo_p, val);
458 return 0;
459 }
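
/*
 * Worked example of the conversion above (illustrative only): with HZ == 100,
 * a user supplied { .tv_sec = 1, .tv_usec = 5 } becomes
 *
 *	1 * HZ + DIV_ROUND_UP(5, USEC_PER_SEC / HZ) = 100 + 1 = 101 jiffies
 *
 * i.e. any sub-tick residue rounds the timeout up rather than truncating it
 * to zero.
 */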
460
461 static bool sock_needs_netstamp(const struct sock *sk)
462 {
463 switch (sk->sk_family) {
464 case AF_UNSPEC:
465 case AF_UNIX:
466 return false;
467 default:
468 return true;
469 }
470 }
471
472 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
473 {
474 if (sk->sk_flags & flags) {
475 sk->sk_flags &= ~flags;
476 if (sock_needs_netstamp(sk) &&
477 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
478 net_disable_timestamp();
479 }
480 }
481
482
483 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
484 {
485 unsigned long flags;
486 struct sk_buff_head *list = &sk->sk_receive_queue;
487
488 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
489 atomic_inc(&sk->sk_drops);
490 trace_sock_rcvqueue_full(sk, skb);
491 return -ENOMEM;
492 }
493
494 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
495 atomic_inc(&sk->sk_drops);
496 return -ENOBUFS;
497 }
498
499 skb->dev = NULL;
500 skb_set_owner_r(skb, sk);
501
502 /* We escape from an RCU protected region, so make sure we don't leak
503 * a non-refcounted dst.
504 */
505 skb_dst_force(skb);
506
507 spin_lock_irqsave(&list->lock, flags);
508 sock_skb_set_dropcount(sk, skb);
509 __skb_queue_tail(list, skb);
510 spin_unlock_irqrestore(&list->lock, flags);
511
512 if (!sock_flag(sk, SOCK_DEAD))
513 sk->sk_data_ready(sk);
514 return 0;
515 }
516 EXPORT_SYMBOL(__sock_queue_rcv_skb);
517
518 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
519 enum skb_drop_reason *reason)
520 {
521 enum skb_drop_reason drop_reason;
522 int err;
523
524 err = sk_filter(sk, skb);
525 if (err) {
526 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
527 goto out;
528 }
529 err = __sock_queue_rcv_skb(sk, skb);
530 switch (err) {
531 case -ENOMEM:
532 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
533 break;
534 case -ENOBUFS:
535 drop_reason = SKB_DROP_REASON_PROTO_MEM;
536 break;
537 default:
538 drop_reason = SKB_NOT_DROPPED_YET;
539 break;
540 }
541 out:
542 if (reason)
543 *reason = drop_reason;
544 return err;
545 }
546 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
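
/*
 * Sketch of the intended caller pattern (hypothetical, not taken from this
 * file): protocols that account for drops pass a reason pointer and feed it
 * back to kfree_skb_reason() when queueing fails:
 *
 *	enum skb_drop_reason reason;
 *
 *	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
 *		kfree_skb_reason(skb, reason);
 */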
547
548 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
549 const int nested, unsigned int trim_cap, bool refcounted)
550 {
551 int rc = NET_RX_SUCCESS;
552
553 if (sk_filter_trim_cap(sk, skb, trim_cap))
554 goto discard_and_relse;
555
556 skb->dev = NULL;
557
558 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
559 atomic_inc(&sk->sk_drops);
560 goto discard_and_relse;
561 }
562 if (nested)
563 bh_lock_sock_nested(sk);
564 else
565 bh_lock_sock(sk);
566 if (!sock_owned_by_user(sk)) {
567 /*
568 * trylock + unlock semantics:
569 */
570 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
571
572 rc = sk_backlog_rcv(sk, skb);
573
574 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
575 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
576 bh_unlock_sock(sk);
577 atomic_inc(&sk->sk_drops);
578 goto discard_and_relse;
579 }
580
581 bh_unlock_sock(sk);
582 out:
583 if (refcounted)
584 sock_put(sk);
585 return rc;
586 discard_and_relse:
587 kfree_skb(skb);
588 goto out;
589 }
590 EXPORT_SYMBOL(__sk_receive_skb);
591
592 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
593 u32));
594 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
595 u32));
596 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
597 {
598 struct dst_entry *dst = __sk_dst_get(sk);
599
600 if (dst && dst->obsolete &&
601 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
602 dst, cookie) == NULL) {
603 sk_tx_queue_clear(sk);
604 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
605 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
606 dst_release(dst);
607 return NULL;
608 }
609
610 return dst;
611 }
612 EXPORT_SYMBOL(__sk_dst_check);
613
614 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
615 {
616 struct dst_entry *dst = sk_dst_get(sk);
617
618 if (dst && dst->obsolete &&
619 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
620 dst, cookie) == NULL) {
621 sk_dst_reset(sk);
622 dst_release(dst);
623 return NULL;
624 }
625
626 return dst;
627 }
628 EXPORT_SYMBOL(sk_dst_check);
629
630 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
631 {
632 int ret = -ENOPROTOOPT;
633 #ifdef CONFIG_NETDEVICES
634 struct net *net = sock_net(sk);
635
636 /* Sorry... */
637 ret = -EPERM;
638 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
639 goto out;
640
641 ret = -EINVAL;
642 if (ifindex < 0)
643 goto out;
644
645 /* Paired with all READ_ONCE() done locklessly. */
646 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
647
648 if (sk->sk_prot->rehash)
649 sk->sk_prot->rehash(sk);
650 sk_dst_reset(sk);
651
652 ret = 0;
653
654 out:
655 #endif
656
657 return ret;
658 }
659
660 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
661 {
662 int ret;
663
664 if (lock_sk)
665 lock_sock(sk);
666 ret = sock_bindtoindex_locked(sk, ifindex);
667 if (lock_sk)
668 release_sock(sk);
669
670 return ret;
671 }
672 EXPORT_SYMBOL(sock_bindtoindex);
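
/*
 * Usage sketch (illustrative): a caller that does not already hold the socket
 * lock lets the helper take it, e.g.
 *
 *	err = sock_bindtoindex(sk, dev->ifindex, true);
 *	if (err)
 *		return err;
 *
 * while callers that already run under lock_sock() pass lock_sk == false.
 */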
673
674 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
675 {
676 int ret = -ENOPROTOOPT;
677 #ifdef CONFIG_NETDEVICES
678 struct net *net = sock_net(sk);
679 char devname[IFNAMSIZ];
680 int index;
681
682 ret = -EINVAL;
683 if (optlen < 0)
684 goto out;
685
686 /* Bind this socket to a particular device like "eth0",
687 * as specified in the passed interface name. If the
688 * name is "" or the option length is zero the socket
689 * is not bound.
690 */
691 if (optlen > IFNAMSIZ - 1)
692 optlen = IFNAMSIZ - 1;
693 memset(devname, 0, sizeof(devname));
694
695 ret = -EFAULT;
696 if (copy_from_sockptr(devname, optval, optlen))
697 goto out;
698
699 index = 0;
700 if (devname[0] != '\0') {
701 struct net_device *dev;
702
703 rcu_read_lock();
704 dev = dev_get_by_name_rcu(net, devname);
705 if (dev)
706 index = dev->ifindex;
707 rcu_read_unlock();
708 ret = -ENODEV;
709 if (!dev)
710 goto out;
711 }
712
713 sockopt_lock_sock(sk);
714 ret = sock_bindtoindex_locked(sk, index);
715 sockopt_release_sock(sk);
716 out:
717 #endif
718
719 return ret;
720 }
721
722 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
723 sockptr_t optlen, int len)
724 {
725 int ret = -ENOPROTOOPT;
726 #ifdef CONFIG_NETDEVICES
727 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
728 struct net *net = sock_net(sk);
729 char devname[IFNAMSIZ];
730
731 if (bound_dev_if == 0) {
732 len = 0;
733 goto zero;
734 }
735
736 ret = -EINVAL;
737 if (len < IFNAMSIZ)
738 goto out;
739
740 ret = netdev_get_name(net, devname, bound_dev_if);
741 if (ret)
742 goto out;
743
744 len = strlen(devname) + 1;
745
746 ret = -EFAULT;
747 if (copy_to_sockptr(optval, devname, len))
748 goto out;
749
750 zero:
751 ret = -EFAULT;
752 if (copy_to_sockptr(optlen, &len, sizeof(int)))
753 goto out;
754
755 ret = 0;
756
757 out:
758 #endif
759
760 return ret;
761 }
762
763 bool sk_mc_loop(const struct sock *sk)
764 {
765 if (dev_recursion_level())
766 return false;
767 if (!sk)
768 return true;
769 /* IPV6_ADDRFORM can change sk->sk_family under us. */
770 switch (READ_ONCE(sk->sk_family)) {
771 case AF_INET:
772 return inet_test_bit(MC_LOOP, sk);
773 #if IS_ENABLED(CONFIG_IPV6)
774 case AF_INET6:
775 return inet6_test_bit(MC6_LOOP, sk);
776 #endif
777 }
778 WARN_ON_ONCE(1);
779 return true;
780 }
781 EXPORT_SYMBOL(sk_mc_loop);
782
783 void sock_set_reuseaddr(struct sock *sk)
784 {
785 lock_sock(sk);
786 sk->sk_reuse = SK_CAN_REUSE;
787 release_sock(sk);
788 }
789 EXPORT_SYMBOL(sock_set_reuseaddr);
790
791 void sock_set_reuseport(struct sock *sk)
792 {
793 lock_sock(sk);
794 sk->sk_reuseport = true;
795 release_sock(sk);
796 }
797 EXPORT_SYMBOL(sock_set_reuseport);
798
799 void sock_no_linger(struct sock *sk)
800 {
801 lock_sock(sk);
802 WRITE_ONCE(sk->sk_lingertime, 0);
803 sock_set_flag(sk, SOCK_LINGER);
804 release_sock(sk);
805 }
806 EXPORT_SYMBOL(sock_no_linger);
807
808 void sock_set_priority(struct sock *sk, u32 priority)
809 {
810 WRITE_ONCE(sk->sk_priority, priority);
811 }
812 EXPORT_SYMBOL(sock_set_priority);
813
814 void sock_set_sndtimeo(struct sock *sk, s64 secs)
815 {
816 lock_sock(sk);
817 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
818 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
819 else
820 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
821 release_sock(sk);
822 }
823 EXPORT_SYMBOL(sock_set_sndtimeo);
824
825 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
826 {
827 if (val) {
828 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
829 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
830 sock_set_flag(sk, SOCK_RCVTSTAMP);
831 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
832 } else {
833 sock_reset_flag(sk, SOCK_RCVTSTAMP);
834 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
835 }
836 }
837
838 void sock_enable_timestamps(struct sock *sk)
839 {
840 lock_sock(sk);
841 __sock_set_timestamps(sk, true, false, true);
842 release_sock(sk);
843 }
844 EXPORT_SYMBOL(sock_enable_timestamps);
845
846 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
847 {
848 switch (optname) {
849 case SO_TIMESTAMP_OLD:
850 __sock_set_timestamps(sk, valbool, false, false);
851 break;
852 case SO_TIMESTAMP_NEW:
853 __sock_set_timestamps(sk, valbool, true, false);
854 break;
855 case SO_TIMESTAMPNS_OLD:
856 __sock_set_timestamps(sk, valbool, false, true);
857 break;
858 case SO_TIMESTAMPNS_NEW:
859 __sock_set_timestamps(sk, valbool, true, true);
860 break;
861 }
862 }
863
864 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
865 {
866 struct net *net = sock_net(sk);
867 struct net_device *dev = NULL;
868 bool match = false;
869 int *vclock_index;
870 int i, num;
871
872 if (sk->sk_bound_dev_if)
873 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
874
875 if (!dev) {
876 pr_err("%s: socket not bound to a device\n", __func__);
877 return -EOPNOTSUPP;
878 }
879
880 num = ethtool_get_phc_vclocks(dev, &vclock_index);
881 dev_put(dev);
882
883 for (i = 0; i < num; i++) {
884 if (*(vclock_index + i) == phc_index) {
885 match = true;
886 break;
887 }
888 }
889
890 if (num > 0)
891 kfree(vclock_index);
892
893 if (!match)
894 return -EINVAL;
895
896 WRITE_ONCE(sk->sk_bind_phc, phc_index);
897
898 return 0;
899 }
900
901 int sock_set_timestamping(struct sock *sk, int optname,
902 struct so_timestamping timestamping)
903 {
904 int val = timestamping.flags;
905 int ret;
906
907 if (val & ~SOF_TIMESTAMPING_MASK)
908 return -EINVAL;
909
910 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
911 !(val & SOF_TIMESTAMPING_OPT_ID))
912 return -EINVAL;
913
914 if (val & SOF_TIMESTAMPING_OPT_ID &&
915 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
916 if (sk_is_tcp(sk)) {
917 if ((1 << sk->sk_state) &
918 (TCPF_CLOSE | TCPF_LISTEN))
919 return -EINVAL;
920 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
921 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
922 else
923 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
924 } else {
925 atomic_set(&sk->sk_tskey, 0);
926 }
927 }
928
929 if (val & SOF_TIMESTAMPING_OPT_STATS &&
930 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
931 return -EINVAL;
932
933 if (val & SOF_TIMESTAMPING_BIND_PHC) {
934 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
935 if (ret)
936 return ret;
937 }
938
939 WRITE_ONCE(sk->sk_tsflags, val);
940 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
941
942 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
943 sock_enable_timestamp(sk,
944 SOCK_TIMESTAMPING_RX_SOFTWARE);
945 else
946 sock_disable_timestamp(sk,
947 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
948 return 0;
949 }
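
/*
 * Minimal userspace sketch of the argument this helper consumes (the values
 * shown are only an example, not the single valid combination):
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE |
 *			 SOF_TIMESTAMPING_OPT_ID,
 *		.bind_phc = -1,	// only read when SOF_TIMESTAMPING_BIND_PHC is set
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * A plain int holding just the flags is also accepted, see the
 * SO_TIMESTAMPING_OLD/NEW handling in sk_setsockopt() below.
 */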
950
951 void sock_set_keepalive(struct sock *sk)
952 {
953 lock_sock(sk);
954 if (sk->sk_prot->keepalive)
955 sk->sk_prot->keepalive(sk, true);
956 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
957 release_sock(sk);
958 }
959 EXPORT_SYMBOL(sock_set_keepalive);
960
961 static void __sock_set_rcvbuf(struct sock *sk, int val)
962 {
963 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
964 * as a negative value.
965 */
966 val = min_t(int, val, INT_MAX / 2);
967 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
968
969 /* We double it on the way in to account for "struct sk_buff" etc.
970 * overhead. Applications assume that the SO_RCVBUF setting they make
971 * will allow that much actual data to be received on that socket.
972 *
973 * Applications are unaware that "struct sk_buff" and other overheads
974 * allocate from the receive buffer during socket buffer allocation.
975 *
976 * And after considering the possible alternatives, returning the value
977 * we actually used in getsockopt is the most desirable behavior.
978 */
979 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
980 }
981
982 void sock_set_rcvbuf(struct sock *sk, int val)
983 {
984 lock_sock(sk);
985 __sock_set_rcvbuf(sk, val);
986 release_sock(sk);
987 }
988 EXPORT_SYMBOL(sock_set_rcvbuf);
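
/*
 * Worked example of the doubling above (illustrative, assuming sysctl
 * rmem_max allows it): an application calling
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){ 65536 }, sizeof(int));
 *
 * ends up with sk->sk_rcvbuf == 131072, and getsockopt(SO_RCVBUF) reports the
 * doubled value back; the extra half covers struct sk_buff and bookkeeping
 * overhead rather than payload.
 */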
989
990 static void __sock_set_mark(struct sock *sk, u32 val)
991 {
992 if (val != sk->sk_mark) {
993 WRITE_ONCE(sk->sk_mark, val);
994 sk_dst_reset(sk);
995 }
996 }
997
998 void sock_set_mark(struct sock *sk, u32 val)
999 {
1000 lock_sock(sk);
1001 __sock_set_mark(sk, val);
1002 release_sock(sk);
1003 }
1004 EXPORT_SYMBOL(sock_set_mark);
1005
1006 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1007 {
1008 /* Round down bytes to multiple of pages */
1009 bytes = round_down(bytes, PAGE_SIZE);
1010
1011 WARN_ON(bytes > sk->sk_reserved_mem);
1012 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1013 sk_mem_reclaim(sk);
1014 }
1015
1016 static int sock_reserve_memory(struct sock *sk, int bytes)
1017 {
1018 long allocated;
1019 bool charged;
1020 int pages;
1021
1022 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1023 return -EOPNOTSUPP;
1024
1025 if (!bytes)
1026 return 0;
1027
1028 pages = sk_mem_pages(bytes);
1029
1030 /* pre-charge to memcg */
1031 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1032 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1033 if (!charged)
1034 return -ENOMEM;
1035
1036 /* pre-charge to forward_alloc */
1037 sk_memory_allocated_add(sk, pages);
1038 allocated = sk_memory_allocated(sk);
1039 /* If the system goes into memory pressure with this
1040 * precharge, give up and return error.
1041 */
1042 if (allocated > sk_prot_mem_limits(sk, 1)) {
1043 sk_memory_allocated_sub(sk, pages);
1044 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1045 return -ENOMEM;
1046 }
1047 sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1048
1049 WRITE_ONCE(sk->sk_reserved_mem,
1050 sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1051
1052 return 0;
1053 }
1054
1055 #ifdef CONFIG_PAGE_POOL
1056
1057 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1058 * in 1 syscall. The limit exists to bound the amount of memory the kernel
1059 * allocates to copy these tokens, and to prevent looping over the frags for
1060 * too long.
1061 */
1062 #define MAX_DONTNEED_TOKENS 128
1063 #define MAX_DONTNEED_FRAGS 1024
1064
1065 static noinline_for_stack int
1066 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1067 {
1068 unsigned int num_tokens, i, j, k, netmem_num = 0;
1069 struct dmabuf_token *tokens;
1070 int ret = 0, num_frags = 0;
1071 netmem_ref netmems[16];
1072
1073 if (!sk_is_tcp(sk))
1074 return -EBADF;
1075
1076 if (optlen % sizeof(*tokens) ||
1077 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1078 return -EINVAL;
1079
1080 num_tokens = optlen / sizeof(*tokens);
1081 tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1082 if (!tokens)
1083 return -ENOMEM;
1084
1085 if (copy_from_sockptr(tokens, optval, optlen)) {
1086 kvfree(tokens);
1087 return -EFAULT;
1088 }
1089
1090 xa_lock_bh(&sk->sk_user_frags);
1091 for (i = 0; i < num_tokens; i++) {
1092 for (j = 0; j < tokens[i].token_count; j++) {
1093 if (++num_frags > MAX_DONTNEED_FRAGS)
1094 goto frag_limit_reached;
1095
1096 netmem_ref netmem = (__force netmem_ref)__xa_erase(
1097 &sk->sk_user_frags, tokens[i].token_start + j);
1098
1099 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1100 continue;
1101
1102 netmems[netmem_num++] = netmem;
1103 if (netmem_num == ARRAY_SIZE(netmems)) {
1104 xa_unlock_bh(&sk->sk_user_frags);
1105 for (k = 0; k < netmem_num; k++)
1106 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1107 netmem_num = 0;
1108 xa_lock_bh(&sk->sk_user_frags);
1109 }
1110 ret++;
1111 }
1112 }
1113
1114 frag_limit_reached:
1115 xa_unlock_bh(&sk->sk_user_frags);
1116 for (k = 0; k < netmem_num; k++)
1117 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1118
1119 kvfree(tokens);
1120 return ret;
1121 }
1122 #endif
1123
1124 void sockopt_lock_sock(struct sock *sk)
1125 {
1126 /* When current->bpf_ctx is set, the setsockopt is called from
1127 * a bpf prog. bpf has ensured the sk lock has been
1128 * acquired before calling setsockopt().
1129 */
1130 if (has_current_bpf_ctx())
1131 return;
1132
1133 lock_sock(sk);
1134 }
1135 EXPORT_SYMBOL(sockopt_lock_sock);
1136
1137 void sockopt_release_sock(struct sock *sk)
1138 {
1139 if (has_current_bpf_ctx())
1140 return;
1141
1142 release_sock(sk);
1143 }
1144 EXPORT_SYMBOL(sockopt_release_sock);
1145
1146 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1147 {
1148 return has_current_bpf_ctx() || ns_capable(ns, cap);
1149 }
1150 EXPORT_SYMBOL(sockopt_ns_capable);
1151
1152 bool sockopt_capable(int cap)
1153 {
1154 return has_current_bpf_ctx() || capable(cap);
1155 }
1156 EXPORT_SYMBOL(sockopt_capable);
1157
1158 static int sockopt_validate_clockid(__kernel_clockid_t value)
1159 {
1160 switch (value) {
1161 case CLOCK_REALTIME:
1162 case CLOCK_MONOTONIC:
1163 case CLOCK_TAI:
1164 return 0;
1165 }
1166 return -EINVAL;
1167 }
1168
1169 /*
1170 * This is meant for all protocols to use and covers goings on
1171 * at the socket level. Everything here is generic.
1172 */
1173
1174 int sk_setsockopt(struct sock *sk, int level, int optname,
1175 sockptr_t optval, unsigned int optlen)
1176 {
1177 struct so_timestamping timestamping;
1178 struct socket *sock = sk->sk_socket;
1179 struct sock_txtime sk_txtime;
1180 int val;
1181 int valbool;
1182 struct linger ling;
1183 int ret = 0;
1184
1185 /*
1186 * Options without arguments
1187 */
1188
1189 if (optname == SO_BINDTODEVICE)
1190 return sock_setbindtodevice(sk, optval, optlen);
1191
1192 if (optlen < sizeof(int))
1193 return -EINVAL;
1194
1195 if (copy_from_sockptr(&val, optval, sizeof(val)))
1196 return -EFAULT;
1197
1198 valbool = val ? 1 : 0;
1199
1200 /* handle options which do not require locking the socket. */
1201 switch (optname) {
1202 case SO_PRIORITY:
1203 if ((val >= 0 && val <= 6) ||
1204 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1205 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1206 sock_set_priority(sk, val);
1207 return 0;
1208 }
1209 return -EPERM;
1210 case SO_PASSSEC:
1211 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1212 return 0;
1213 case SO_PASSCRED:
1214 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1215 return 0;
1216 case SO_PASSPIDFD:
1217 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1218 return 0;
1219 case SO_TYPE:
1220 case SO_PROTOCOL:
1221 case SO_DOMAIN:
1222 case SO_ERROR:
1223 return -ENOPROTOOPT;
1224 #ifdef CONFIG_NET_RX_BUSY_POLL
1225 case SO_BUSY_POLL:
1226 if (val < 0)
1227 return -EINVAL;
1228 WRITE_ONCE(sk->sk_ll_usec, val);
1229 return 0;
1230 case SO_PREFER_BUSY_POLL:
1231 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1232 return -EPERM;
1233 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1234 return 0;
1235 case SO_BUSY_POLL_BUDGET:
1236 if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1237 !sockopt_capable(CAP_NET_ADMIN))
1238 return -EPERM;
1239 if (val < 0 || val > U16_MAX)
1240 return -EINVAL;
1241 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1242 return 0;
1243 #endif
1244 case SO_MAX_PACING_RATE:
1245 {
1246 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1247 unsigned long pacing_rate;
1248
1249 if (sizeof(ulval) != sizeof(val) &&
1250 optlen >= sizeof(ulval) &&
1251 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1252 return -EFAULT;
1253 }
1254 if (ulval != ~0UL)
1255 cmpxchg(&sk->sk_pacing_status,
1256 SK_PACING_NONE,
1257 SK_PACING_NEEDED);
1258 /* Pairs with READ_ONCE() from sk_getsockopt() */
1259 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1260 pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1261 if (ulval < pacing_rate)
1262 WRITE_ONCE(sk->sk_pacing_rate, ulval);
1263 return 0;
1264 }
1265 case SO_TXREHASH:
1266 if (val < -1 || val > 1)
1267 return -EINVAL;
1268 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1269 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1270 /* Paired with READ_ONCE() in tcp_rtx_synack()
1271 * and sk_getsockopt().
1272 */
1273 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1274 return 0;
1275 case SO_PEEK_OFF:
1276 {
1277 int (*set_peek_off)(struct sock *sk, int val);
1278
1279 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1280 if (set_peek_off)
1281 ret = set_peek_off(sk, val);
1282 else
1283 ret = -EOPNOTSUPP;
1284 return ret;
1285 }
1286 #ifdef CONFIG_PAGE_POOL
1287 case SO_DEVMEM_DONTNEED:
1288 return sock_devmem_dontneed(sk, optval, optlen);
1289 #endif
1290 }
1291
1292 sockopt_lock_sock(sk);
1293
1294 switch (optname) {
1295 case SO_DEBUG:
1296 if (val && !sockopt_capable(CAP_NET_ADMIN))
1297 ret = -EACCES;
1298 else
1299 sock_valbool_flag(sk, SOCK_DBG, valbool);
1300 break;
1301 case SO_REUSEADDR:
1302 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1303 break;
1304 case SO_REUSEPORT:
1305 if (valbool && !sk_is_inet(sk))
1306 ret = -EOPNOTSUPP;
1307 else
1308 sk->sk_reuseport = valbool;
1309 break;
1310 case SO_DONTROUTE:
1311 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1312 sk_dst_reset(sk);
1313 break;
1314 case SO_BROADCAST:
1315 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1316 break;
1317 case SO_SNDBUF:
1318 /* Don't error on this; BSD doesn't, and if you think
1319 * about it this is right. Otherwise apps have to
1320 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1321 * are treated in BSD as hints.
1322 */
1323 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1324 set_sndbuf:
1325 /* Ensure val * 2 fits into an int, to prevent max_t()
1326 * from treating it as a negative value.
1327 */
1328 val = min_t(int, val, INT_MAX / 2);
1329 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1330 WRITE_ONCE(sk->sk_sndbuf,
1331 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1332 /* Wake up sending tasks if we upped the value. */
1333 sk->sk_write_space(sk);
1334 break;
1335
1336 case SO_SNDBUFFORCE:
1337 if (!sockopt_capable(CAP_NET_ADMIN)) {
1338 ret = -EPERM;
1339 break;
1340 }
1341
1342 /* No negative values (to prevent underflow, as val will be
1343 * multiplied by 2).
1344 */
1345 if (val < 0)
1346 val = 0;
1347 goto set_sndbuf;
1348
1349 case SO_RCVBUF:
1350 /* Don't error on this; BSD doesn't, and if you think
1351 * about it this is right. Otherwise apps have to
1352 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1353 * are treated in BSD as hints.
1354 */
1355 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1356 break;
1357
1358 case SO_RCVBUFFORCE:
1359 if (!sockopt_capable(CAP_NET_ADMIN)) {
1360 ret = -EPERM;
1361 break;
1362 }
1363
1364 /* No negative values (to prevent underflow, as val will be
1365 * multiplied by 2).
1366 */
1367 __sock_set_rcvbuf(sk, max(val, 0));
1368 break;
1369
1370 case SO_KEEPALIVE:
1371 if (sk->sk_prot->keepalive)
1372 sk->sk_prot->keepalive(sk, valbool);
1373 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1374 break;
1375
1376 case SO_OOBINLINE:
1377 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1378 break;
1379
1380 case SO_NO_CHECK:
1381 sk->sk_no_check_tx = valbool;
1382 break;
1383
1384 case SO_LINGER:
1385 if (optlen < sizeof(ling)) {
1386 ret = -EINVAL; /* 1003.1g */
1387 break;
1388 }
1389 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1390 ret = -EFAULT;
1391 break;
1392 }
1393 if (!ling.l_onoff) {
1394 sock_reset_flag(sk, SOCK_LINGER);
1395 } else {
1396 unsigned long t_sec = ling.l_linger;
1397
1398 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1399 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1400 else
1401 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1402 sock_set_flag(sk, SOCK_LINGER);
1403 }
1404 break;
1405
1406 case SO_BSDCOMPAT:
1407 break;
1408
1409 case SO_TIMESTAMP_OLD:
1410 case SO_TIMESTAMP_NEW:
1411 case SO_TIMESTAMPNS_OLD:
1412 case SO_TIMESTAMPNS_NEW:
1413 sock_set_timestamp(sk, optname, valbool);
1414 break;
1415
1416 case SO_TIMESTAMPING_NEW:
1417 case SO_TIMESTAMPING_OLD:
1418 if (optlen == sizeof(timestamping)) {
1419 if (copy_from_sockptr(&timestamping, optval,
1420 sizeof(timestamping))) {
1421 ret = -EFAULT;
1422 break;
1423 }
1424 } else {
1425 memset(&timestamping, 0, sizeof(timestamping));
1426 timestamping.flags = val;
1427 }
1428 ret = sock_set_timestamping(sk, optname, timestamping);
1429 break;
1430
1431 case SO_RCVLOWAT:
1432 {
1433 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1434
1435 if (val < 0)
1436 val = INT_MAX;
1437 if (sock)
1438 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1439 if (set_rcvlowat)
1440 ret = set_rcvlowat(sk, val);
1441 else
1442 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1443 break;
1444 }
1445 case SO_RCVTIMEO_OLD:
1446 case SO_RCVTIMEO_NEW:
1447 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1448 optlen, optname == SO_RCVTIMEO_OLD);
1449 break;
1450
1451 case SO_SNDTIMEO_OLD:
1452 case SO_SNDTIMEO_NEW:
1453 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1454 optlen, optname == SO_SNDTIMEO_OLD);
1455 break;
1456
1457 case SO_ATTACH_FILTER: {
1458 struct sock_fprog fprog;
1459
1460 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1461 if (!ret)
1462 ret = sk_attach_filter(&fprog, sk);
1463 break;
1464 }
1465 case SO_ATTACH_BPF:
1466 ret = -EINVAL;
1467 if (optlen == sizeof(u32)) {
1468 u32 ufd;
1469
1470 ret = -EFAULT;
1471 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1472 break;
1473
1474 ret = sk_attach_bpf(ufd, sk);
1475 }
1476 break;
1477
1478 case SO_ATTACH_REUSEPORT_CBPF: {
1479 struct sock_fprog fprog;
1480
1481 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1482 if (!ret)
1483 ret = sk_reuseport_attach_filter(&fprog, sk);
1484 break;
1485 }
1486 case SO_ATTACH_REUSEPORT_EBPF:
1487 ret = -EINVAL;
1488 if (optlen == sizeof(u32)) {
1489 u32 ufd;
1490
1491 ret = -EFAULT;
1492 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1493 break;
1494
1495 ret = sk_reuseport_attach_bpf(ufd, sk);
1496 }
1497 break;
1498
1499 case SO_DETACH_REUSEPORT_BPF:
1500 ret = reuseport_detach_prog(sk);
1501 break;
1502
1503 case SO_DETACH_FILTER:
1504 ret = sk_detach_filter(sk);
1505 break;
1506
1507 case SO_LOCK_FILTER:
1508 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1509 ret = -EPERM;
1510 else
1511 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1512 break;
1513
1514 case SO_MARK:
1515 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1516 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1517 ret = -EPERM;
1518 break;
1519 }
1520
1521 __sock_set_mark(sk, val);
1522 break;
1523 case SO_RCVMARK:
1524 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1525 break;
1526
1527 case SO_RXQ_OVFL:
1528 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1529 break;
1530
1531 case SO_WIFI_STATUS:
1532 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1533 break;
1534
1535 case SO_NOFCS:
1536 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1537 break;
1538
1539 case SO_SELECT_ERR_QUEUE:
1540 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1541 break;
1542
1543
1544 case SO_INCOMING_CPU:
1545 reuseport_update_incoming_cpu(sk, val);
1546 break;
1547
1548 case SO_CNX_ADVICE:
1549 if (val == 1)
1550 dst_negative_advice(sk);
1551 break;
1552
1553 case SO_ZEROCOPY:
1554 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1555 if (!(sk_is_tcp(sk) ||
1556 (sk->sk_type == SOCK_DGRAM &&
1557 sk->sk_protocol == IPPROTO_UDP)))
1558 ret = -EOPNOTSUPP;
1559 } else if (sk->sk_family != PF_RDS) {
1560 ret = -EOPNOTSUPP;
1561 }
1562 if (!ret) {
1563 if (val < 0 || val > 1)
1564 ret = -EINVAL;
1565 else
1566 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1567 }
1568 break;
1569
1570 case SO_TXTIME:
1571 if (optlen != sizeof(struct sock_txtime)) {
1572 ret = -EINVAL;
1573 break;
1574 } else if (copy_from_sockptr(&sk_txtime, optval,
1575 sizeof(struct sock_txtime))) {
1576 ret = -EFAULT;
1577 break;
1578 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1579 ret = -EINVAL;
1580 break;
1581 }
1582 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1583 * scheduler has enough safeguards.
1584 */
1585 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1586 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1587 ret = -EPERM;
1588 break;
1589 }
1590
1591 ret = sockopt_validate_clockid(sk_txtime.clockid);
1592 if (ret)
1593 break;
1594
1595 sock_valbool_flag(sk, SOCK_TXTIME, true);
1596 sk->sk_clockid = sk_txtime.clockid;
1597 sk->sk_txtime_deadline_mode =
1598 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1599 sk->sk_txtime_report_errors =
1600 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1601 break;
1602
1603 case SO_BINDTOIFINDEX:
1604 ret = sock_bindtoindex_locked(sk, val);
1605 break;
1606
1607 case SO_BUF_LOCK:
1608 if (val & ~SOCK_BUF_LOCK_MASK) {
1609 ret = -EINVAL;
1610 break;
1611 }
1612 sk->sk_userlocks = val | (sk->sk_userlocks &
1613 ~SOCK_BUF_LOCK_MASK);
1614 break;
1615
1616 case SO_RESERVE_MEM:
1617 {
1618 int delta;
1619
1620 if (val < 0) {
1621 ret = -EINVAL;
1622 break;
1623 }
1624
1625 delta = val - sk->sk_reserved_mem;
1626 if (delta < 0)
1627 sock_release_reserved_memory(sk, -delta);
1628 else
1629 ret = sock_reserve_memory(sk, delta);
1630 break;
1631 }
1632
1633 default:
1634 ret = -ENOPROTOOPT;
1635 break;
1636 }
1637 sockopt_release_sock(sk);
1638 return ret;
1639 }
1640
1641 int sock_setsockopt(struct socket *sock, int level, int optname,
1642 sockptr_t optval, unsigned int optlen)
1643 {
1644 return sk_setsockopt(sock->sk, level, optname,
1645 optval, optlen);
1646 }
1647 EXPORT_SYMBOL(sock_setsockopt);
1648
1649 static const struct cred *sk_get_peer_cred(struct sock *sk)
1650 {
1651 const struct cred *cred;
1652
1653 spin_lock(&sk->sk_peer_lock);
1654 cred = get_cred(sk->sk_peer_cred);
1655 spin_unlock(&sk->sk_peer_lock);
1656
1657 return cred;
1658 }
1659
1660 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1661 struct ucred *ucred)
1662 {
1663 ucred->pid = pid_vnr(pid);
1664 ucred->uid = ucred->gid = -1;
1665 if (cred) {
1666 struct user_namespace *current_ns = current_user_ns();
1667
1668 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1669 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1670 }
1671 }
1672
1673 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1674 {
1675 struct user_namespace *user_ns = current_user_ns();
1676 int i;
1677
1678 for (i = 0; i < src->ngroups; i++) {
1679 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1680
1681 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1682 return -EFAULT;
1683 }
1684
1685 return 0;
1686 }
1687
1688 int sk_getsockopt(struct sock *sk, int level, int optname,
1689 sockptr_t optval, sockptr_t optlen)
1690 {
1691 struct socket *sock = sk->sk_socket;
1692
1693 union {
1694 int val;
1695 u64 val64;
1696 unsigned long ulval;
1697 struct linger ling;
1698 struct old_timeval32 tm32;
1699 struct __kernel_old_timeval tm;
1700 struct __kernel_sock_timeval stm;
1701 struct sock_txtime txtime;
1702 struct so_timestamping timestamping;
1703 } v;
1704
1705 int lv = sizeof(int);
1706 int len;
1707
1708 if (copy_from_sockptr(&len, optlen, sizeof(int)))
1709 return -EFAULT;
1710 if (len < 0)
1711 return -EINVAL;
1712
1713 memset(&v, 0, sizeof(v));
1714
1715 switch (optname) {
1716 case SO_DEBUG:
1717 v.val = sock_flag(sk, SOCK_DBG);
1718 break;
1719
1720 case SO_DONTROUTE:
1721 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1722 break;
1723
1724 case SO_BROADCAST:
1725 v.val = sock_flag(sk, SOCK_BROADCAST);
1726 break;
1727
1728 case SO_SNDBUF:
1729 v.val = READ_ONCE(sk->sk_sndbuf);
1730 break;
1731
1732 case SO_RCVBUF:
1733 v.val = READ_ONCE(sk->sk_rcvbuf);
1734 break;
1735
1736 case SO_REUSEADDR:
1737 v.val = sk->sk_reuse;
1738 break;
1739
1740 case SO_REUSEPORT:
1741 v.val = sk->sk_reuseport;
1742 break;
1743
1744 case SO_KEEPALIVE:
1745 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1746 break;
1747
1748 case SO_TYPE:
1749 v.val = sk->sk_type;
1750 break;
1751
1752 case SO_PROTOCOL:
1753 v.val = sk->sk_protocol;
1754 break;
1755
1756 case SO_DOMAIN:
1757 v.val = sk->sk_family;
1758 break;
1759
1760 case SO_ERROR:
1761 v.val = -sock_error(sk);
1762 if (v.val == 0)
1763 v.val = xchg(&sk->sk_err_soft, 0);
1764 break;
1765
1766 case SO_OOBINLINE:
1767 v.val = sock_flag(sk, SOCK_URGINLINE);
1768 break;
1769
1770 case SO_NO_CHECK:
1771 v.val = sk->sk_no_check_tx;
1772 break;
1773
1774 case SO_PRIORITY:
1775 v.val = READ_ONCE(sk->sk_priority);
1776 break;
1777
1778 case SO_LINGER:
1779 lv = sizeof(v.ling);
1780 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1781 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1782 break;
1783
1784 case SO_BSDCOMPAT:
1785 break;
1786
1787 case SO_TIMESTAMP_OLD:
1788 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1789 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1790 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1791 break;
1792
1793 case SO_TIMESTAMPNS_OLD:
1794 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1795 break;
1796
1797 case SO_TIMESTAMP_NEW:
1798 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1799 break;
1800
1801 case SO_TIMESTAMPNS_NEW:
1802 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1803 break;
1804
1805 case SO_TIMESTAMPING_OLD:
1806 case SO_TIMESTAMPING_NEW:
1807 lv = sizeof(v.timestamping);
1808 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1809 * returning the flags when they were set through the same option.
1810 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1811 */
1812 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1813 v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1814 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1815 }
1816 break;
1817
1818 case SO_RCVTIMEO_OLD:
1819 case SO_RCVTIMEO_NEW:
1820 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1821 SO_RCVTIMEO_OLD == optname);
1822 break;
1823
1824 case SO_SNDTIMEO_OLD:
1825 case SO_SNDTIMEO_NEW:
1826 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1827 SO_SNDTIMEO_OLD == optname);
1828 break;
1829
1830 case SO_RCVLOWAT:
1831 v.val = READ_ONCE(sk->sk_rcvlowat);
1832 break;
1833
1834 case SO_SNDLOWAT:
1835 v.val = 1;
1836 break;
1837
1838 case SO_PASSCRED:
1839 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1840 break;
1841
1842 case SO_PASSPIDFD:
1843 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1844 break;
1845
1846 case SO_PEERCRED:
1847 {
1848 struct ucred peercred;
1849 if (len > sizeof(peercred))
1850 len = sizeof(peercred);
1851
1852 spin_lock(&sk->sk_peer_lock);
1853 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1854 spin_unlock(&sk->sk_peer_lock);
1855
1856 if (copy_to_sockptr(optval, &peercred, len))
1857 return -EFAULT;
1858 goto lenout;
1859 }
1860
1861 case SO_PEERPIDFD:
1862 {
1863 struct pid *peer_pid;
1864 struct file *pidfd_file = NULL;
1865 int pidfd;
1866
1867 if (len > sizeof(pidfd))
1868 len = sizeof(pidfd);
1869
1870 spin_lock(&sk->sk_peer_lock);
1871 peer_pid = get_pid(sk->sk_peer_pid);
1872 spin_unlock(&sk->sk_peer_lock);
1873
1874 if (!peer_pid)
1875 return -ENODATA;
1876
1877 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1878 put_pid(peer_pid);
1879 if (pidfd < 0)
1880 return pidfd;
1881
1882 if (copy_to_sockptr(optval, &pidfd, len) ||
1883 copy_to_sockptr(optlen, &len, sizeof(int))) {
1884 put_unused_fd(pidfd);
1885 fput(pidfd_file);
1886
1887 return -EFAULT;
1888 }
1889
1890 fd_install(pidfd, pidfd_file);
1891 return 0;
1892 }
1893
1894 case SO_PEERGROUPS:
1895 {
1896 const struct cred *cred;
1897 int ret, n;
1898
1899 cred = sk_get_peer_cred(sk);
1900 if (!cred)
1901 return -ENODATA;
1902
1903 n = cred->group_info->ngroups;
1904 if (len < n * sizeof(gid_t)) {
1905 len = n * sizeof(gid_t);
1906 put_cred(cred);
1907 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1908 }
1909 len = n * sizeof(gid_t);
1910
1911 ret = groups_to_user(optval, cred->group_info);
1912 put_cred(cred);
1913 if (ret)
1914 return ret;
1915 goto lenout;
1916 }
1917
1918 case SO_PEERNAME:
1919 {
1920 struct sockaddr_storage address;
1921
1922 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1923 if (lv < 0)
1924 return -ENOTCONN;
1925 if (lv < len)
1926 return -EINVAL;
1927 if (copy_to_sockptr(optval, &address, len))
1928 return -EFAULT;
1929 goto lenout;
1930 }
1931
1932 /* Dubious BSD thing... Probably nobody even uses it, but
1933 * the UNIX standard wants it for whatever reason... -DaveM
1934 */
1935 case SO_ACCEPTCONN:
1936 v.val = sk->sk_state == TCP_LISTEN;
1937 break;
1938
1939 case SO_PASSSEC:
1940 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1941 break;
1942
1943 case SO_PEERSEC:
1944 return security_socket_getpeersec_stream(sock,
1945 optval, optlen, len);
1946
1947 case SO_MARK:
1948 v.val = READ_ONCE(sk->sk_mark);
1949 break;
1950
1951 case SO_RCVMARK:
1952 v.val = sock_flag(sk, SOCK_RCVMARK);
1953 break;
1954
1955 case SO_RXQ_OVFL:
1956 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1957 break;
1958
1959 case SO_WIFI_STATUS:
1960 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1961 break;
1962
1963 case SO_PEEK_OFF:
1964 if (!READ_ONCE(sock->ops)->set_peek_off)
1965 return -EOPNOTSUPP;
1966
1967 v.val = READ_ONCE(sk->sk_peek_off);
1968 break;
1969 case SO_NOFCS:
1970 v.val = sock_flag(sk, SOCK_NOFCS);
1971 break;
1972
1973 case SO_BINDTODEVICE:
1974 return sock_getbindtodevice(sk, optval, optlen, len);
1975
1976 case SO_GET_FILTER:
1977 len = sk_get_filter(sk, optval, len);
1978 if (len < 0)
1979 return len;
1980
1981 goto lenout;
1982
1983 case SO_LOCK_FILTER:
1984 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1985 break;
1986
1987 case SO_BPF_EXTENSIONS:
1988 v.val = bpf_tell_extensions();
1989 break;
1990
1991 case SO_SELECT_ERR_QUEUE:
1992 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1993 break;
1994
1995 #ifdef CONFIG_NET_RX_BUSY_POLL
1996 case SO_BUSY_POLL:
1997 v.val = READ_ONCE(sk->sk_ll_usec);
1998 break;
1999 case SO_PREFER_BUSY_POLL:
2000 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2001 break;
2002 #endif
2003
2004 case SO_MAX_PACING_RATE:
2005 /* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
2006 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2007 lv = sizeof(v.ulval);
2008 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2009 } else {
2010 /* 32bit version */
2011 v.val = min_t(unsigned long, ~0U,
2012 READ_ONCE(sk->sk_max_pacing_rate));
2013 }
2014 break;
2015
2016 case SO_INCOMING_CPU:
2017 v.val = READ_ONCE(sk->sk_incoming_cpu);
2018 break;
2019
2020 case SO_MEMINFO:
2021 {
2022 u32 meminfo[SK_MEMINFO_VARS];
2023
2024 sk_get_meminfo(sk, meminfo);
2025
2026 len = min_t(unsigned int, len, sizeof(meminfo));
2027 if (copy_to_sockptr(optval, &meminfo, len))
2028 return -EFAULT;
2029
2030 goto lenout;
2031 }
2032
2033 #ifdef CONFIG_NET_RX_BUSY_POLL
2034 case SO_INCOMING_NAPI_ID:
2035 v.val = READ_ONCE(sk->sk_napi_id);
2036
2037 /* aggregate non-NAPI IDs down to 0 */
2038 if (v.val < MIN_NAPI_ID)
2039 v.val = 0;
2040
2041 break;
2042 #endif
2043
2044 case SO_COOKIE:
2045 lv = sizeof(u64);
2046 if (len < lv)
2047 return -EINVAL;
2048 v.val64 = sock_gen_cookie(sk);
2049 break;
2050
2051 case SO_ZEROCOPY:
2052 v.val = sock_flag(sk, SOCK_ZEROCOPY);
2053 break;
2054
2055 case SO_TXTIME:
2056 lv = sizeof(v.txtime);
2057 v.txtime.clockid = sk->sk_clockid;
2058 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2059 SOF_TXTIME_DEADLINE_MODE : 0;
2060 v.txtime.flags |= sk->sk_txtime_report_errors ?
2061 SOF_TXTIME_REPORT_ERRORS : 0;
2062 break;
2063
2064 case SO_BINDTOIFINDEX:
2065 v.val = READ_ONCE(sk->sk_bound_dev_if);
2066 break;
2067
2068 case SO_NETNS_COOKIE:
2069 lv = sizeof(u64);
2070 if (len != lv)
2071 return -EINVAL;
2072 v.val64 = sock_net(sk)->net_cookie;
2073 break;
2074
2075 case SO_BUF_LOCK:
2076 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2077 break;
2078
2079 case SO_RESERVE_MEM:
2080 v.val = READ_ONCE(sk->sk_reserved_mem);
2081 break;
2082
2083 case SO_TXREHASH:
2084 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2085 v.val = READ_ONCE(sk->sk_txrehash);
2086 break;
2087
2088 default:
2089 /* We implement the SO_SNDLOWAT etc to not be settable
2090 * (1003.1g 7).
2091 */
2092 return -ENOPROTOOPT;
2093 }
2094
2095 if (len > lv)
2096 len = lv;
2097 if (copy_to_sockptr(optval, &v, len))
2098 return -EFAULT;
2099 lenout:
2100 if (copy_to_sockptr(optlen, &len, sizeof(int)))
2101 return -EFAULT;
2102 return 0;
2103 }
2104
2105 /*
2106 * Initialize an sk_lock.
2107 *
2108 * (We also register the sk_lock with the lock validator.)
2109 */
2110 static inline void sock_lock_init(struct sock *sk)
2111 {
2112 sk_owner_clear(sk);
2113
2114 if (sk->sk_kern_sock)
2115 sock_lock_init_class_and_name(
2116 sk,
2117 af_family_kern_slock_key_strings[sk->sk_family],
2118 af_family_kern_slock_keys + sk->sk_family,
2119 af_family_kern_key_strings[sk->sk_family],
2120 af_family_kern_keys + sk->sk_family);
2121 else
2122 sock_lock_init_class_and_name(
2123 sk,
2124 af_family_slock_key_strings[sk->sk_family],
2125 af_family_slock_keys + sk->sk_family,
2126 af_family_key_strings[sk->sk_family],
2127 af_family_keys + sk->sk_family);
2128 }
2129
2130 /*
2131 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2132 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2133 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2134 */
2135 static void sock_copy(struct sock *nsk, const struct sock *osk)
2136 {
2137 const struct proto *prot = READ_ONCE(osk->sk_prot);
2138 #ifdef CONFIG_SECURITY_NETWORK
2139 void *sptr = nsk->sk_security;
2140 #endif
2141
2142 /* If we move sk_tx_queue_mapping out of the private section,
2143 * we must check if sk_tx_queue_clear() is called after
2144 * sock_copy() in sk_clone_lock().
2145 */
2146 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2147 offsetof(struct sock, sk_dontcopy_begin) ||
2148 offsetof(struct sock, sk_tx_queue_mapping) >=
2149 offsetof(struct sock, sk_dontcopy_end));
2150
2151 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2152
2153 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2154 prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2155 /* alloc is larger than struct, see sk_prot_alloc() */);
2156
2157 #ifdef CONFIG_SECURITY_NETWORK
2158 nsk->sk_security = sptr;
2159 security_sk_clone(osk, nsk);
2160 #endif
2161 }
2162
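/* Allocate the protocol-private struct sock, either from the protocol's
 * dedicated slab cache (prot->slab) or with kmalloc(). The LSM state is
 * set up and a reference on the owning module is taken; both are undone
 * on failure and NULL is returned.
 */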
2163 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2164 int family)
2165 {
2166 struct sock *sk;
2167 struct kmem_cache *slab;
2168
2169 slab = prot->slab;
2170 if (slab != NULL) {
2171 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2172 if (!sk)
2173 return sk;
2174 if (want_init_on_alloc(priority))
2175 sk_prot_clear_nulls(sk, prot->obj_size);
2176 } else
2177 sk = kmalloc(prot->obj_size, priority);
2178
2179 if (sk != NULL) {
2180 if (security_sk_alloc(sk, family, priority))
2181 goto out_free;
2182
2183 if (!try_module_get(prot->owner))
2184 goto out_free_sec;
2185 }
2186
2187 return sk;
2188
2189 out_free_sec:
2190 security_sk_free(sk);
2191 out_free:
2192 if (slab != NULL)
2193 kmem_cache_free(slab, sk);
2194 else
2195 kfree(sk);
2196 return NULL;
2197 }
2198
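/* Counterpart of sk_prot_alloc(): release cgroup/memcg and LSM state,
 * return the object to its slab cache (or kfree() it) and drop the
 * module reference taken at allocation time.
 */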
2199 static void sk_prot_free(struct proto *prot, struct sock *sk)
2200 {
2201 struct kmem_cache *slab;
2202 struct module *owner;
2203
2204 owner = prot->owner;
2205 slab = prot->slab;
2206
2207 cgroup_sk_free(&sk->sk_cgrp_data);
2208 mem_cgroup_sk_free(sk);
2209 trace_android_vh_sk_free(sk);
2210 security_sk_free(sk);
2211
2212 sk_owner_put(sk);
2213
2214 if (slab != NULL)
2215 kmem_cache_free(slab, sk);
2216 else
2217 kfree(sk);
2218 module_put(owner);
2219 }
2220
2221 /**
2222 * sk_alloc - All socket objects are allocated here
2223 * @net: the applicable net namespace
2224 * @family: protocol family
2225 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2226 * @prot: struct proto associated with this new sock instance
2227 * @kern: is this to be a kernel socket?
2228 */
2229 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2230 struct proto *prot, int kern)
2231 {
2232 struct sock *sk;
2233
2234 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2235 if (sk) {
2236 sk->sk_family = family;
2237 /*
2238 * See comment in struct sock definition to understand
2239 * why we need sk_prot_creator -acme
2240 */
2241 sk->sk_prot = sk->sk_prot_creator = prot;
2242 sk->sk_kern_sock = kern;
2243 sock_lock_init(sk);
2244 sk->sk_net_refcnt = kern ? 0 : 1;
2245 if (likely(sk->sk_net_refcnt)) {
2246 get_net_track(net, &sk->ns_tracker, priority);
2247 sock_inuse_add(net, 1);
2248 } else {
2249 net_passive_inc(net);
2250 __netns_tracker_alloc(net, &sk->ns_tracker,
2251 false, priority);
2252 }
2253
2254 sock_net_set(sk, net);
2255 refcount_set(&sk->sk_wmem_alloc, 1);
2256
2257 mem_cgroup_sk_alloc(sk);
2258 trace_android_vh_sk_alloc(sk);
2259 cgroup_sk_alloc(&sk->sk_cgrp_data);
2260 sock_update_classid(&sk->sk_cgrp_data);
2261 sock_update_netprioidx(&sk->sk_cgrp_data);
2262 sk_tx_queue_clear(sk);
2263 }
2264
2265 return sk;
2266 }
2267 EXPORT_SYMBOL(sk_alloc);
2268
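/* Usage sketch (illustrative only, not taken from a specific protocol):
 * a protocol family's ->create() handler normally pairs sk_alloc() with
 * sock_init_data(); "my_proto" below is a hypothetical struct proto.
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * If later setup fails before the socket is published, it is typically
 * dropped again via sk_common_release().
 */
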
2269 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2270 * grace period. This is the case for UDP sockets and TCP listeners.
2271 */
2272 static void __sk_destruct(struct rcu_head *head)
2273 {
2274 struct sock *sk = container_of(head, struct sock, sk_rcu);
2275 struct net *net = sock_net(sk);
2276 struct sk_filter *filter;
2277
2278 if (sk->sk_destruct)
2279 sk->sk_destruct(sk);
2280
2281 filter = rcu_dereference_check(sk->sk_filter,
2282 refcount_read(&sk->sk_wmem_alloc) == 0);
2283 if (filter) {
2284 sk_filter_uncharge(sk, filter);
2285 RCU_INIT_POINTER(sk->sk_filter, NULL);
2286 }
2287
2288 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2289
2290 #ifdef CONFIG_BPF_SYSCALL
2291 bpf_sk_storage_free(sk);
2292 #endif
2293
2294 if (atomic_read(&sk->sk_omem_alloc))
2295 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2296 __func__, atomic_read(&sk->sk_omem_alloc));
2297
2298 if (sk->sk_frag.page) {
2299 put_page(sk->sk_frag.page);
2300 sk->sk_frag.page = NULL;
2301 }
2302
2303 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2304 put_cred(sk->sk_peer_cred);
2305 put_pid(sk->sk_peer_pid);
2306
2307 if (likely(sk->sk_net_refcnt)) {
2308 put_net_track(net, &sk->ns_tracker);
2309 } else {
2310 __netns_tracker_free(net, &sk->ns_tracker, false);
2311 net_passive_dec(net);
2312 }
2313 sk_prot_free(sk->sk_prot_creator, sk);
2314 }
2315
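/* Convert a kernel socket that only holds a passive netns tracker into
 * one that owns a real netns reference and is accounted in sock_inuse.
 */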
2316 void sk_net_refcnt_upgrade(struct sock *sk)
2317 {
2318 struct net *net = sock_net(sk);
2319
2320 WARN_ON_ONCE(sk->sk_net_refcnt);
2321 __netns_tracker_free(net, &sk->ns_tracker, false);
2322 net_passive_dec(net);
2323 sk->sk_net_refcnt = 1;
2324 get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2325 sock_inuse_add(net, 1);
2326 }
2327 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2328
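/* Final teardown entry point: defer to an RCU callback when the socket
 * is SOCK_RCU_FREE or still attached to a reuseport group, otherwise
 * destruct immediately.
 */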
2329 void sk_destruct(struct sock *sk)
2330 {
2331 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2332
2333 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2334 reuseport_detach_sock(sk);
2335 use_call_rcu = true;
2336 }
2337
2338 if (use_call_rcu)
2339 call_rcu(&sk->sk_rcu, __sk_destruct);
2340 else
2341 __sk_destruct(&sk->sk_rcu);
2342 }
2343
2344 static void __sk_free(struct sock *sk)
2345 {
2346 if (likely(sk->sk_net_refcnt))
2347 sock_inuse_add(sock_net(sk), -1);
2348
2349 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2350 sock_diag_broadcast_destroy(sk);
2351 else
2352 sk_destruct(sk);
2353 }
2354
2355 void sk_free(struct sock *sk)
2356 {
2357 /*
2358 * We subtract one from sk_wmem_alloc and can thus tell whether
2359 * some packets are still in some tx queue.
2360 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
2361 */
2362 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2363 __sk_free(sk);
2364 }
2365 EXPORT_SYMBOL(sk_free);
2366
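/* Queue and lockdep-class initialization shared by freshly allocated
 * and cloned sockets.
 */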
2367 static void sk_init_common(struct sock *sk)
2368 {
2369 skb_queue_head_init(&sk->sk_receive_queue);
2370 skb_queue_head_init(&sk->sk_write_queue);
2371 skb_queue_head_init(&sk->sk_error_queue);
2372
2373 rwlock_init(&sk->sk_callback_lock);
2374 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2375 af_rlock_keys + sk->sk_family,
2376 af_family_rlock_key_strings[sk->sk_family]);
2377 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2378 af_wlock_keys + sk->sk_family,
2379 af_family_wlock_key_strings[sk->sk_family]);
2380 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2381 af_elock_keys + sk->sk_family,
2382 af_family_elock_key_strings[sk->sk_family]);
2383 if (sk->sk_kern_sock)
2384 lockdep_set_class_and_name(&sk->sk_callback_lock,
2385 af_kern_callback_keys + sk->sk_family,
2386 af_family_kern_clock_key_strings[sk->sk_family]);
2387 else
2388 lockdep_set_class_and_name(&sk->sk_callback_lock,
2389 af_callback_keys + sk->sk_family,
2390 af_family_clock_key_strings[sk->sk_family]);
2391 }
2392
2393 /**
2394 * sk_clone_lock - clone a socket, and lock its clone
2395 * @sk: the socket to clone
2396 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2397 *
2398 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2399 */
2400 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2401 {
2402 struct proto *prot = READ_ONCE(sk->sk_prot);
2403 struct sk_filter *filter;
2404 bool is_charged = true;
2405 struct sock *newsk;
2406
2407 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2408 if (!newsk)
2409 goto out;
2410
2411 sock_copy(newsk, sk);
2412 trace_android_vh_sk_clone_lock(newsk);
2413
2414 newsk->sk_prot_creator = prot;
2415
2416 /* SANITY */
2417 if (likely(newsk->sk_net_refcnt)) {
2418 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2419 sock_inuse_add(sock_net(newsk), 1);
2420 } else {
2421 /* Kernel sockets are not elevating the struct net refcount.
2422 * Instead, use a tracker to more easily detect if a layer
2423 * is not properly dismantling its kernel sockets at netns
2424 * destroy time.
2425 */
2426 net_passive_inc(sock_net(newsk));
2427 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2428 false, priority);
2429 }
2430 sk_node_init(&newsk->sk_node);
2431 sock_lock_init(newsk);
2432 bh_lock_sock(newsk);
2433 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2434 newsk->sk_backlog.len = 0;
2435
2436 atomic_set(&newsk->sk_rmem_alloc, 0);
2437
2438 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2439 refcount_set(&newsk->sk_wmem_alloc, 1);
2440
2441 atomic_set(&newsk->sk_omem_alloc, 0);
2442 sk_init_common(newsk);
2443
2444 newsk->sk_dst_cache = NULL;
2445 newsk->sk_dst_pending_confirm = 0;
2446 newsk->sk_wmem_queued = 0;
2447 newsk->sk_forward_alloc = 0;
2448 newsk->sk_reserved_mem = 0;
2449 atomic_set(&newsk->sk_drops, 0);
2450 newsk->sk_send_head = NULL;
2451 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2452 atomic_set(&newsk->sk_zckey, 0);
2453
2454 sock_reset_flag(newsk, SOCK_DONE);
2455
2456 /* sk->sk_memcg will be populated at accept() time */
2457 newsk->sk_memcg = NULL;
2458
2459 cgroup_sk_clone(&newsk->sk_cgrp_data);
2460
2461 rcu_read_lock();
2462 filter = rcu_dereference(sk->sk_filter);
2463 if (filter != NULL)
2464 /* though it's an empty new sock, the charging may fail
2465 * if sysctl_optmem_max was changed between creation of
2466 * original socket and cloning
2467 */
2468 is_charged = sk_filter_charge(newsk, filter);
2469 RCU_INIT_POINTER(newsk->sk_filter, filter);
2470 rcu_read_unlock();
2471
2472 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2473 /* We need to make sure that we don't uncharge the new
2474 * socket if we couldn't charge it in the first place
2475 * as otherwise we uncharge the parent's filter.
2476 */
2477 if (!is_charged)
2478 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2479 sk_free_unlock_clone(newsk);
2480 newsk = NULL;
2481 goto out;
2482 }
2483 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2484
2485 if (bpf_sk_storage_clone(sk, newsk)) {
2486 sk_free_unlock_clone(newsk);
2487 newsk = NULL;
2488 goto out;
2489 }
2490
2491 /* Clear sk_user_data if parent had the pointer tagged
2492 * as not suitable for copying when cloning.
2493 */
2494 if (sk_user_data_is_nocopy(newsk))
2495 newsk->sk_user_data = NULL;
2496
2497 newsk->sk_err = 0;
2498 newsk->sk_err_soft = 0;
2499 newsk->sk_priority = 0;
2500 newsk->sk_incoming_cpu = raw_smp_processor_id();
2501
2502 /* Before updating sk_refcnt, we must commit prior changes to memory
2503 * (Documentation/RCU/rculist_nulls.rst for details)
2504 */
2505 smp_wmb();
2506 refcount_set(&newsk->sk_refcnt, 2);
2507
2508 sk_set_socket(newsk, NULL);
2509 sk_tx_queue_clear(newsk);
2510 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2511
2512 if (newsk->sk_prot->sockets_allocated)
2513 sk_sockets_allocated_inc(newsk);
2514
2515 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2516 net_enable_timestamp();
2517 out:
2518 return newsk;
2519 }
2520 EXPORT_SYMBOL_GPL(sk_clone_lock);
2521
2522 void sk_free_unlock_clone(struct sock *sk)
2523 {
2524 /* It is still a raw copy of the parent, so invalidate
2525 * the destructor and do a plain sk_free() */
2526 sk->sk_destruct = NULL;
2527 bh_unlock_sock(sk);
2528 sk_free(sk);
2529 }
2530 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2531
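/* Derive the GSO size limit from the route's output device, clamping
 * non-TCP sockets to the legacy limit and leaving room for headers.
 */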
2532 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2533 {
2534 bool is_ipv6 = false;
2535 u32 max_size;
2536
2537 #if IS_ENABLED(CONFIG_IPV6)
2538 is_ipv6 = (sk->sk_family == AF_INET6 &&
2539 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2540 #endif
2541 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2542 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2543 READ_ONCE(dst->dev->gso_ipv4_max_size);
2544 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2545 max_size = GSO_LEGACY_MAX_SIZE;
2546
2547 return max_size - (MAX_TCP_HEADER + 1);
2548 }
2549
2550 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2551 {
2552 u32 max_segs = 1;
2553
2554 sk->sk_route_caps = dst->dev->features;
2555 if (sk_is_tcp(sk))
2556 sk->sk_route_caps |= NETIF_F_GSO;
2557 if (sk->sk_route_caps & NETIF_F_GSO)
2558 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2559 if (unlikely(sk->sk_gso_disabled))
2560 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2561 if (sk_can_gso(sk)) {
2562 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2563 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2564 } else {
2565 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2566 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2567 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2568 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2569 }
2570 }
2571 sk->sk_gso_max_segs = max_segs;
2572 sk_dst_set(sk, dst);
2573 }
2574 EXPORT_SYMBOL_GPL(sk_setup_caps);
2575
2576 /*
2577 * Simple resource managers for sockets.
2578 */
2579
2580
2581 /*
2582 * Write buffer destructor automatically called from kfree_skb.
2583 */
2584 void sock_wfree(struct sk_buff *skb)
2585 {
2586 struct sock *sk = skb->sk;
2587 unsigned int len = skb->truesize;
2588 bool free;
2589
2590 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2591 if (sock_flag(sk, SOCK_RCU_FREE) &&
2592 sk->sk_write_space == sock_def_write_space) {
2593 rcu_read_lock();
2594 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2595 sock_def_write_space_wfree(sk);
2596 rcu_read_unlock();
2597 if (unlikely(free))
2598 __sk_free(sk);
2599 return;
2600 }
2601
2602 /*
2603 * Keep a reference on sk_wmem_alloc; it will be released
2604 * after the sk_write_space() call.
2605 */
2606 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2607 sk->sk_write_space(sk);
2608 len = 1;
2609 }
2610 /*
2611 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2612 * could not do because of in-flight packets
2613 */
2614 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2615 __sk_free(sk);
2616 }
2617 EXPORT_SYMBOL(sock_wfree);
2618
2619 /* This variant of sock_wfree() is used by TCP,
2620 * since it sets SOCK_USE_WRITE_QUEUE.
2621 */
2622 void __sock_wfree(struct sk_buff *skb)
2623 {
2624 struct sock *sk = skb->sk;
2625
2626 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2627 __sk_free(sk);
2628 }
2629
2630 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2631 {
2632 skb_orphan(skb);
2633 skb->sk = sk;
2634 #ifdef CONFIG_INET
2635 if (unlikely(!sk_fullsock(sk))) {
2636 skb->destructor = sock_edemux;
2637 sock_hold(sk);
2638 return;
2639 }
2640 #endif
2641 skb->destructor = sock_wfree;
2642 skb_set_hash_from_sk(skb, sk);
2643 /*
2644 * We used to take a refcount on sk, but following operation
2645 * is enough to guarantee sk_free() won't free this sock until
2646 * all in-flight packets are completed
2647 */
2648 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2649 }
2650 EXPORT_SYMBOL(skb_set_owner_w);
2651
2652 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2653 {
2654 /* Drivers depend on in-order delivery for crypto offload,
2655 * partial orphan breaks out-of-order-OK logic.
2656 */
2657 if (skb_is_decrypted(skb))
2658 return false;
2659
2660 return (skb->destructor == sock_wfree ||
2661 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2662 }
2663
2664 /* This helper is used by netem, as it can hold packets in its
2665 * delay queue. We want to allow the owner socket to send more
2666 * packets, as if they were already TX completed by a typical driver.
2667 * But we also want to keep skb->sk set because some packet schedulers
2668 * rely on it (sch_fq for example).
2669 */
2670 void skb_orphan_partial(struct sk_buff *skb)
2671 {
2672 if (skb_is_tcp_pure_ack(skb))
2673 return;
2674
2675 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2676 return;
2677
2678 skb_orphan(skb);
2679 }
2680 EXPORT_SYMBOL(skb_orphan_partial);
2681
2682 /*
2683 * Read buffer destructor automatically called from kfree_skb.
2684 */
2685 void sock_rfree(struct sk_buff *skb)
2686 {
2687 struct sock *sk = skb->sk;
2688 unsigned int len = skb->truesize;
2689
2690 atomic_sub(len, &sk->sk_rmem_alloc);
2691 sk_mem_uncharge(sk, len);
2692 }
2693 EXPORT_SYMBOL(sock_rfree);
2694
2695 /*
2696 * Buffer destructor for skbs that are not used directly in read or write
2697 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2698 */
2699 void sock_efree(struct sk_buff *skb)
2700 {
2701 sock_put(skb->sk);
2702 }
2703 EXPORT_SYMBOL(sock_efree);
2704
2705 /* Buffer destructor for prefetch/receive path where reference count may
2706 * not be held, e.g. for listen sockets.
2707 */
2708 #ifdef CONFIG_INET
2709 void sock_pfree(struct sk_buff *skb)
2710 {
2711 struct sock *sk = skb->sk;
2712
2713 if (!sk_is_refcounted(sk))
2714 return;
2715
2716 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2717 inet_reqsk(sk)->rsk_listener = NULL;
2718 reqsk_free(inet_reqsk(sk));
2719 return;
2720 }
2721
2722 sock_gen_put(sk);
2723 }
2724 EXPORT_SYMBOL(sock_pfree);
2725 #endif /* CONFIG_INET */
2726
2727 kuid_t sock_i_uid(struct sock *sk)
2728 {
2729 kuid_t uid;
2730
2731 read_lock_bh(&sk->sk_callback_lock);
2732 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2733 read_unlock_bh(&sk->sk_callback_lock);
2734 return uid;
2735 }
2736 EXPORT_SYMBOL(sock_i_uid);
2737
2738 unsigned long __sock_i_ino(struct sock *sk)
2739 {
2740 unsigned long ino;
2741
2742 read_lock(&sk->sk_callback_lock);
2743 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2744 read_unlock(&sk->sk_callback_lock);
2745 return ino;
2746 }
2747 EXPORT_SYMBOL(__sock_i_ino);
2748
2749 unsigned long sock_i_ino(struct sock *sk)
2750 {
2751 unsigned long ino;
2752
2753 local_bh_disable();
2754 ino = __sock_i_ino(sk);
2755 local_bh_enable();
2756 return ino;
2757 }
2758 EXPORT_SYMBOL(sock_i_ino);
2759
2760 /*
2761 * Allocate a skb from the socket's send buffer.
2762 */
2763 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2764 gfp_t priority)
2765 {
2766 if (force ||
2767 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2768 struct sk_buff *skb = alloc_skb(size, priority);
2769
2770 if (skb) {
2771 skb_set_owner_w(skb, sk);
2772 return skb;
2773 }
2774 }
2775 return NULL;
2776 }
2777 EXPORT_SYMBOL(sock_wmalloc);
2778
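/* Option-memory skbs: sock_omalloc() charges the skb to sk_omem_alloc
 * (bounded by the optmem_max sysctl) and sock_ofree() is its destructor.
 */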
2779 static void sock_ofree(struct sk_buff *skb)
2780 {
2781 struct sock *sk = skb->sk;
2782
2783 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2784 }
2785
2786 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2787 gfp_t priority)
2788 {
2789 struct sk_buff *skb;
2790
2791 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2792 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2793 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2794 return NULL;
2795
2796 skb = alloc_skb(size, priority);
2797 if (!skb)
2798 return NULL;
2799
2800 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2801 skb->sk = sk;
2802 skb->destructor = sock_ofree;
2803 return skb;
2804 }
2805
2806 /*
2807 * Allocate a memory block from the socket's option memory buffer.
2808 */
2809 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2810 {
2811 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2812
2813 if ((unsigned int)size <= optmem_max &&
2814 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2815 void *mem;
2816 /* First do the add, to avoid the race if kmalloc
2817 * might sleep.
2818 */
2819 atomic_add(size, &sk->sk_omem_alloc);
2820 mem = kmalloc(size, priority);
2821 if (mem)
2822 return mem;
2823 atomic_sub(size, &sk->sk_omem_alloc);
2824 }
2825 return NULL;
2826 }
2827 EXPORT_SYMBOL(sock_kmalloc);
2828
2829 /* Free an option memory block. Note, we actually want the inline
2830 * here as this allows gcc to detect the nullify and fold away the
2831 * condition entirely.
2832 */
2833 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2834 const bool nullify)
2835 {
2836 if (WARN_ON_ONCE(!mem))
2837 return;
2838 if (nullify)
2839 kfree_sensitive(mem);
2840 else
2841 kfree(mem);
2842 atomic_sub(size, &sk->sk_omem_alloc);
2843 }
2844
2845 void sock_kfree_s(struct sock *sk, void *mem, int size)
2846 {
2847 __sock_kfree_s(sk, mem, size, false);
2848 }
2849 EXPORT_SYMBOL(sock_kfree_s);
2850
2851 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2852 {
2853 __sock_kfree_s(sk, mem, size, true);
2854 }
2855 EXPORT_SYMBOL(sock_kzfree_s);
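
/* Usage sketch (illustrative only): callers pair sock_kmalloc() with
 * sock_kfree_s() or sock_kzfree_s(), passing the same size so that
 * sk_omem_alloc stays balanced:
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */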
2856
2857 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2858 I think these locks should be removed for datagram sockets.
2859 */
2860 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2861 {
2862 DEFINE_WAIT(wait);
2863
2864 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2865 for (;;) {
2866 if (!timeo)
2867 break;
2868 if (signal_pending(current))
2869 break;
2870 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2871 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2872 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2873 break;
2874 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2875 break;
2876 if (READ_ONCE(sk->sk_err))
2877 break;
2878 timeo = schedule_timeout(timeo);
2879 }
2880 finish_wait(sk_sleep(sk), &wait);
2881 return timeo;
2882 }
2883
2884
2885 /*
2886 * Generic send/receive buffer handlers
2887 */
2888
2889 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2890 unsigned long data_len, int noblock,
2891 int *errcode, int max_page_order)
2892 {
2893 struct sk_buff *skb;
2894 long timeo;
2895 int err;
2896
2897 timeo = sock_sndtimeo(sk, noblock);
2898 for (;;) {
2899 err = sock_error(sk);
2900 if (err != 0)
2901 goto failure;
2902
2903 err = -EPIPE;
2904 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2905 goto failure;
2906
2907 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2908 break;
2909
2910 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2911 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2912 err = -EAGAIN;
2913 if (!timeo)
2914 goto failure;
2915 if (signal_pending(current))
2916 goto interrupted;
2917 timeo = sock_wait_for_wmem(sk, timeo);
2918 }
2919 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2920 errcode, sk->sk_allocation);
2921 if (skb)
2922 skb_set_owner_w(skb, sk);
2923 return skb;
2924
2925 interrupted:
2926 err = sock_intr_errno(timeo);
2927 failure:
2928 *errcode = err;
2929 return NULL;
2930 }
2931 EXPORT_SYMBOL(sock_alloc_send_pskb);
2932
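/* Parse one SOL_SOCKET control message into the sockcm_cookie: SO_MARK
 * (privileged), SO_TIMESTAMPING_{OLD,NEW} tx flags and SCM_TXTIME are
 * handled here; SCM_RIGHTS/SCM_CREDENTIALS are accepted but processed
 * elsewhere.
 */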
2933 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2934 struct sockcm_cookie *sockc)
2935 {
2936 u32 tsflags;
2937
2938 switch (cmsg->cmsg_type) {
2939 case SO_MARK:
2940 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2941 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2942 return -EPERM;
2943 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2944 return -EINVAL;
2945 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2946 break;
2947 case SO_TIMESTAMPING_OLD:
2948 case SO_TIMESTAMPING_NEW:
2949 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2950 return -EINVAL;
2951
2952 tsflags = *(u32 *)CMSG_DATA(cmsg);
2953 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2954 return -EINVAL;
2955
2956 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2957 sockc->tsflags |= tsflags;
2958 break;
2959 case SCM_TXTIME:
2960 if (!sock_flag(sk, SOCK_TXTIME))
2961 return -EINVAL;
2962 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2963 return -EINVAL;
2964 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2965 break;
2966 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2967 case SCM_RIGHTS:
2968 case SCM_CREDENTIALS:
2969 break;
2970 default:
2971 return -EINVAL;
2972 }
2973 return 0;
2974 }
2975 EXPORT_SYMBOL(__sock_cmsg_send);
2976
2977 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2978 struct sockcm_cookie *sockc)
2979 {
2980 struct cmsghdr *cmsg;
2981 int ret;
2982
2983 for_each_cmsghdr(cmsg, msg) {
2984 if (!CMSG_OK(msg, cmsg))
2985 return -EINVAL;
2986 if (cmsg->cmsg_level != SOL_SOCKET)
2987 continue;
2988 ret = __sock_cmsg_send(sk, cmsg, sockc);
2989 if (ret)
2990 return ret;
2991 }
2992 return 0;
2993 }
2994 EXPORT_SYMBOL(sock_cmsg_send);
2995
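/* Per-protocol memory pressure hooks: entering is delegated to the
 * protocol; leaving either calls the protocol hook or clears the shared
 * memory_pressure flag directly.
 */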
2996 static void sk_enter_memory_pressure(struct sock *sk)
2997 {
2998 if (!sk->sk_prot->enter_memory_pressure)
2999 return;
3000
3001 sk->sk_prot->enter_memory_pressure(sk);
3002 }
3003
3004 static void sk_leave_memory_pressure(struct sock *sk)
3005 {
3006 if (sk->sk_prot->leave_memory_pressure) {
3007 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3008 tcp_leave_memory_pressure, sk);
3009 } else {
3010 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3011
3012 if (memory_pressure && READ_ONCE(*memory_pressure))
3013 WRITE_ONCE(*memory_pressure, 0);
3014 }
3015 }
3016
3017 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3018
3019 /**
3020 * skb_page_frag_refill - check that a page_frag contains enough room
3021 * @sz: minimum size of the fragment we want to get
3022 * @pfrag: pointer to page_frag
3023 * @gfp: priority for memory allocation
3024 *
3025 * Note: While this allocator tries to use high order pages, there is
3026 * no guarantee that allocations succeed. Therefore, @sz MUST be
3027 * less than or equal to PAGE_SIZE.
3028 */
3029 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3030 {
3031 if (pfrag->page) {
3032 if (page_ref_count(pfrag->page) == 1) {
3033 pfrag->offset = 0;
3034 return true;
3035 }
3036 if (pfrag->offset + sz <= pfrag->size)
3037 return true;
3038 put_page(pfrag->page);
3039 }
3040
3041 pfrag->offset = 0;
3042 if (SKB_FRAG_PAGE_ORDER &&
3043 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3044 /* Avoid direct reclaim but allow kswapd to wake */
3045 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3046 __GFP_COMP | __GFP_NOWARN |
3047 __GFP_NORETRY,
3048 SKB_FRAG_PAGE_ORDER);
3049 if (likely(pfrag->page)) {
3050 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3051 return true;
3052 }
3053 }
3054 pfrag->page = alloc_page(gfp);
3055 if (likely(pfrag->page)) {
3056 pfrag->size = PAGE_SIZE;
3057 return true;
3058 }
3059 return false;
3060 }
3061 EXPORT_SYMBOL(skb_page_frag_refill);
3062
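/* Refill the socket's page fragment; on failure enter memory pressure
 * and shrink the send buffer before reporting the allocation failure.
 */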
3063 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3064 {
3065 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3066 return true;
3067
3068 sk_enter_memory_pressure(sk);
3069 sk_stream_moderate_sndbuf(sk);
3070 return false;
3071 }
3072 EXPORT_SYMBOL(sk_page_frag_refill);
3073
3074 void __lock_sock(struct sock *sk)
3075 __releases(&sk->sk_lock.slock)
3076 __acquires(&sk->sk_lock.slock)
3077 {
3078 DEFINE_WAIT(wait);
3079
3080 for (;;) {
3081 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3082 TASK_UNINTERRUPTIBLE);
3083 spin_unlock_bh(&sk->sk_lock.slock);
3084 schedule();
3085 spin_lock_bh(&sk->sk_lock.slock);
3086 if (!sock_owned_by_user(sk))
3087 break;
3088 }
3089 finish_wait(&sk->sk_lock.wq, &wait);
3090 }
3091
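/* Process the backlog that softirq context queued while the socket was
 * owned by a process; sk_lock.slock is dropped around each batch so
 * that new packets can keep being appended.
 */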
3092 void __release_sock(struct sock *sk)
3093 __releases(&sk->sk_lock.slock)
3094 __acquires(&sk->sk_lock.slock)
3095 {
3096 struct sk_buff *skb, *next;
3097
3098 while ((skb = sk->sk_backlog.head) != NULL) {
3099 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3100
3101 spin_unlock_bh(&sk->sk_lock.slock);
3102
3103 do {
3104 next = skb->next;
3105 prefetch(next);
3106 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3107 skb_mark_not_on_list(skb);
3108 sk_backlog_rcv(sk, skb);
3109
3110 cond_resched();
3111
3112 skb = next;
3113 } while (skb != NULL);
3114
3115 spin_lock_bh(&sk->sk_lock.slock);
3116 }
3117
3118 /*
3119 * Doing the zeroing here guarantees we cannot loop forever
3120 * while a wild producer attempts to flood us.
3121 */
3122 sk->sk_backlog.len = 0;
3123 }
3124
3125 void __sk_flush_backlog(struct sock *sk)
3126 {
3127 spin_lock_bh(&sk->sk_lock.slock);
3128 __release_sock(sk);
3129
3130 if (sk->sk_prot->release_cb)
3131 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3132 tcp_release_cb, sk);
3133
3134 spin_unlock_bh(&sk->sk_lock.slock);
3135 }
3136 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3137
3138 /**
3139 * sk_wait_data - wait for data to arrive at sk_receive_queue
3140 * @sk: sock to wait on
3141 * @timeo: for how long
3142 * @skb: last skb seen on sk_receive_queue
3143 *
3144 * Now socket state including sk->sk_err is changed only under lock,
3145 * hence we may omit checks after joining wait queue.
3146 * We check receive queue before schedule() only as optimization;
3147 * it is very likely that release_sock() added new data.
3148 */
3149 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3150 {
3151 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3152 int rc;
3153
3154 add_wait_queue(sk_sleep(sk), &wait);
3155 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3156 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3157 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3158 remove_wait_queue(sk_sleep(sk), &wait);
3159 return rc;
3160 }
3161 EXPORT_SYMBOL(sk_wait_data);
3162
3163 /**
3164 * __sk_mem_raise_allocated - increase memory_allocated
3165 * @sk: socket
3166 * @size: memory size to allocate
3167 * @amt: pages to allocate
3168 * @kind: allocation type
3169 *
3170 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3171 *
3172 * Unlike the globally shared limits among the sockets under same protocol,
3173 * consuming the budget of a memcg won't have direct effect on other ones.
3174 * So be optimistic about memcg's tolerance, and leave the callers to decide
3175 * whether or not to raise allocated through sk_under_memory_pressure() or
3176 * its variants.
3177 */
3178 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3179 {
3180 struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3181 struct proto *prot = sk->sk_prot;
3182 bool charged = true;
3183 long allocated;
3184
3185 sk_memory_allocated_add(sk, amt);
3186 allocated = sk_memory_allocated(sk);
3187
3188 if (memcg) {
3189 charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge());
3190 if (!charged)
3191 goto suppress_allocation;
3192 }
3193
3194 /* Under limit. */
3195 if (allocated <= sk_prot_mem_limits(sk, 0)) {
3196 sk_leave_memory_pressure(sk);
3197 return 1;
3198 }
3199
3200 /* Under pressure. */
3201 if (allocated > sk_prot_mem_limits(sk, 1))
3202 sk_enter_memory_pressure(sk);
3203
3204 /* Over hard limit. */
3205 if (allocated > sk_prot_mem_limits(sk, 2))
3206 goto suppress_allocation;
3207
3208 /* Guarantee minimum buffer size under pressure (either global
3209 * or memcg) to make sure features described in RFC 7323 (TCP
3210 * Extensions for High Performance) work properly.
3211 *
3212 * This rule does NOT hold once usage exceeds the global or memcg hard
3213 * limit, or else a DoS attack could take place by spawning
3214 * lots of sockets whose usage is under the minimum buffer size.
3215 */
3216 if (kind == SK_MEM_RECV) {
3217 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3218 return 1;
3219
3220 } else { /* SK_MEM_SEND */
3221 int wmem0 = sk_get_wmem0(sk, prot);
3222
3223 if (sk->sk_type == SOCK_STREAM) {
3224 if (sk->sk_wmem_queued < wmem0)
3225 return 1;
3226 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3227 return 1;
3228 }
3229 }
3230
3231 if (sk_has_memory_pressure(sk)) {
3232 u64 alloc;
3233
3234 /* The following 'average' heuristic is within the
3235 * scope of global accounting, so it only makes
3236 * sense for global memory pressure.
3237 */
3238 if (!sk_under_global_memory_pressure(sk))
3239 return 1;
3240
3241 /* Try to be fair among all the sockets under global
3242 * pressure by allowing the ones that are below average
3243 * usage to raise.
3244 */
3245 alloc = sk_sockets_allocated_read_positive(sk);
3246 if (sk_prot_mem_limits(sk, 2) > alloc *
3247 sk_mem_pages(sk->sk_wmem_queued +
3248 atomic_read(&sk->sk_rmem_alloc) +
3249 sk->sk_forward_alloc))
3250 return 1;
3251 }
3252
3253 suppress_allocation:
3254
3255 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3256 sk_stream_moderate_sndbuf(sk);
3257
3258 /* Fail only if socket is _under_ its sndbuf.
3259 * In this case we cannot block, so that we have to fail.
3260 */
3261 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3262 /* Force charge with __GFP_NOFAIL */
3263 if (memcg && !charged) {
3264 mem_cgroup_charge_skmem(memcg, amt,
3265 gfp_memcg_charge() | __GFP_NOFAIL);
3266 }
3267 return 1;
3268 }
3269 }
3270
3271 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3272 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3273
3274 sk_memory_allocated_sub(sk, amt);
3275
3276 if (memcg && charged)
3277 mem_cgroup_uncharge_skmem(memcg, amt);
3278
3279 return 0;
3280 }
3281
3282 /**
3283 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3284 * @sk: socket
3285 * @size: memory size to allocate
3286 * @kind: allocation type
3287 *
3288 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3289 * rmem allocation. This function assumes that protocols which have
3290 * memory_pressure use sk_wmem_queued as write buffer accounting.
3291 */
3292 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3293 {
3294 int ret, amt = sk_mem_pages(size);
3295
3296 sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3297 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3298 if (!ret)
3299 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3300 return ret;
3301 }
3302 EXPORT_SYMBOL(__sk_mem_schedule);
3303
3304 /**
3305 * __sk_mem_reduce_allocated - reclaim memory_allocated
3306 * @sk: socket
3307 * @amount: number of quanta
3308 *
3309 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3310 */
3311 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3312 {
3313 sk_memory_allocated_sub(sk, amount);
3314
3315 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3316 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3317
3318 if (sk_under_global_memory_pressure(sk) &&
3319 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3320 sk_leave_memory_pressure(sk);
3321 }
3322
3323 /**
3324 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3325 * @sk: socket
3326 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3327 */
3328 void __sk_mem_reclaim(struct sock *sk, int amount)
3329 {
3330 amount >>= PAGE_SHIFT;
3331 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3332 __sk_mem_reduce_allocated(sk, amount);
3333 }
3334 EXPORT_SYMBOL(__sk_mem_reclaim);
3335
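/* Generic helper behind SO_PEEK_OFF; protocols that support peeking at
 * an offset wire it up as their ->set_peek_off().
 */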
3336 int sk_set_peek_off(struct sock *sk, int val)
3337 {
3338 WRITE_ONCE(sk->sk_peek_off, val);
3339 return 0;
3340 }
3341 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3342
3343 /*
3344 * Set of default routines for initialising struct proto_ops when
3345 * the protocol does not support a particular function. In certain
3346 * cases where it makes no sense for a protocol to have a "do nothing"
3347 * function, some default processing is provided.
3348 */
3349
3350 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3351 {
3352 return -EOPNOTSUPP;
3353 }
3354 EXPORT_SYMBOL(sock_no_bind);
3355
3356 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3357 int len, int flags)
3358 {
3359 return -EOPNOTSUPP;
3360 }
3361 EXPORT_SYMBOL(sock_no_connect);
3362
3363 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3364 {
3365 return -EOPNOTSUPP;
3366 }
3367 EXPORT_SYMBOL(sock_no_socketpair);
3368
3369 int sock_no_accept(struct socket *sock, struct socket *newsock,
3370 struct proto_accept_arg *arg)
3371 {
3372 return -EOPNOTSUPP;
3373 }
3374 EXPORT_SYMBOL(sock_no_accept);
3375
3376 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3377 int peer)
3378 {
3379 return -EOPNOTSUPP;
3380 }
3381 EXPORT_SYMBOL(sock_no_getname);
3382
3383 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3384 {
3385 return -EOPNOTSUPP;
3386 }
3387 EXPORT_SYMBOL(sock_no_ioctl);
3388
3389 int sock_no_listen(struct socket *sock, int backlog)
3390 {
3391 return -EOPNOTSUPP;
3392 }
3393 EXPORT_SYMBOL(sock_no_listen);
3394
3395 int sock_no_shutdown(struct socket *sock, int how)
3396 {
3397 return -EOPNOTSUPP;
3398 }
3399 EXPORT_SYMBOL(sock_no_shutdown);
3400
3401 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3402 {
3403 return -EOPNOTSUPP;
3404 }
3405 EXPORT_SYMBOL(sock_no_sendmsg);
3406
3407 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3408 {
3409 return -EOPNOTSUPP;
3410 }
3411 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3412
3413 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3414 int flags)
3415 {
3416 return -EOPNOTSUPP;
3417 }
3418 EXPORT_SYMBOL(sock_no_recvmsg);
3419
3420 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3421 {
3422 /* Mirror missing mmap method error code */
3423 return -ENODEV;
3424 }
3425 EXPORT_SYMBOL(sock_no_mmap);
3426
3427 /*
3428 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3429 * various sock-based usage counts.
3430 */
3431 void __receive_sock(struct file *file)
3432 {
3433 struct socket *sock;
3434
3435 sock = sock_from_file(file);
3436 if (sock) {
3437 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3438 sock_update_classid(&sock->sk->sk_cgrp_data);
3439 trace_android_vh_receive_sock(sock->sk);
3440 }
3441 }
3442
3443 /*
3444 * Default Socket Callbacks
3445 */
3446
3447 static void sock_def_wakeup(struct sock *sk)
3448 {
3449 struct socket_wq *wq;
3450
3451 rcu_read_lock();
3452 wq = rcu_dereference(sk->sk_wq);
3453 if (skwq_has_sleeper(wq))
3454 wake_up_interruptible_all(&wq->wait);
3455 rcu_read_unlock();
3456 }
3457
3458 static void sock_def_error_report(struct sock *sk)
3459 {
3460 struct socket_wq *wq;
3461
3462 rcu_read_lock();
3463 wq = rcu_dereference(sk->sk_wq);
3464 if (skwq_has_sleeper(wq))
3465 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3466 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3467 rcu_read_unlock();
3468 }
3469
3470 void sock_def_readable(struct sock *sk)
3471 {
3472 struct socket_wq *wq;
3473
3474 trace_sk_data_ready(sk);
3475
3476 rcu_read_lock();
3477 wq = rcu_dereference(sk->sk_wq);
3478
3479 if (skwq_has_sleeper(wq)) {
3480 int done = 0;
3481
3482 trace_android_vh_do_wake_up_sync(&wq->wait, &done, sk);
3483 if (done)
3484 goto out;
3485
3486 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3487 EPOLLRDNORM | EPOLLRDBAND);
3488 }
3489
3490 out:
3491 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3492 rcu_read_unlock();
3493 }
3494
3495 static void sock_def_write_space(struct sock *sk)
3496 {
3497 struct socket_wq *wq;
3498
3499 rcu_read_lock();
3500
3501 /* Do not wake up a writer until he can make "significant"
3502 * progress. --DaveM
3503 */
3504 if (sock_writeable(sk)) {
3505 wq = rcu_dereference(sk->sk_wq);
3506 if (skwq_has_sleeper(wq))
3507 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3508 EPOLLWRNORM | EPOLLWRBAND);
3509
3510 /* Should agree with poll, otherwise some programs break */
3511 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3512 }
3513
3514 rcu_read_unlock();
3515 }
3516
3517 /* An optimised version of sock_def_write_space(), should only be called
3518 * for SOCK_RCU_FREE sockets under RCU read section and after putting
3519 * ->sk_wmem_alloc.
3520 */
3521 static void sock_def_write_space_wfree(struct sock *sk)
3522 {
3523 /* Do not wake up a writer until he can make "significant"
3524 * progress. --DaveM
3525 */
3526 if (sock_writeable(sk)) {
3527 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3528
3529 /* rely on refcount_sub from sock_wfree() */
3530 smp_mb__after_atomic();
3531 if (wq && waitqueue_active(&wq->wait))
3532 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3533 EPOLLWRNORM | EPOLLWRBAND);
3534
3535 /* Should agree with poll, otherwise some programs break */
3536 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3537 }
3538 }
3539
3540 static void sock_def_destruct(struct sock *sk)
3541 {
3542 }
3543
3544 void sk_send_sigurg(struct sock *sk)
3545 {
3546 if (sk->sk_socket && sk->sk_socket->file)
3547 if (send_sigurg(sk->sk_socket->file))
3548 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3549 }
3550 EXPORT_SYMBOL(sk_send_sigurg);
3551
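/* Socket timer helpers: a pending timer holds a reference on the socket.
 * sk_reset_timer() takes that reference when it arms an inactive timer,
 * and the sk_stop_timer() variants drop it when they cancel a pending one.
 */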
3552 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3553 unsigned long expires)
3554 {
3555 if (!mod_timer(timer, expires))
3556 sock_hold(sk);
3557 }
3558 EXPORT_SYMBOL(sk_reset_timer);
3559
3560 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3561 {
3562 if (del_timer(timer))
3563 __sock_put(sk);
3564 }
3565 EXPORT_SYMBOL(sk_stop_timer);
3566
3567 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3568 {
3569 if (del_timer_sync(timer))
3570 __sock_put(sk);
3571 }
3572 EXPORT_SYMBOL(sk_stop_timer_sync);
3573
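/* Initialise the generic fields of a struct sock, attach it to @sock if
 * one is given, and record @uid as the socket's owning uid.
 */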
3574 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3575 {
3576 sk_init_common(sk);
3577 sk->sk_send_head = NULL;
3578
3579 timer_setup(&sk->sk_timer, NULL, 0);
3580
3581 sk->sk_allocation = GFP_KERNEL;
3582 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3583 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3584 sk->sk_state = TCP_CLOSE;
3585 sk->sk_use_task_frag = true;
3586 sk_set_socket(sk, sock);
3587
3588 sock_set_flag(sk, SOCK_ZAPPED);
3589
3590 if (sock) {
3591 sk->sk_type = sock->type;
3592 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3593 sock->sk = sk;
3594 } else {
3595 RCU_INIT_POINTER(sk->sk_wq, NULL);
3596 }
3597 sk->sk_uid = uid;
3598
3599 sk->sk_state_change = sock_def_wakeup;
3600 sk->sk_data_ready = sock_def_readable;
3601 sk->sk_write_space = sock_def_write_space;
3602 sk->sk_error_report = sock_def_error_report;
3603 sk->sk_destruct = sock_def_destruct;
3604
3605 sk->sk_frag.page = NULL;
3606 sk->sk_frag.offset = 0;
3607 sk->sk_peek_off = -1;
3608
3609 sk->sk_peer_pid = NULL;
3610 sk->sk_peer_cred = NULL;
3611 spin_lock_init(&sk->sk_peer_lock);
3612
3613 sk->sk_write_pending = 0;
3614 sk->sk_rcvlowat = 1;
3615 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3616 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3617
3618 sk->sk_stamp = SK_DEFAULT_STAMP;
3619 #if BITS_PER_LONG==32
3620 seqlock_init(&sk->sk_stamp_seq);
3621 #endif
3622 atomic_set(&sk->sk_zckey, 0);
3623
3624 #ifdef CONFIG_NET_RX_BUSY_POLL
3625 sk->sk_napi_id = 0;
3626 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3627 #endif
3628
3629 sk->sk_max_pacing_rate = ~0UL;
3630 sk->sk_pacing_rate = ~0UL;
3631 WRITE_ONCE(sk->sk_pacing_shift, 10);
3632 sk->sk_incoming_cpu = -1;
3633
3634 sk_rx_queue_clear(sk);
3635 /*
3636 * Before updating sk_refcnt, we must commit prior changes to memory
3637 * (Documentation/RCU/rculist_nulls.rst for details)
3638 */
3639 smp_wmb();
3640 refcount_set(&sk->sk_refcnt, 1);
3641 atomic_set(&sk->sk_drops, 0);
3642 }
3643 EXPORT_SYMBOL(sock_init_data_uid);
3644
3645 void sock_init_data(struct socket *sock, struct sock *sk)
3646 {
3647 kuid_t uid = sock ?
3648 SOCK_INODE(sock)->i_uid :
3649 make_kuid(sock_net(sk)->user_ns, 0);
3650
3651 sock_init_data_uid(sock, sk, uid);
3652 }
3653 EXPORT_SYMBOL(sock_init_data);
3654
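/* The socket lock is a spinlock (sk_lock.slock) combined with an "owned"
 * flag that gives process context mutex-like semantics: while owned,
 * softirq input is diverted to the backlog and replayed by release_sock().
 */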
3655 void lock_sock_nested(struct sock *sk, int subclass)
3656 {
3657 /* The sk_lock has mutex_lock() semantics here. */
3658 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3659
3660 might_sleep();
3661 spin_lock_bh(&sk->sk_lock.slock);
3662 if (sock_owned_by_user_nocheck(sk))
3663 __lock_sock(sk);
3664 sk->sk_lock.owned = 1;
3665 spin_unlock_bh(&sk->sk_lock.slock);
3666 }
3667 EXPORT_SYMBOL(lock_sock_nested);
3668
3669 void release_sock(struct sock *sk)
3670 {
3671 spin_lock_bh(&sk->sk_lock.slock);
3672 if (sk->sk_backlog.tail)
3673 __release_sock(sk);
3674
3675 if (sk->sk_prot->release_cb)
3676 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3677 tcp_release_cb, sk);
3678
3679 sock_release_ownership(sk);
3680 if (waitqueue_active(&sk->sk_lock.wq))
3681 wake_up(&sk->sk_lock.wq);
3682 spin_unlock_bh(&sk->sk_lock.slock);
3683 }
3684 EXPORT_SYMBOL(release_sock);
3685
3686 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3687 {
3688 might_sleep();
3689 spin_lock_bh(&sk->sk_lock.slock);
3690
3691 if (!sock_owned_by_user_nocheck(sk)) {
3692 /*
3693 * Fast path return with bottom halves disabled and
3694 * sock::sk_lock.slock held.
3695 *
3696 * The 'mutex' is not contended and holding
3697 * sock::sk_lock.slock prevents all other lockers from
3698 * proceeding, so the corresponding unlock_sock_fast() can
3699 * avoid the slow path of release_sock() completely and
3700 * just release slock.
3701 *
3702 * From a semantical POV this is equivalent to 'acquiring'
3703 * the 'mutex', hence the corresponding lockdep
3704 * mutex_release() has to happen in the fast path of
3705 * unlock_sock_fast().
3706 */
3707 return false;
3708 }
3709
3710 __lock_sock(sk);
3711 sk->sk_lock.owned = 1;
3712 __acquire(&sk->sk_lock.slock);
3713 spin_unlock_bh(&sk->sk_lock.slock);
3714 return true;
3715 }
3716 EXPORT_SYMBOL(__lock_sock_fast);
3717
3718 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3719 bool timeval, bool time32)
3720 {
3721 struct sock *sk = sock->sk;
3722 struct timespec64 ts;
3723
3724 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3725 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3726 if (ts.tv_sec == -1)
3727 return -ENOENT;
3728 if (ts.tv_sec == 0) {
3729 ktime_t kt = ktime_get_real();
3730 sock_write_timestamp(sk, kt);
3731 ts = ktime_to_timespec64(kt);
3732 }
3733
3734 if (timeval)
3735 ts.tv_nsec /= 1000;
3736
3737 #ifdef CONFIG_COMPAT_32BIT_TIME
3738 if (time32)
3739 return put_old_timespec32(&ts, userstamp);
3740 #endif
3741 #ifdef CONFIG_SPARC64
3742 /* beware of padding in sparc64 timeval */
3743 if (timeval && !in_compat_syscall()) {
3744 struct __kernel_old_timeval __user tv = {
3745 .tv_sec = ts.tv_sec,
3746 .tv_usec = ts.tv_nsec,
3747 };
3748 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3749 return -EFAULT;
3750 return 0;
3751 }
3752 #endif
3753 return put_timespec64(&ts, userstamp);
3754 }
3755 EXPORT_SYMBOL(sock_gettstamp);
3756
3757 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3758 {
3759 if (!sock_flag(sk, flag)) {
3760 unsigned long previous_flags = sk->sk_flags;
3761
3762 sock_set_flag(sk, flag);
3763 /*
3764 * we just set one of the two flags which require net
3765 * time stamping, but time stamping might have been on
3766 * already because of the other one
3767 */
3768 if (sock_needs_netstamp(sk) &&
3769 !(previous_flags & SK_FLAGS_TIMESTAMP))
3770 net_enable_timestamp();
3771 }
3772 }
3773
3774 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3775 int level, int type)
3776 {
3777 struct sock_exterr_skb *serr;
3778 struct sk_buff *skb;
3779 int copied, err;
3780
3781 err = -EAGAIN;
3782 skb = sock_dequeue_err_skb(sk);
3783 if (skb == NULL)
3784 goto out;
3785
3786 copied = skb->len;
3787 if (copied > len) {
3788 msg->msg_flags |= MSG_TRUNC;
3789 copied = len;
3790 }
3791 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3792 if (err)
3793 goto out_free_skb;
3794
3795 sock_recv_timestamp(msg, sk, skb);
3796
3797 serr = SKB_EXT_ERR(skb);
3798 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3799
3800 msg->msg_flags |= MSG_ERRQUEUE;
3801 err = copied;
3802
3803 out_free_skb:
3804 kfree_skb(skb);
3805 out:
3806 return err;
3807 }
3808 EXPORT_SYMBOL(sock_recv_errqueue);
3809
3810 /*
3811 * Get a socket option on a socket.
3812 *
3813 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3814 * asynchronous errors should be reported by getsockopt. We assume
3815 * this means if you specify SO_ERROR (otherwise what is the point of it).
3816 */
3817 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3818 char __user *optval, int __user *optlen)
3819 {
3820 struct sock *sk = sock->sk;
3821
3822 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3823 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3824 }
3825 EXPORT_SYMBOL(sock_common_getsockopt);
3826
3827 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3828 int flags)
3829 {
3830 struct sock *sk = sock->sk;
3831 int addr_len = 0;
3832 int err;
3833
3834 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3835 if (err >= 0)
3836 msg->msg_namelen = addr_len;
3837 return err;
3838 }
3839 EXPORT_SYMBOL(sock_common_recvmsg);
3840
3841 /*
3842 * Set socket options on an inet socket.
3843 */
3844 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3845 sockptr_t optval, unsigned int optlen)
3846 {
3847 struct sock *sk = sock->sk;
3848
3849 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3850 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3851 }
3852 EXPORT_SYMBOL(sock_common_setsockopt);
3853
3854 void sk_common_release(struct sock *sk)
3855 {
3856 if (sk->sk_prot->destroy)
3857 sk->sk_prot->destroy(sk);
3858
3859 /*
3860 * Observation: when sk_common_release is called, processes no longer
3861 * have access to the socket, but the network stack still does.
3862 * Step one, detach it from networking:
3863 *
3864 * A. Remove from hash tables.
3865 */
3866
3867 sk->sk_prot->unhash(sk);
3868
3869 if (sk->sk_socket)
3870 sk->sk_socket->sk = NULL;
3871
3872 /*
3873 * At this point the socket cannot receive new packets, but some may
3874 * still be in flight: another CPU may have done its hash table lookup
3875 * in the receive path before we unhashed the socket. Those packets
3876 * will reach the receive queue and be purged by the socket destructor.
3877 *
3878 * We may also still have packets pending on the receive queue and,
3879 * probably, our own packets waiting in device queues. sock_destroy
3880 * will drain the receive queue, but transmitted packets will delay
3881 * socket destruction until the last reference is released.
3882 */
3883
3884 sock_orphan(sk);
3885
3886 xfrm_sk_free_policy(sk);
3887
3888 sock_put(sk);
3889 }
3890 EXPORT_SYMBOL(sk_common_release);
3891
3892 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3893 {
3894 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3895
3896 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3897 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3898 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3899 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3900 mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3901 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3902 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3903 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3904 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3905 }
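/*
 * Informational note (an assumption about tooling, for orientation only):
 * these SK_MEMINFO_* slots are what sock_diag exports and what "ss -m"
 * renders as, e.g.:
 *
 *	skmem:(r0,rb212992,t0,tb212992,f0,w0,o0,bl0,d0)
 *
 * i.e. rmem_alloc, rcvbuf, wmem_alloc, sndbuf, fwd_alloc, wmem_queued,
 * optmem, backlog and drops, in the order filled in above.
 */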
3906
3907 #ifdef CONFIG_PROC_FS
3908 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3909
3910 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3911 {
3912 int cpu, idx = prot->inuse_idx;
3913 int res = 0;
3914
3915 for_each_possible_cpu(cpu)
3916 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3917
3918 return res >= 0 ? res : 0;
3919 }
3920 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3921
3922 int sock_inuse_get(struct net *net)
3923 {
3924 int cpu, res = 0;
3925
3926 for_each_possible_cpu(cpu)
3927 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3928
3929 return res;
3930 }
3931
3932 EXPORT_SYMBOL_GPL(sock_inuse_get);
3933
3934 static int __net_init sock_inuse_init_net(struct net *net)
3935 {
3936 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3937 if (net->core.prot_inuse == NULL)
3938 return -ENOMEM;
3939 return 0;
3940 }
3941
3942 static void __net_exit sock_inuse_exit_net(struct net *net)
3943 {
3944 free_percpu(net->core.prot_inuse);
3945 }
3946
3947 static struct pernet_operations net_inuse_ops = {
3948 .init = sock_inuse_init_net,
3949 .exit = sock_inuse_exit_net,
3950 };
3951
3952 static __init int net_inuse_init(void)
3953 {
3954 if (register_pernet_subsys(&net_inuse_ops))
3955 panic("Cannot initialize net inuse counters");
3956
3957 return 0;
3958 }
3959
3960 core_initcall(net_inuse_init);
3961
3962 static int assign_proto_idx(struct proto *prot)
3963 {
3964 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3965
3966 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
3967 pr_err("PROTO_INUSE_NR exhausted\n");
3968 return -ENOSPC;
3969 }
3970
3971 set_bit(prot->inuse_idx, proto_inuse_idx);
3972 return 0;
3973 }
3974
3975 static void release_proto_idx(struct proto *prot)
3976 {
3977 if (prot->inuse_idx != PROTO_INUSE_NR)
3978 clear_bit(prot->inuse_idx, proto_inuse_idx);
3979 }
3980 #else
3981 static inline int assign_proto_idx(struct proto *prot)
3982 {
3983 return 0;
3984 }
3985
3986 static inline void release_proto_idx(struct proto *prot)
3987 {
3988 }
3989
3990 #endif
3991
3992 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3993 {
3994 if (!twsk_prot)
3995 return;
3996 kfree(twsk_prot->twsk_slab_name);
3997 twsk_prot->twsk_slab_name = NULL;
3998 kmem_cache_destroy(twsk_prot->twsk_slab);
3999 twsk_prot->twsk_slab = NULL;
4000 }
4001
4002 static int tw_prot_init(const struct proto *prot)
4003 {
4004 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
4005
4006 if (!twsk_prot)
4007 return 0;
4008
4009 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
4010 prot->name);
4011 if (!twsk_prot->twsk_slab_name)
4012 return -ENOMEM;
4013
4014 twsk_prot->twsk_slab =
4015 kmem_cache_create(twsk_prot->twsk_slab_name,
4016 twsk_prot->twsk_obj_size, 0,
4017 SLAB_ACCOUNT | prot->slab_flags,
4018 NULL);
4019 if (!twsk_prot->twsk_slab) {
4020 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4021 prot->name);
4022 return -ENOMEM;
4023 }
4024
4025 return 0;
4026 }
4027
4028 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4029 {
4030 if (!rsk_prot)
4031 return;
4032 kfree(rsk_prot->slab_name);
4033 rsk_prot->slab_name = NULL;
4034 kmem_cache_destroy(rsk_prot->slab);
4035 rsk_prot->slab = NULL;
4036 }
4037
4038 static int req_prot_init(const struct proto *prot)
4039 {
4040 struct request_sock_ops *rsk_prot = prot->rsk_prot;
4041
4042 if (!rsk_prot)
4043 return 0;
4044
4045 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4046 prot->name);
4047 if (!rsk_prot->slab_name)
4048 return -ENOMEM;
4049
4050 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4051 rsk_prot->obj_size, 0,
4052 SLAB_ACCOUNT | prot->slab_flags,
4053 NULL);
4054
4055 if (!rsk_prot->slab) {
4056 pr_crit("%s: Can't create request sock SLAB cache!\n",
4057 prot->name);
4058 return -ENOMEM;
4059 }
4060 return 0;
4061 }
4062
4063 int proto_register(struct proto *prot, int alloc_slab)
4064 {
4065 int ret = -ENOBUFS;
4066
4067 if (prot->memory_allocated && !prot->sysctl_mem) {
4068 pr_err("%s: missing sysctl_mem\n", prot->name);
4069 return -EINVAL;
4070 }
4071 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4072 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4073 return -EINVAL;
4074 }
4075 if (alloc_slab) {
4076 prot->slab = kmem_cache_create_usercopy(prot->name,
4077 prot->obj_size, 0,
4078 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4079 prot->slab_flags,
4080 prot->useroffset, prot->usersize,
4081 NULL);
4082
4083 if (prot->slab == NULL) {
4084 pr_crit("%s: Can't create sock SLAB cache!\n",
4085 prot->name);
4086 goto out;
4087 }
4088
4089 if (req_prot_init(prot))
4090 goto out_free_request_sock_slab;
4091
4092 if (tw_prot_init(prot))
4093 goto out_free_timewait_sock_slab;
4094 }
4095
4096 mutex_lock(&proto_list_mutex);
4097 ret = assign_proto_idx(prot);
4098 if (ret) {
4099 mutex_unlock(&proto_list_mutex);
4100 goto out_free_timewait_sock_slab;
4101 }
4102 list_add(&prot->node, &proto_list);
4103 mutex_unlock(&proto_list_mutex);
4104 return ret;
4105
4106 out_free_timewait_sock_slab:
4107 if (alloc_slab)
4108 tw_prot_cleanup(prot->twsk_prot);
4109 out_free_request_sock_slab:
4110 if (alloc_slab) {
4111 req_prot_cleanup(prot->rsk_prot);
4112
4113 kmem_cache_destroy(prot->slab);
4114 prot->slab = NULL;
4115 }
4116 out:
4117 return ret;
4118 }
4119 EXPORT_SYMBOL(proto_register);
4120
4121 void proto_unregister(struct proto *prot)
4122 {
4123 mutex_lock(&proto_list_mutex);
4124 release_proto_idx(prot);
4125 list_del(&prot->node);
4126 mutex_unlock(&proto_list_mutex);
4127
4128 kmem_cache_destroy(prot->slab);
4129 prot->slab = NULL;
4130
4131 req_prot_cleanup(prot->rsk_prot);
4132 tw_prot_cleanup(prot->twsk_prot);
4133 }
4134 EXPORT_SYMBOL(proto_unregister);
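/*
 * Illustrative sketch (a hypothetical "foo" protocol, not a real in-tree
 * user): a protocol module typically pairs proto_register() and
 * proto_unregister() in its module init/exit, with struct foo_sock
 * embedding struct sock as its first member:
 *
 *	static struct proto foo_proto = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_proto, 1);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_proto);
 *	}
 */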
4135
4136 int sock_load_diag_module(int family, int protocol)
4137 {
4138 if (!protocol) {
4139 if (!sock_is_registered(family))
4140 return -ENOENT;
4141
4142 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4143 NETLINK_SOCK_DIAG, family);
4144 }
4145
4146 #ifdef CONFIG_INET
4147 if (family == AF_INET &&
4148 protocol != IPPROTO_RAW &&
4149 protocol < MAX_INET_PROTOS &&
4150 !rcu_access_pointer(inet_protos[protocol]))
4151 return -ENOENT;
4152 #endif
4153
4154 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4155 NETLINK_SOCK_DIAG, family, protocol);
4156 }
4157 EXPORT_SYMBOL(sock_load_diag_module);
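/*
 * Example derived from the format strings above: for family == AF_INET (2)
 * and protocol == IPPROTO_TCP (6), this requests the module alias
 * "net-pf-16-proto-4-type-2-6" (PF_NETLINK == 16, NETLINK_SOCK_DIAG == 4),
 * which the matching *_diag module is expected to declare via
 * MODULE_ALIAS_NET_PF_PROTO_TYPE_PROTO().
 */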
4158
4159 #ifdef CONFIG_PROC_FS
4160 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4161 __acquires(proto_list_mutex)
4162 {
4163 mutex_lock(&proto_list_mutex);
4164 return seq_list_start_head(&proto_list, *pos);
4165 }
4166
4167 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4168 {
4169 return seq_list_next(v, &proto_list, pos);
4170 }
4171
4172 static void proto_seq_stop(struct seq_file *seq, void *v)
4173 __releases(proto_list_mutex)
4174 {
4175 mutex_unlock(&proto_list_mutex);
4176 }
4177
4178 static char proto_method_implemented(const void *method)
4179 {
4180 return method == NULL ? 'n' : 'y';
4181 }
4182 static long sock_prot_memory_allocated(struct proto *proto)
4183 {
4184 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4185 }
4186
4187 static const char *sock_prot_memory_pressure(struct proto *proto)
4188 {
4189 return proto->memory_pressure != NULL ?
4190 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4191 }
4192
4193 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4194 {
4195
4196 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4197 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4198 proto->name,
4199 proto->obj_size,
4200 sock_prot_inuse_get(seq_file_net(seq), proto),
4201 sock_prot_memory_allocated(proto),
4202 sock_prot_memory_pressure(proto),
4203 proto->max_header,
4204 proto->slab == NULL ? "no" : "yes",
4205 module_name(proto->owner),
4206 proto_method_implemented(proto->close),
4207 proto_method_implemented(proto->connect),
4208 proto_method_implemented(proto->disconnect),
4209 proto_method_implemented(proto->accept),
4210 proto_method_implemented(proto->ioctl),
4211 proto_method_implemented(proto->init),
4212 proto_method_implemented(proto->destroy),
4213 proto_method_implemented(proto->shutdown),
4214 proto_method_implemented(proto->setsockopt),
4215 proto_method_implemented(proto->getsockopt),
4216 proto_method_implemented(proto->sendmsg),
4217 proto_method_implemented(proto->recvmsg),
4218 proto_method_implemented(proto->bind),
4219 proto_method_implemented(proto->backlog_rcv),
4220 proto_method_implemented(proto->hash),
4221 proto_method_implemented(proto->unhash),
4222 proto_method_implemented(proto->get_port),
4223 proto_method_implemented(proto->enter_memory_pressure));
4224 }
4225
4226 static int proto_seq_show(struct seq_file *seq, void *v)
4227 {
4228 if (v == &proto_list)
4229 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4230 "protocol",
4231 "size",
4232 "sockets",
4233 "memory",
4234 "press",
4235 "maxhdr",
4236 "slab",
4237 "module",
4238 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4239 else
4240 proto_seq_printf(seq, list_entry(v, struct proto, node));
4241 return 0;
4242 }
4243
4244 static const struct seq_operations proto_seq_ops = {
4245 .start = proto_seq_start,
4246 .next = proto_seq_next,
4247 .stop = proto_seq_stop,
4248 .show = proto_seq_show,
4249 };
4250
4251 static __net_init int proto_init_net(struct net *net)
4252 {
4253 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4254 sizeof(struct seq_net_private)))
4255 return -ENOMEM;
4256
4257 return 0;
4258 }
4259
4260 static __net_exit void proto_exit_net(struct net *net)
4261 {
4262 remove_proc_entry("protocols", net->proc_net);
4263 }
4264
4265
4266 static __net_initdata struct pernet_operations proto_net_ops = {
4267 .init = proto_init_net,
4268 .exit = proto_exit_net,
4269 };
4270
4271 static int __init proto_init(void)
4272 {
4273 return register_pernet_subsys(&proto_net_ops);
4274 }
4275
4276 subsys_initcall(proto_init);
4277
4278 #endif /* PROC_FS */
4279
4280 #ifdef CONFIG_NET_RX_BUSY_POLL
4281 bool sk_busy_loop_end(void *p, unsigned long start_time)
4282 {
4283 struct sock *sk = p;
4284
4285 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4286 return true;
4287
4288 if (sk_is_udp(sk) &&
4289 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4290 return true;
4291
4292 return sk_busy_loop_timeout(sk, start_time);
4293 }
4294 EXPORT_SYMBOL(sk_busy_loop_end);
4295 #endif /* CONFIG_NET_RX_BUSY_POLL */
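/*
 * Illustrative sketch (userspace): busy polling is typically opted into per
 * socket, after which blocking receives spin in the driver poll loop for up
 * to the configured number of microseconds before sleeping:
 *
 *	int usecs = 50;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
 */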
4296
4297 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4298 {
4299 if (!sk->sk_prot->bind_add)
4300 return -EOPNOTSUPP;
4301 return sk->sk_prot->bind_add(sk, addr, addr_len);
4302 }
4303 EXPORT_SYMBOL(sock_bind_add);
4304
4305 /* Copy 'size' bytes from userspace and copy the result back to userspace. */
4306 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4307 void __user *arg, void *karg, size_t size)
4308 {
4309 int ret;
4310
4311 if (copy_from_user(karg, arg, size))
4312 return -EFAULT;
4313
4314 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4315 if (ret)
4316 return ret;
4317
4318 if (copy_to_user(arg, karg, size))
4319 return -EFAULT;
4320
4321 return 0;
4322 }
4323 EXPORT_SYMBOL(sock_ioctl_inout);
4324
4325 /* This is the most common ioctl prep function, where the result (4 bytes) is
4326 * copied back to userspace if the ioctl() returns successfully. No input is
4327 * copied from userspace.
4328 */
4329 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4330 {
4331 int ret, karg = 0;
4332
4333 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4334 if (ret)
4335 return ret;
4336
4337 return put_user(karg, (int __user *)arg);
4338 }
4339
4340 /* A wrapper around sock ioctls, which copies the data from userspace
4341 * (depending on the protocol/ioctl), and copies back the result to userspace.
4342 * The main motivation for this function is to pass kernel memory to the
4343 * protocol ioctl callbacks, instead of userspace memory.
4344 */
4345 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4346 {
4347 int rc = 1;
4348
4349 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4350 rc = ipmr_sk_ioctl(sk, cmd, arg);
4351 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4352 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4353 else if (sk_is_phonet(sk))
4354 rc = phonet_sk_ioctl(sk, cmd, arg);
4355
4356 /* If the ioctl was processed, return its result */
4357 if (rc <= 0)
4358 return rc;
4359
4360 /* Otherwise call the default handler */
4361 return sock_ioctl_out(sk, cmd, arg);
4362 }
4363 EXPORT_SYMBOL(sk_ioctl);
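/*
 * Illustrative sketch (a hypothetical protocol callback, not from this
 * file): with this wrapper in place, a protocol's ->ioctl() handler works
 * on kernel memory and never touches the __user pointer itself, e.g.:
 *
 *	static int foo_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		switch (cmd) {
 *		case SIOCINQ:
 *			*karg = foo_readable_bytes(sk);
 *			return 0;
 *		default:
 *			return -ENOIOCTLCMD;
 *		}
 *	}
 *
 * sk_ioctl() then copies *karg back to userspace via sock_ioctl_out();
 * foo_readable_bytes() is a made-up helper standing in for whatever the
 * protocol uses to compute the result.
 */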
4364
4365 static int __init sock_struct_check(void)
4366 {
4367 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4368 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4369 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4370 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4371 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4372
4373 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4374 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4375 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4376 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4377 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4378 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4379 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4380 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4381 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4382
4383 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4384 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4385 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4386
4387 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4388 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4389 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4390 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4391
4392 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4393 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4394 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4395 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4396 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4397 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4398 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4399 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4400 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4401 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4402 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4403 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4404 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4405 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4406 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4407 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4408
4409 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4410 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4411 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4412 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4413 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4414 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4415 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4416 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4417 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4418 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4419 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4420 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4421 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4422 return 0;
4423 }
4424
4425 core_initcall(sock_struct_check);
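/*
 * Informational sketch (an assumption about the struct definition, shown
 * for context): the groups asserted above correspond to marker members
 * placed in struct sock in include/net/sock.h, roughly of the form:
 *
 *	__cacheline_group_begin(sock_write_tx);
 *	atomic_t	sk_omem_alloc;
 *	int		sk_sndbuf;
 *	...
 *	__cacheline_group_end(sock_write_tx);
 *
 * so the build fails if a field is moved out of its intended cache-line
 * group.
 */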
4426