1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121
122 #include <linux/uaccess.h>
123
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 #include <net/bpf_sk_storage.h>
140
141 #include <trace/events/sock.h>
142 #include <trace/hooks/sched.h>
143 #include <trace/hooks/net.h>
144
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148
149 #include <linux/ethtool.h>
150
151 #include "dev.h"
152
/* Protocol list head and the mutex named after it; users are further down. */
static LIST_HEAD(proto_list);
static DEFINE_MUTEX(proto_list_mutex);

/* Forward declarations; the definitions are not in this part of the file. */
static void sock_def_write_space(struct sock *sk);
static void sock_def_write_space_wfree(struct sock *sk);
158
159 /**
160 * sk_ns_capable - General socket capability test
161 * @sk: Socket to use a capability on or through
162 * @user_ns: The user namespace of the capability to use
163 * @cap: The capability to use
164 *
165 * Test to see if the opener of the socket had when the socket was
166 * created and the current process has the capability @cap in the user
167 * namespace @user_ns.
168 */
sk_ns_capable(const struct sock * sk,struct user_namespace * user_ns,int cap)169 bool sk_ns_capable(const struct sock *sk,
170 struct user_namespace *user_ns, int cap)
171 {
172 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 ns_capable(user_ns, cap);
174 }
175 EXPORT_SYMBOL(sk_ns_capable);
176
177 /**
178 * sk_capable - Socket global capability test
179 * @sk: Socket to use a capability on or through
180 * @cap: The global capability to use
181 *
182 * Test to see if the opener of the socket had when the socket was
183 * created and the current process has the capability @cap in all user
184 * namespaces.
185 */
sk_capable(const struct sock * sk,int cap)186 bool sk_capable(const struct sock *sk, int cap)
187 {
188 return sk_ns_capable(sk, &init_user_ns, cap);
189 }
190 EXPORT_SYMBOL(sk_capable);
191
192 /**
193 * sk_net_capable - Network namespace socket capability test
194 * @sk: Socket to use a capability on or through
195 * @cap: The capability to use
196 *
197 * Test to see if the opener of the socket had when the socket was created
198 * and the current process has the capability @cap over the network namespace
199 * the socket is a member of.
200 */
sk_net_capable(const struct sock * sk,int cap)201 bool sk_net_capable(const struct sock *sk, int cap)
202 {
203 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204 }
205 EXPORT_SYMBOL(sk_net_capable);
206
207 /*
208 * Each address family might have different locking rules, so we have
209 * one slock key per address family and separate keys for internal and
210 * userspace sockets.
211 */
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216
/*
 * Readable names for the lock validator.  Each name is built at compile
 * time by string-literal concatenation of a prefix with a family name,
 * so no formatting is needed when socket locks are initialised at run
 * time.
 *
 * _sock_locks(pfx) expands to a comma-separated list of 47 literals:
 * one per slot (presumably indexed by address family number, with "27"
 * and "28" as placeholders) plus a final "AF_MAX" entry.
 */
#define _sock_locks(pfx) \
	pfx "AF_UNSPEC", pfx "AF_UNIX", pfx "AF_INET", \
	pfx "AF_AX25", pfx "AF_IPX", pfx "AF_APPLETALK", \
	pfx "AF_NETROM", pfx "AF_BRIDGE", pfx "AF_ATMPVC", \
	pfx "AF_X25", pfx "AF_INET6", pfx "AF_ROSE", \
	pfx "AF_DECnet", pfx "AF_NETBEUI", pfx "AF_SECURITY", \
	pfx "AF_KEY", pfx "AF_NETLINK", pfx "AF_PACKET", \
	pfx "AF_ASH", pfx "AF_ECONET", pfx "AF_ATMSVC", \
	pfx "AF_RDS", pfx "AF_SNA", pfx "AF_IRDA", \
	pfx "AF_PPPOX", pfx "AF_WANPIPE", pfx "AF_LLC", \
	pfx "27", pfx "28", pfx "AF_CAN", \
	pfx "AF_TIPC", pfx "AF_BLUETOOTH", pfx "IUCV", \
	pfx "AF_RXRPC", pfx "AF_ISDN", pfx "AF_PHONET", \
	pfx "AF_IEEE802154", pfx "AF_CAIF", pfx "AF_ALG", \
	pfx "AF_NFC", pfx "AF_VSOCK", pfx "AF_KCM", \
	pfx "AF_QIPCRTR", pfx "AF_SMC", pfx "AF_XDP", \
	pfx "AF_MCTP", \
	pfx "AF_MAX"
241
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 _sock_locks("sk_lock-")
244 };
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 _sock_locks("slock-")
247 };
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 _sock_locks("clock-")
250 };
251
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 _sock_locks("k-sk_lock-")
254 };
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 _sock_locks("k-slock-")
257 };
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 _sock_locks("k-clock-")
260 };
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 _sock_locks("rlock-")
263 };
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 _sock_locks("wlock-")
266 };
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 _sock_locks("elock-")
269 };
270
271 /*
272 * sk_callback_lock and sk queues locking rules are per-address-family,
273 * so split the lock classes by using a per-AF key:
274 */
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
280
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288 int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE;
289
290 /* Maximal space eaten by iovec or ancillary data plus some space */
291 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
292 EXPORT_SYMBOL(sysctl_optmem_max);
293
294 int sysctl_tstamp_allow_data __read_mostly = 1;
295
296 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
297 EXPORT_SYMBOL_GPL(memalloc_socks_key);
298
299 /**
300 * sk_set_memalloc - sets %SOCK_MEMALLOC
301 * @sk: socket to set it on
302 *
303 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
304 * It's the responsibility of the admin to adjust min_free_kbytes
305 * to meet the requirements
306 */
sk_set_memalloc(struct sock * sk)307 void sk_set_memalloc(struct sock *sk)
308 {
309 sock_set_flag(sk, SOCK_MEMALLOC);
310 sk->sk_allocation |= __GFP_MEMALLOC;
311 static_branch_inc(&memalloc_socks_key);
312 }
313 EXPORT_SYMBOL_GPL(sk_set_memalloc);
314
sk_clear_memalloc(struct sock * sk)315 void sk_clear_memalloc(struct sock *sk)
316 {
317 sock_reset_flag(sk, SOCK_MEMALLOC);
318 sk->sk_allocation &= ~__GFP_MEMALLOC;
319 static_branch_dec(&memalloc_socks_key);
320
321 /*
322 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
323 * progress of swapping. SOCK_MEMALLOC may be cleared while
324 * it has rmem allocations due to the last swapfile being deactivated
325 * but there is a risk that the socket is unusable due to exceeding
326 * the rmem limits. Reclaim the reserves and obey rmem limits again.
327 */
328 sk_mem_reclaim(sk);
329 }
330 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
331
__sk_backlog_rcv(struct sock * sk,struct sk_buff * skb)332 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
333 {
334 int ret;
335 unsigned int noreclaim_flag;
336
337 /* these should have been dropped before queueing */
338 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
339
340 noreclaim_flag = memalloc_noreclaim_save();
341 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
342 tcp_v6_do_rcv,
343 tcp_v4_do_rcv,
344 sk, skb);
345 memalloc_noreclaim_restore(noreclaim_flag);
346
347 return ret;
348 }
349 EXPORT_SYMBOL(__sk_backlog_rcv);
350
sk_error_report(struct sock * sk)351 void sk_error_report(struct sock *sk)
352 {
353 sk->sk_error_report(sk);
354
355 switch (sk->sk_family) {
356 case AF_INET:
357 fallthrough;
358 case AF_INET6:
359 trace_inet_sk_error_report(sk);
360 break;
361 default:
362 break;
363 }
364 }
365 EXPORT_SYMBOL(sk_error_report);
366
/**
 * sock_get_timeout - convert a timeout in jiffies to a timeval layout
 * @timeo: timeout in jiffies; MAX_SCHEDULE_TIMEOUT is reported as 0/0
 * @optval: destination buffer, written directly (not via copy_to_user)
 * @old_timeval: true to emit one of the "old" timeval layouts
 *
 * The layout written to @optval is struct old_timeval32 for an old
 * timeval in a compat syscall without 64-bit time, struct
 * __kernel_old_timeval for any other old timeval, and struct
 * __kernel_sock_timeval otherwise.
 *
 * Return: size in bytes of the structure that was written.
 */
int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo != MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	} else {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	}

	if (!old_timeval) {
		*(struct __kernel_sock_timeval *)optval = tv;
		return sizeof(tv);
	}

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 compat_tv = { tv.tv_sec, tv.tv_usec };

		*(struct old_timeval32 *)optval = compat_tv;
		return sizeof(compat_tv);
	} else {
		struct __kernel_old_timeval native_tv;

		native_tv.tv_sec = tv.tv_sec;
		native_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = native_tv;
		return sizeof(native_tv);
	}
}
EXPORT_SYMBOL(sock_get_timeout);
397
/**
 * sock_copy_user_timeval - fetch a timeval option value into kernel form
 * @tv: result, always in struct __kernel_sock_timeval layout
 * @optval: option value supplied by the caller
 * @optlen: length of @optval
 * @old_timeval: true if @optval uses one of the "old" timeval layouts
 *
 * The source layout is selected the same way sock_get_timeout() selects
 * its output layout.
 *
 * Return: 0 on success, -EINVAL if @optlen is smaller than the selected
 * layout, -EFAULT if the copy fails.
 */
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
			   sockptr_t optval, int optlen, bool old_timeval)
{
	if (!old_timeval) {
		if (optlen < sizeof(*tv))
			return -EINVAL;
		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
			return -EFAULT;
		return 0;
	}

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 compat_tv;

		if (optlen < sizeof(compat_tv))
			return -EINVAL;
		if (copy_from_sockptr(&compat_tv, optval, sizeof(compat_tv)))
			return -EFAULT;

		tv->tv_sec = compat_tv.tv_sec;
		tv->tv_usec = compat_tv.tv_usec;
	} else {
		struct __kernel_old_timeval native_tv;

		if (optlen < sizeof(native_tv))
			return -EINVAL;
		if (copy_from_sockptr(&native_tv, optval, sizeof(native_tv)))
			return -EFAULT;

		tv->tv_sec = native_tv.tv_sec;
		tv->tv_usec = native_tv.tv_usec;
	}

	return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);
430
/*
 * Parse a timeval option and store it in *timeo_p as a jiffies timeout.
 *
 * - tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM.
 * - A negative tv_sec stores 0 and succeeds, logging at most a limited
 *   number of notices.
 * - 0/0, or tv_sec at or beyond MAX_SCHEDULE_TIMEOUT / HZ - 1, stores
 *   MAX_SCHEDULE_TIMEOUT.
 * - Anything else is converted, with the microseconds rounded up to a
 *   whole jiffy.
 *
 * Returns 0 on success or the error from sock_copy_user_timeval().
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	long timeo = MAX_SCHEDULE_TIMEOUT;
	int rc;

	rc = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
	if (rc)
		return rc;

	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		WRITE_ONCE(*timeo_p, 0);
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}

	if ((tv.tv_sec || tv.tv_usec) &&
	    tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		timeo = tv.tv_sec * HZ +
			DIV_ROUND_UP((unsigned long)tv.tv_usec,
				     USEC_PER_SEC / HZ);

	WRITE_ONCE(*timeo_p, timeo);
	return 0;
}
463
sock_needs_netstamp(const struct sock * sk)464 static bool sock_needs_netstamp(const struct sock *sk)
465 {
466 switch (sk->sk_family) {
467 case AF_UNSPEC:
468 case AF_UNIX:
469 return false;
470 default:
471 return true;
472 }
473 }
474
sock_disable_timestamp(struct sock * sk,unsigned long flags)475 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
476 {
477 if (sk->sk_flags & flags) {
478 sk->sk_flags &= ~flags;
479 if (sock_needs_netstamp(sk) &&
480 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
481 net_disable_timestamp();
482 }
483 }
484
485
__sock_queue_rcv_skb(struct sock * sk,struct sk_buff * skb)486 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
487 {
488 unsigned long flags;
489 struct sk_buff_head *list = &sk->sk_receive_queue;
490
491 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
492 atomic_inc(&sk->sk_drops);
493 trace_sock_rcvqueue_full(sk, skb);
494 return -ENOMEM;
495 }
496
497 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
498 atomic_inc(&sk->sk_drops);
499 return -ENOBUFS;
500 }
501
502 skb->dev = NULL;
503 skb_set_owner_r(skb, sk);
504
505 /* we escape from rcu protected region, make sure we dont leak
506 * a norefcounted dst
507 */
508 skb_dst_force(skb);
509
510 spin_lock_irqsave(&list->lock, flags);
511 sock_skb_set_dropcount(sk, skb);
512 __skb_queue_tail(list, skb);
513 spin_unlock_irqrestore(&list->lock, flags);
514
515 if (!sock_flag(sk, SOCK_DEAD))
516 sk->sk_data_ready(sk);
517 return 0;
518 }
519 EXPORT_SYMBOL(__sock_queue_rcv_skb);
520
sock_queue_rcv_skb_reason(struct sock * sk,struct sk_buff * skb,enum skb_drop_reason * reason)521 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
522 enum skb_drop_reason *reason)
523 {
524 enum skb_drop_reason drop_reason;
525 int err;
526
527 err = sk_filter(sk, skb);
528 if (err) {
529 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
530 goto out;
531 }
532 err = __sock_queue_rcv_skb(sk, skb);
533 switch (err) {
534 case -ENOMEM:
535 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
536 break;
537 case -ENOBUFS:
538 drop_reason = SKB_DROP_REASON_PROTO_MEM;
539 break;
540 default:
541 drop_reason = SKB_NOT_DROPPED_YET;
542 break;
543 }
544 out:
545 if (reason)
546 *reason = drop_reason;
547 return err;
548 }
549 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
550
/**
 * __sk_receive_skb - deliver an skb to a socket, directly or via its backlog
 * @sk: receiving socket
 * @skb: buffer to deliver; freed here if it is dropped
 * @nested: non-zero to take the socket spinlock with the nested variant
 * @trim_cap: cap passed through to sk_filter_trim_cap()
 * @refcounted: true if the caller's reference on @sk is dropped here
 *
 * If no user context owns the socket, the skb is processed immediately
 * through sk_backlog_rcv(); otherwise it is added to the backlog.
 *
 * Return: the sk_backlog_rcv() result when the skb was processed
 * directly, NET_RX_SUCCESS otherwise (including when it was dropped).
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto drop;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
		atomic_inc(&sk->sk_drops);
		goto drop;
	}

	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);

	if (sock_owned_by_user(sk)) {
		if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
			bh_unlock_sock(sk);
			atomic_inc(&sk->sk_drops);
			goto drop;
		}
	} else {
		/* Annotate for lockdep with trylock + unlock semantics. */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	}

	bh_unlock_sock(sk);
	goto put;

drop:
	kfree_skb(skb);
put:
	if (refcounted)
		sock_put(sk);
	return rc;
}
EXPORT_SYMBOL(__sk_receive_skb);
594
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));

/**
 * __sk_dst_check - validate the socket's cached route
 * @sk: socket whose dst cache is checked
 * @cookie: cookie handed to the dst's ->check() operation
 *
 * Fetches the dst with __sk_dst_get().  If it is marked obsolete and
 * ->check() returns NULL, the cache is torn down in place: the tx queue
 * mapping and the pending-confirm flag are cleared, sk_dst_cache is set
 * to NULL and the dst is released.
 *
 * Return: the cached dst if it is still usable (or NULL if there was
 * none), NULL after an invalid entry has been dropped.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (!dst || !dst->obsolete)
		return dst;

	if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) != NULL)
		return dst;

	sk_tx_queue_clear(sk);
	WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
	RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
	dst_release(dst);
	return NULL;
}
EXPORT_SYMBOL(__sk_dst_check);
616
/**
 * sk_dst_check - validate the socket's cached route
 * @sk: socket whose dst cache is checked
 * @cookie: cookie handed to the dst's ->check() operation
 *
 * Fetches the dst with sk_dst_get().  If it is marked obsolete and
 * ->check() returns NULL, the cache is reset with sk_dst_reset() and the
 * fetched dst is released.
 *
 * Return: the dst obtained from sk_dst_get() if it is still usable (or
 * NULL if there was none), NULL after an invalid entry has been dropped.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (!dst || !dst->obsolete)
		return dst;

	if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) != NULL)
		return dst;

	sk_dst_reset(sk);
	dst_release(dst);
	return NULL;
}
EXPORT_SYMBOL(sk_dst_check);
632
sock_bindtoindex_locked(struct sock * sk,int ifindex)633 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
634 {
635 int ret = -ENOPROTOOPT;
636 #ifdef CONFIG_NETDEVICES
637 struct net *net = sock_net(sk);
638
639 /* Sorry... */
640 ret = -EPERM;
641 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
642 goto out;
643
644 ret = -EINVAL;
645 if (ifindex < 0)
646 goto out;
647
648 /* Paired with all READ_ONCE() done locklessly. */
649 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
650
651 if (sk->sk_prot->rehash)
652 sk->sk_prot->rehash(sk);
653 sk_dst_reset(sk);
654
655 ret = 0;
656
657 out:
658 #endif
659
660 return ret;
661 }
662
/**
 * sock_bindtoindex - bind a socket to an interface index
 * @sk: socket to bind
 * @ifindex: interface index
 * @lock_sk: true to take and release the socket lock around the update
 *
 * Return: result of sock_bindtoindex_locked().
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int rc;

	if (!lock_sk)
		return sock_bindtoindex_locked(sk, ifindex);

	lock_sock(sk);
	rc = sock_bindtoindex_locked(sk, ifindex);
	release_sock(sk);

	return rc;
}
EXPORT_SYMBOL(sock_bindtoindex);
676
/*
 * Bind @sk to the device named in @optval, e.g. "eth0".  An empty name
 * or a zero @optlen removes the binding.
 *
 * Returns 0 on success, -EINVAL for a negative @optlen, -EFAULT if the
 * name cannot be copied in, -ENODEV if no such device exists in the
 * socket's netns, or the result of sock_bindtoindex_locked().  Without
 * CONFIG_NETDEVICES the result is always -ENOPROTOOPT.
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int ifindex = 0;
	int rc;

	if (optlen < 0)
		return -EINVAL;

	/* Copy at most IFNAMSIZ - 1 bytes so the name stays terminated. */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	if (copy_from_sockptr(devname, optval, optlen))
		return -EFAULT;

	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			ifindex = dev->ifindex;
		rcu_read_unlock();

		if (!dev)
			return -ENODEV;
	}

	sockopt_lock_sock(sk);
	rc = sock_bindtoindex_locked(sk, ifindex);
	sockopt_release_sock(sk);

	return rc;
#else
	return -ENOPROTOOPT;
#endif
}
724
/*
 * Report the name of the device @sk is bound to.
 *
 * An unbound socket writes a length of 0 to @optlen and leaves @optval
 * alone.  Otherwise the name, including its terminating NUL, is copied
 * to @optval and its size is written to @optlen.
 *
 * Returns 0 on success, -EINVAL if @len is below IFNAMSIZ for a bound
 * socket, the error from netdev_get_name(), or -EFAULT if a copy-out
 * fails.  Without CONFIG_NETDEVICES the result is always -ENOPROTOOPT.
 */
static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
				sockptr_t optlen, int len)
{
#ifdef CONFIG_NETDEVICES
	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int rc;

	if (bound_dev_if == 0) {
		len = 0;
	} else {
		if (len < IFNAMSIZ)
			return -EINVAL;

		rc = netdev_get_name(net, devname, bound_dev_if);
		if (rc)
			return rc;

		len = strlen(devname) + 1;
		if (copy_to_sockptr(optval, devname, len))
			return -EFAULT;
	}

	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		return -EFAULT;

	return 0;
#else
	return -ENOPROTOOPT;
#endif
}
765
sk_mc_loop(struct sock * sk)766 bool sk_mc_loop(struct sock *sk)
767 {
768 if (dev_recursion_level())
769 return false;
770 if (!sk)
771 return true;
772 /* IPV6_ADDRFORM can change sk->sk_family under us. */
773 switch (READ_ONCE(sk->sk_family)) {
774 case AF_INET:
775 return inet_test_bit(MC_LOOP, sk);
776 #if IS_ENABLED(CONFIG_IPV6)
777 case AF_INET6:
778 return inet6_sk(sk)->mc_loop;
779 #endif
780 }
781 WARN_ON_ONCE(1);
782 return true;
783 }
784 EXPORT_SYMBOL(sk_mc_loop);
785
sock_set_reuseaddr(struct sock * sk)786 void sock_set_reuseaddr(struct sock *sk)
787 {
788 lock_sock(sk);
789 sk->sk_reuse = SK_CAN_REUSE;
790 release_sock(sk);
791 }
792 EXPORT_SYMBOL(sock_set_reuseaddr);
793
sock_set_reuseport(struct sock * sk)794 void sock_set_reuseport(struct sock *sk)
795 {
796 lock_sock(sk);
797 sk->sk_reuseport = true;
798 release_sock(sk);
799 }
800 EXPORT_SYMBOL(sock_set_reuseport);
801
sock_no_linger(struct sock * sk)802 void sock_no_linger(struct sock *sk)
803 {
804 lock_sock(sk);
805 WRITE_ONCE(sk->sk_lingertime, 0);
806 sock_set_flag(sk, SOCK_LINGER);
807 release_sock(sk);
808 }
809 EXPORT_SYMBOL(sock_no_linger);
810
/**
 * sock_set_priority - store a new sk_priority
 * @sk: socket to update; its lock is taken and released here
 * @priority: value written to sk->sk_priority
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);

	WRITE_ONCE(sk->sk_priority, priority);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
818
/**
 * sock_set_sndtimeo - set the send timeout from a number of seconds
 * @sk: socket to update; its lock is taken and released here
 * @secs: timeout in seconds
 *
 * Stores @secs * HZ when @secs is non-zero and below
 * MAX_SCHEDULE_TIMEOUT / HZ - 1; otherwise stores MAX_SCHEDULE_TIMEOUT.
 */
void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	lock_sock(sk);

	if (!secs || secs >= MAX_SCHEDULE_TIMEOUT / HZ - 1)
		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
	else
		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);
829
/*
 * Switch receive timestamping on or off.
 *
 * Off (@val false): clear SOCK_RCVTSTAMP and SOCK_RCVTSTAMPNS.
 * On: set SOCK_TSTAMP_NEW from @new and SOCK_RCVTSTAMPNS from @ns, set
 * SOCK_RCVTSTAMP and enable SOCK_TIMESTAMP on the socket.
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (!val) {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		return;
	}

	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
	sock_set_flag(sk, SOCK_RCVTSTAMP);
	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
}
842
sock_enable_timestamps(struct sock * sk)843 void sock_enable_timestamps(struct sock *sk)
844 {
845 lock_sock(sk);
846 __sock_set_timestamps(sk, true, false, true);
847 release_sock(sk);
848 }
849 EXPORT_SYMBOL(sock_enable_timestamps);
850
/**
 * sock_set_timestamp - apply one of the SO_TIMESTAMP* options
 * @sk: socket to update
 * @optname: SO_TIMESTAMP_OLD, SO_TIMESTAMP_NEW, SO_TIMESTAMPNS_OLD or
 *           SO_TIMESTAMPNS_NEW; any other value is ignored
 * @valbool: true to enable, false to disable
 *
 * The _NEW variants select the "new" timestamp format and the
 * TIMESTAMPNS variants select SOCK_RCVTSTAMPNS.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	bool use_new;
	bool use_ns;

	switch (optname) {
	case SO_TIMESTAMP_OLD:
		use_new = false;
		use_ns = false;
		break;
	case SO_TIMESTAMP_NEW:
		use_new = true;
		use_ns = false;
		break;
	case SO_TIMESTAMPNS_OLD:
		use_new = false;
		use_ns = true;
		break;
	case SO_TIMESTAMPNS_NEW:
		use_new = true;
		use_ns = true;
		break;
	default:
		return;
	}

	__sock_set_timestamps(sk, valbool, use_new, use_ns);
}
868
sock_timestamping_bind_phc(struct sock * sk,int phc_index)869 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
870 {
871 struct net *net = sock_net(sk);
872 struct net_device *dev = NULL;
873 bool match = false;
874 int *vclock_index;
875 int i, num;
876
877 if (sk->sk_bound_dev_if)
878 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
879
880 if (!dev) {
881 pr_err("%s: sock not bind to device\n", __func__);
882 return -EOPNOTSUPP;
883 }
884
885 num = ethtool_get_phc_vclocks(dev, &vclock_index);
886 dev_put(dev);
887
888 for (i = 0; i < num; i++) {
889 if (*(vclock_index + i) == phc_index) {
890 match = true;
891 break;
892 }
893 }
894
895 if (num > 0)
896 kfree(vclock_index);
897
898 if (!match)
899 return -EINVAL;
900
901 WRITE_ONCE(sk->sk_bind_phc, phc_index);
902
903 return 0;
904 }
905
sock_set_timestamping(struct sock * sk,int optname,struct so_timestamping timestamping)906 int sock_set_timestamping(struct sock *sk, int optname,
907 struct so_timestamping timestamping)
908 {
909 int val = timestamping.flags;
910 int ret;
911
912 if (val & ~SOF_TIMESTAMPING_MASK)
913 return -EINVAL;
914
915 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
916 !(val & SOF_TIMESTAMPING_OPT_ID))
917 return -EINVAL;
918
919 if (val & SOF_TIMESTAMPING_OPT_ID &&
920 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
921 if (sk_is_tcp(sk)) {
922 if ((1 << sk->sk_state) &
923 (TCPF_CLOSE | TCPF_LISTEN))
924 return -EINVAL;
925 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
926 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
927 else
928 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
929 } else {
930 atomic_set(&sk->sk_tskey, 0);
931 }
932 }
933
934 if (val & SOF_TIMESTAMPING_OPT_STATS &&
935 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
936 return -EINVAL;
937
938 if (val & SOF_TIMESTAMPING_BIND_PHC) {
939 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
940 if (ret)
941 return ret;
942 }
943
944 WRITE_ONCE(sk->sk_tsflags, val);
945 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
946
947 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
948 sock_enable_timestamp(sk,
949 SOCK_TIMESTAMPING_RX_SOFTWARE);
950 else
951 sock_disable_timestamp(sk,
952 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
953 return 0;
954 }
955
sock_set_keepalive(struct sock * sk)956 void sock_set_keepalive(struct sock *sk)
957 {
958 lock_sock(sk);
959 if (sk->sk_prot->keepalive)
960 sk->sk_prot->keepalive(sk, true);
961 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
962 release_sock(sk);
963 }
964 EXPORT_SYMBOL(sock_set_keepalive);
965
__sock_set_rcvbuf(struct sock * sk,int val)966 static void __sock_set_rcvbuf(struct sock *sk, int val)
967 {
968 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
969 * as a negative value.
970 */
971 val = min_t(int, val, INT_MAX / 2);
972 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
973
974 /* We double it on the way in to account for "struct sk_buff" etc.
975 * overhead. Applications assume that the SO_RCVBUF setting they make
976 * will allow that much actual data to be received on that socket.
977 *
978 * Applications are unaware that "struct sk_buff" and other overheads
979 * allocate from the receive buffer during socket buffer allocation.
980 *
981 * And after considering the possible alternatives, returning the value
982 * we actually used in getsockopt is the most desirable behavior.
983 */
984 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
985 }
986
/**
 * sock_set_rcvbuf - set the receive buffer size
 * @sk: socket to update; its lock is taken and released here
 * @val: requested size, processed by __sock_set_rcvbuf()
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
994
/* Store a new sk_mark and reset the cached dst; no-op if unchanged. */
static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val == sk->sk_mark)
		return;

	WRITE_ONCE(sk->sk_mark, val);
	sk_dst_reset(sk);
}
1002
/**
 * sock_set_mark - set the socket mark
 * @sk: socket to update; its lock is taken and released here
 * @val: new mark, applied through __sock_set_mark()
 */
void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);

	__sock_set_mark(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
1010
sock_release_reserved_memory(struct sock * sk,int bytes)1011 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1012 {
1013 /* Round down bytes to multiple of pages */
1014 bytes = round_down(bytes, PAGE_SIZE);
1015
1016 WARN_ON(bytes > sk->sk_reserved_mem);
1017 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1018 sk_mem_reclaim(sk);
1019 }
1020
sock_reserve_memory(struct sock * sk,int bytes)1021 static int sock_reserve_memory(struct sock *sk, int bytes)
1022 {
1023 long allocated;
1024 bool charged;
1025 int pages;
1026
1027 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1028 return -EOPNOTSUPP;
1029
1030 if (!bytes)
1031 return 0;
1032
1033 pages = sk_mem_pages(bytes);
1034
1035 /* pre-charge to memcg */
1036 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1037 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1038 if (!charged)
1039 return -ENOMEM;
1040
1041 /* pre-charge to forward_alloc */
1042 sk_memory_allocated_add(sk, pages);
1043 allocated = sk_memory_allocated(sk);
1044 /* If the system goes into memory pressure with this
1045 * precharge, give up and return error.
1046 */
1047 if (allocated > sk_prot_mem_limits(sk, 1)) {
1048 sk_memory_allocated_sub(sk, pages);
1049 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1050 return -ENOMEM;
1051 }
1052 sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1053
1054 WRITE_ONCE(sk->sk_reserved_mem,
1055 sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1056
1057 return 0;
1058 }
1059
/**
 * sockopt_lock_sock - take the socket lock for a sockopt handler
 * @sk: socket to lock
 *
 * When current->bpf_ctx is set, the setsockopt is called from
 * a bpf prog.  bpf has ensured the sk lock has been
 * acquired before calling setsockopt(), so the lock is only
 * taken here when there is no bpf context.
 */
void sockopt_lock_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1072
/**
 * sockopt_release_sock - counterpart of sockopt_lock_sock()
 * @sk: socket to unlock
 *
 * Releases the socket lock unless a bpf context is current, in which
 * case nothing is done.
 */
void sockopt_release_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1081
sockopt_ns_capable(struct user_namespace * ns,int cap)1082 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1083 {
1084 return has_current_bpf_ctx() || ns_capable(ns, cap);
1085 }
1086 EXPORT_SYMBOL(sockopt_ns_capable);
1087
sockopt_capable(int cap)1088 bool sockopt_capable(int cap)
1089 {
1090 return has_current_bpf_ctx() || capable(cap);
1091 }
1092 EXPORT_SYMBOL(sockopt_capable);
1093
1094 /*
1095 * This is meant for all protocols to use and covers goings on
1096 * at the socket level. Everything here is generic.
1097 */
1098
sk_setsockopt(struct sock * sk,int level,int optname,sockptr_t optval,unsigned int optlen)1099 int sk_setsockopt(struct sock *sk, int level, int optname,
1100 sockptr_t optval, unsigned int optlen)
1101 {
1102 struct so_timestamping timestamping;
1103 struct socket *sock = sk->sk_socket;
1104 struct sock_txtime sk_txtime;
1105 int val;
1106 int valbool;
1107 struct linger ling;
1108 int ret = 0;
1109
1110 /*
1111 * Options without arguments
1112 */
1113
1114 if (optname == SO_BINDTODEVICE)
1115 return sock_setbindtodevice(sk, optval, optlen);
1116
1117 if (optlen < sizeof(int))
1118 return -EINVAL;
1119
1120 if (copy_from_sockptr(&val, optval, sizeof(val)))
1121 return -EFAULT;
1122
1123 valbool = val ? 1 : 0;
1124
1125 sockopt_lock_sock(sk);
1126
1127 switch (optname) {
1128 case SO_DEBUG:
1129 if (val && !sockopt_capable(CAP_NET_ADMIN))
1130 ret = -EACCES;
1131 else
1132 sock_valbool_flag(sk, SOCK_DBG, valbool);
1133 break;
1134 case SO_REUSEADDR:
1135 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1136 break;
1137 case SO_REUSEPORT:
1138 sk->sk_reuseport = valbool;
1139 break;
1140 case SO_TYPE:
1141 case SO_PROTOCOL:
1142 case SO_DOMAIN:
1143 case SO_ERROR:
1144 ret = -ENOPROTOOPT;
1145 break;
1146 case SO_DONTROUTE:
1147 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1148 sk_dst_reset(sk);
1149 break;
1150 case SO_BROADCAST:
1151 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1152 break;
1153 case SO_SNDBUF:
1154 /* Don't error on this BSD doesn't and if you think
1155 * about it this is right. Otherwise apps have to
1156 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1157 * are treated in BSD as hints
1158 */
1159 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1160 set_sndbuf:
1161 /* Ensure val * 2 fits into an int, to prevent max_t()
1162 * from treating it as a negative value.
1163 */
1164 val = min_t(int, val, INT_MAX / 2);
1165 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1166 WRITE_ONCE(sk->sk_sndbuf,
1167 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1168 /* Wake up sending tasks if we upped the value. */
1169 sk->sk_write_space(sk);
1170 break;
1171
1172 case SO_SNDBUFFORCE:
1173 if (!sockopt_capable(CAP_NET_ADMIN)) {
1174 ret = -EPERM;
1175 break;
1176 }
1177
1178 /* No negative values (to prevent underflow, as val will be
1179 * multiplied by 2).
1180 */
1181 if (val < 0)
1182 val = 0;
1183 goto set_sndbuf;
1184
1185 case SO_RCVBUF:
1186 /* Don't error on this BSD doesn't and if you think
1187 * about it this is right. Otherwise apps have to
1188 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1189 * are treated in BSD as hints
1190 */
1191 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1192 break;
1193
1194 case SO_RCVBUFFORCE:
1195 if (!sockopt_capable(CAP_NET_ADMIN)) {
1196 ret = -EPERM;
1197 break;
1198 }
1199
1200 /* No negative values (to prevent underflow, as val will be
1201 * multiplied by 2).
1202 */
1203 __sock_set_rcvbuf(sk, max(val, 0));
1204 break;
1205
1206 case SO_KEEPALIVE:
1207 if (sk->sk_prot->keepalive)
1208 sk->sk_prot->keepalive(sk, valbool);
1209 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1210 break;
1211
1212 case SO_OOBINLINE:
1213 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1214 break;
1215
1216 case SO_NO_CHECK:
1217 sk->sk_no_check_tx = valbool;
1218 break;
1219
1220 case SO_PRIORITY:
1221 if ((val >= 0 && val <= 6) ||
1222 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1223 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1224 WRITE_ONCE(sk->sk_priority, val);
1225 else
1226 ret = -EPERM;
1227 break;
1228
1229 case SO_LINGER:
1230 if (optlen < sizeof(ling)) {
1231 ret = -EINVAL; /* 1003.1g */
1232 break;
1233 }
1234 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1235 ret = -EFAULT;
1236 break;
1237 }
1238 if (!ling.l_onoff) {
1239 sock_reset_flag(sk, SOCK_LINGER);
1240 } else {
1241 unsigned long t_sec = ling.l_linger;
1242
1243 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1244 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1245 else
1246 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1247 sock_set_flag(sk, SOCK_LINGER);
1248 }
1249 break;
1250
1251 case SO_BSDCOMPAT:
1252 break;
1253
1254 case SO_PASSCRED:
1255 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1256 break;
1257
1258 case SO_PASSPIDFD:
1259 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1260 break;
1261
1262 case SO_TIMESTAMP_OLD:
1263 case SO_TIMESTAMP_NEW:
1264 case SO_TIMESTAMPNS_OLD:
1265 case SO_TIMESTAMPNS_NEW:
1266 sock_set_timestamp(sk, optname, valbool);
1267 break;
1268
1269 case SO_TIMESTAMPING_NEW:
1270 case SO_TIMESTAMPING_OLD:
1271 if (optlen == sizeof(timestamping)) {
1272 if (copy_from_sockptr(×tamping, optval,
1273 sizeof(timestamping))) {
1274 ret = -EFAULT;
1275 break;
1276 }
1277 } else {
1278 memset(×tamping, 0, sizeof(timestamping));
1279 timestamping.flags = val;
1280 }
1281 ret = sock_set_timestamping(sk, optname, timestamping);
1282 break;
1283
1284 case SO_RCVLOWAT:
1285 {
1286 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1287
1288 if (val < 0)
1289 val = INT_MAX;
1290 if (sock)
1291 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1292 if (set_rcvlowat)
1293 ret = set_rcvlowat(sk, val);
1294 else
1295 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1296 break;
1297 }
1298 case SO_RCVTIMEO_OLD:
1299 case SO_RCVTIMEO_NEW:
1300 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1301 optlen, optname == SO_RCVTIMEO_OLD);
1302 break;
1303
1304 case SO_SNDTIMEO_OLD:
1305 case SO_SNDTIMEO_NEW:
1306 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1307 optlen, optname == SO_SNDTIMEO_OLD);
1308 break;
1309
1310 case SO_ATTACH_FILTER: {
1311 struct sock_fprog fprog;
1312
1313 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1314 if (!ret)
1315 ret = sk_attach_filter(&fprog, sk);
1316 break;
1317 }
1318 case SO_ATTACH_BPF:
1319 ret = -EINVAL;
1320 if (optlen == sizeof(u32)) {
1321 u32 ufd;
1322
1323 ret = -EFAULT;
1324 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1325 break;
1326
1327 ret = sk_attach_bpf(ufd, sk);
1328 }
1329 break;
1330
1331 case SO_ATTACH_REUSEPORT_CBPF: {
1332 struct sock_fprog fprog;
1333
1334 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1335 if (!ret)
1336 ret = sk_reuseport_attach_filter(&fprog, sk);
1337 break;
1338 }
1339 case SO_ATTACH_REUSEPORT_EBPF:
1340 ret = -EINVAL;
1341 if (optlen == sizeof(u32)) {
1342 u32 ufd;
1343
1344 ret = -EFAULT;
1345 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1346 break;
1347
1348 ret = sk_reuseport_attach_bpf(ufd, sk);
1349 }
1350 break;
1351
1352 case SO_DETACH_REUSEPORT_BPF:
1353 ret = reuseport_detach_prog(sk);
1354 break;
1355
1356 case SO_DETACH_FILTER:
1357 ret = sk_detach_filter(sk);
1358 break;
1359
1360 case SO_LOCK_FILTER:
1361 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1362 ret = -EPERM;
1363 else
1364 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1365 break;
1366
1367 case SO_PASSSEC:
1368 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1369 break;
1370 case SO_MARK:
1371 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1372 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1373 ret = -EPERM;
1374 break;
1375 }
1376
1377 __sock_set_mark(sk, val);
1378 break;
1379 case SO_RCVMARK:
1380 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1381 break;
1382
1383 case SO_RXQ_OVFL:
1384 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1385 break;
1386
1387 case SO_WIFI_STATUS:
1388 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1389 break;
1390
1391 case SO_PEEK_OFF:
1392 {
1393 int (*set_peek_off)(struct sock *sk, int val);
1394
1395 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1396 if (set_peek_off)
1397 ret = set_peek_off(sk, val);
1398 else
1399 ret = -EOPNOTSUPP;
1400 break;
1401 }
1402
1403 case SO_NOFCS:
1404 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1405 break;
1406
1407 case SO_SELECT_ERR_QUEUE:
1408 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1409 break;
1410
1411 #ifdef CONFIG_NET_RX_BUSY_POLL
1412 case SO_BUSY_POLL:
1413 if (val < 0)
1414 ret = -EINVAL;
1415 else
1416 WRITE_ONCE(sk->sk_ll_usec, val);
1417 break;
1418 case SO_PREFER_BUSY_POLL:
1419 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1420 ret = -EPERM;
1421 else
1422 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1423 break;
1424 case SO_BUSY_POLL_BUDGET:
1425 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1426 ret = -EPERM;
1427 } else {
1428 if (val < 0 || val > U16_MAX)
1429 ret = -EINVAL;
1430 else
1431 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1432 }
1433 break;
1434 #endif
1435
1436 case SO_MAX_PACING_RATE:
1437 {
1438 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1439
1440 if (sizeof(ulval) != sizeof(val) &&
1441 optlen >= sizeof(ulval) &&
1442 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1443 ret = -EFAULT;
1444 break;
1445 }
1446 if (ulval != ~0UL)
1447 cmpxchg(&sk->sk_pacing_status,
1448 SK_PACING_NONE,
1449 SK_PACING_NEEDED);
1450 /* Pairs with READ_ONCE() from sk_getsockopt() */
1451 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1452 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1453 break;
1454 }
1455 case SO_INCOMING_CPU:
1456 reuseport_update_incoming_cpu(sk, val);
1457 break;
1458
1459 case SO_CNX_ADVICE:
1460 if (val == 1)
1461 dst_negative_advice(sk);
1462 break;
1463
1464 case SO_ZEROCOPY:
1465 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1466 if (!(sk_is_tcp(sk) ||
1467 (sk->sk_type == SOCK_DGRAM &&
1468 sk->sk_protocol == IPPROTO_UDP)))
1469 ret = -EOPNOTSUPP;
1470 } else if (sk->sk_family != PF_RDS) {
1471 ret = -EOPNOTSUPP;
1472 }
1473 if (!ret) {
1474 if (val < 0 || val > 1)
1475 ret = -EINVAL;
1476 else
1477 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1478 }
1479 break;
1480
1481 case SO_TXTIME:
1482 if (optlen != sizeof(struct sock_txtime)) {
1483 ret = -EINVAL;
1484 break;
1485 } else if (copy_from_sockptr(&sk_txtime, optval,
1486 sizeof(struct sock_txtime))) {
1487 ret = -EFAULT;
1488 break;
1489 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1490 ret = -EINVAL;
1491 break;
1492 }
1493 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1494 * scheduler has enough safe guards.
1495 */
1496 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1497 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1498 ret = -EPERM;
1499 break;
1500 }
1501 sock_valbool_flag(sk, SOCK_TXTIME, true);
1502 sk->sk_clockid = sk_txtime.clockid;
1503 sk->sk_txtime_deadline_mode =
1504 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1505 sk->sk_txtime_report_errors =
1506 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1507 break;
1508
1509 case SO_BINDTOIFINDEX:
1510 ret = sock_bindtoindex_locked(sk, val);
1511 break;
1512
1513 case SO_BUF_LOCK:
1514 if (val & ~SOCK_BUF_LOCK_MASK) {
1515 ret = -EINVAL;
1516 break;
1517 }
1518 sk->sk_userlocks = val | (sk->sk_userlocks &
1519 ~SOCK_BUF_LOCK_MASK);
1520 break;
1521
1522 case SO_RESERVE_MEM:
1523 {
1524 int delta;
1525
1526 if (val < 0) {
1527 ret = -EINVAL;
1528 break;
1529 }
1530
1531 delta = val - sk->sk_reserved_mem;
1532 if (delta < 0)
1533 sock_release_reserved_memory(sk, -delta);
1534 else
1535 ret = sock_reserve_memory(sk, delta);
1536 break;
1537 }
1538
1539 case SO_TXREHASH:
1540 if (val < -1 || val > 1) {
1541 ret = -EINVAL;
1542 break;
1543 }
1544 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1545 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1546 /* Paired with READ_ONCE() in tcp_rtx_synack()
1547 * and sk_getsockopt().
1548 */
1549 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1550 break;
1551
1552 default:
1553 ret = -ENOPROTOOPT;
1554 break;
1555 }
1556 sockopt_release_sock(sk);
1557 return ret;
1558 }
1559
/**
 * sock_setsockopt - struct socket front end for sk_setsockopt()
 * @sock: socket whose underlying sock is updated
 * @level: passed through to sk_setsockopt()
 * @optname: passed through to sk_setsockopt()
 * @optval: passed through to sk_setsockopt()
 * @optlen: passed through to sk_setsockopt()
 *
 * Returns whatever sk_setsockopt() returns for @sock->sk.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
		    sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_setsockopt);
1567
sk_get_peer_cred(struct sock * sk)1568 static const struct cred *sk_get_peer_cred(struct sock *sk)
1569 {
1570 const struct cred *cred;
1571
1572 spin_lock(&sk->sk_peer_lock);
1573 cred = get_cred(sk->sk_peer_cred);
1574 spin_unlock(&sk->sk_peer_lock);
1575
1576 return cred;
1577 }
1578
cred_to_ucred(struct pid * pid,const struct cred * cred,struct ucred * ucred)1579 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1580 struct ucred *ucred)
1581 {
1582 ucred->pid = pid_vnr(pid);
1583 ucred->uid = ucred->gid = -1;
1584 if (cred) {
1585 struct user_namespace *current_ns = current_user_ns();
1586
1587 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1588 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1589 }
1590 }
1591
/*
 * Copy the group list in @src to @dst as an array of gid_t.
 *
 * Each kgid is mapped into the current user namespace before being
 * written at offset index * sizeof(gid_t).  Returns 0 on success or
 * -EFAULT as soon as one copy fails.
 */
static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
	struct user_namespace *ns = current_user_ns();
	size_t off = 0;
	int idx;

	for (idx = 0; idx < src->ngroups; idx++) {
		gid_t gid = from_kgid_munged(ns, src->gid[idx]);

		if (copy_to_sockptr_offset(dst, off, &gid, sizeof(gid)))
			return -EFAULT;

		off += sizeof(gid);
	}

	return 0;
}
1606
/**
 * sk_getsockopt - generic socket-level option getter
 * @sk: socket to query
 * @level: option level (not examined by this function)
 * @optname: SO_* option to read
 * @optval: destination buffer for the option value
 * @optlen: in: size of @optval; out: number of bytes reported
 *
 * Most options fill the local union and fall out of the switch, where the
 * value is truncated to the caller's length and copied out.  Options with
 * their own copy-out logic either jump to the lenout label (which only
 * writes back the length) or return directly.
 *
 * Returns 0 on success or a negative errno.
 */
int sk_getsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, sockptr_t optlen)
{
	struct socket *sock = sk->sk_socket;

	/* Scratch storage shared by every option that uses the common tail */
	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct  __kernel_sock_timeval stm;
		struct sock_txtime txtime;
		struct so_timestamping timestamping;
	} v;

	int vlen = sizeof(int);	/* size of the value held in v */
	int len;		/* caller-supplied buffer length */

	if (copy_from_sockptr(&len, optlen, sizeof(int)))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = READ_ONCE(sk->sk_sndbuf);
		break;

	case SO_RCVBUF:
		v.val = READ_ONCE(sk->sk_rcvbuf);
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		/* Fall back to the soft error when sock_error() gave zero */
		v.val = -sock_error(sk);
		if (!v.val)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = READ_ONCE(sk->sk_priority);
		break;

	case SO_LINGER:
		vlen = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
		break;

	case SO_BSDCOMPAT:
		/* Reports the zeroed value */
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_TSTAMP_NEW) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) &&
			!sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) &&
			sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		vlen = sizeof(v.timestamping);
		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
		 * returning the flags when they were set through the same option.
		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
		 */
		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
		}
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		vlen = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
					optname == SO_RCVTIMEO_OLD);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		vlen = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
					optname == SO_SNDTIMEO_OLD);
		break;

	case SO_RCVLOWAT:
		v.val = READ_ONCE(sk->sk_rcvlowat);
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PASSPIDFD:
		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;

		if (len > sizeof(peercred))
			len = sizeof(peercred);

		/* Snapshot peer pid/cred under the peer lock */
		spin_lock(&sk->sk_peer_lock);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		spin_unlock(&sk->sk_peer_lock);

		if (copy_to_sockptr(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERPIDFD:
	{
		struct file *pidfd_file = NULL;
		struct pid *peer_pid;
		int pidfd;

		if (len > sizeof(pidfd))
			len = sizeof(pidfd);

		spin_lock(&sk->sk_peer_lock);
		peer_pid = get_pid(sk->sk_peer_pid);
		spin_unlock(&sk->sk_peer_lock);

		if (!peer_pid)
			return -ENODATA;

		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
		put_pid(peer_pid);
		if (pidfd < 0)
			return pidfd;

		/* The fd is installed only after both copies succeeded;
		 * on failure the reserved fd and the file are dropped.
		 */
		if (copy_to_sockptr(optval, &pidfd, len) ||
		    copy_to_sockptr(optlen, &len, sizeof(int))) {
			put_unused_fd(pidfd);
			fput(pidfd_file);

			return -EFAULT;
		}

		fd_install(pidfd, pidfd_file);
		return 0;
	}

	case SO_PEERGROUPS:
	{
		const struct cred *cred;
		int ret, n;

		cred = sk_get_peer_cred(sk);
		if (!cred)
			return -ENODATA;

		n = cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			/* Buffer too small: report the size that is needed */
			len = n * sizeof(gid_t);
			put_cred(cred);
			if (copy_to_sockptr(optlen, &len, sizeof(int)))
				return -EFAULT;
			return -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user(optval, cred->group_info);
		put_cred(cred);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		struct sockaddr_storage address;

		vlen = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
		if (vlen < 0)
			return -ENOTCONN;
		if (vlen < len)
			return -EINVAL;
		if (copy_to_sockptr(optval, &address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = (sk->sk_state == TCP_LISTEN);
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock,
							 optval, optlen, len);

	case SO_MARK:
		v.val = READ_ONCE(sk->sk_mark);
		break;

	case SO_RCVMARK:
		v.val = sock_flag(sk, SOCK_RCVMARK);
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!READ_ONCE(sock->ops)->set_peek_off)
			return -EOPNOTSUPP;

		v.val = READ_ONCE(sk->sk_peek_off);
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_ll_usec);
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			vlen = sizeof(v.ulval);
			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, ~0U,
				      READ_ONCE(sk->sk_max_pacing_rate));
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_sockptr(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		vlen = sizeof(u64);
		if (len < vlen)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		vlen = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		if (sk->sk_txtime_deadline_mode)
			v.txtime.flags |= SOF_TXTIME_DEADLINE_MODE;
		if (sk->sk_txtime_report_errors)
			v.txtime.flags |= SOF_TXTIME_REPORT_ERRORS;
		break;

	case SO_BINDTOIFINDEX:
		v.val = READ_ONCE(sk->sk_bound_dev_if);
		break;

	case SO_NETNS_COOKIE:
		/* Unlike SO_COOKIE, the length must match exactly */
		vlen = sizeof(u64);
		if (len != vlen)
			return -EINVAL;
		v.val64 = sock_net(sk)->net_cookie;
		break;

	case SO_BUF_LOCK:
		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
		break;

	case SO_RESERVE_MEM:
		v.val = READ_ONCE(sk->sk_reserved_mem);
		break;

	case SO_TXREHASH:
		/* Paired with WRITE_ONCE() in sk_setsockopt() */
		v.val = READ_ONCE(sk->sk_txrehash);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	/* Common tail: never copy out more than the value occupies */
	if (vlen < len)
		len = vlen;
	if (copy_to_sockptr(optval, &v, len))
		return -EFAULT;
lenout:
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		return -EFAULT;
	return 0;
}
2023
/*
 * struct socket front end for sk_getsockopt(): wraps the two user
 * pointers with USER_SOCKPTR() and forwards the call for sock->sk.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	sockptr_t val = USER_SOCKPTR(optval);
	sockptr_t len = USER_SOCKPTR(optlen);

	return sk_getsockopt(sock->sk, level, optname, val, len);
}
2031
2032 /*
2033 * Initialize an sk_lock.
2034 *
2035 * (We also register the sk_lock with the lock validator.)
2036 */
sock_lock_init(struct sock * sk)2037 static inline void sock_lock_init(struct sock *sk)
2038 {
2039 if (sk->sk_kern_sock)
2040 sock_lock_init_class_and_name(
2041 sk,
2042 af_family_kern_slock_key_strings[sk->sk_family],
2043 af_family_kern_slock_keys + sk->sk_family,
2044 af_family_kern_key_strings[sk->sk_family],
2045 af_family_kern_keys + sk->sk_family);
2046 else
2047 sock_lock_init_class_and_name(
2048 sk,
2049 af_family_slock_key_strings[sk->sk_family],
2050 af_family_slock_keys + sk->sk_family,
2051 af_family_key_strings[sk->sk_family],
2052 af_family_keys + sk->sk_family);
2053 }
2054
2055 /*
2056 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2057 * even temporarly, because of RCU lookups. sk_node should also be left as is.
2058 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2059 */
sock_copy(struct sock * nsk,const struct sock * osk)2060 static void sock_copy(struct sock *nsk, const struct sock *osk)
2061 {
2062 const struct proto *prot = READ_ONCE(osk->sk_prot);
2063 #ifdef CONFIG_SECURITY_NETWORK
2064 void *sptr = nsk->sk_security;
2065 #endif
2066
2067 /* If we move sk_tx_queue_mapping out of the private section,
2068 * we must check if sk_tx_queue_clear() is called after
2069 * sock_copy() in sk_clone_lock().
2070 */
2071 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2072 offsetof(struct sock, sk_dontcopy_begin) ||
2073 offsetof(struct sock, sk_tx_queue_mapping) >=
2074 offsetof(struct sock, sk_dontcopy_end));
2075
2076 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2077
2078 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2079 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2080
2081 #ifdef CONFIG_SECURITY_NETWORK
2082 nsk->sk_security = sptr;
2083 security_sk_clone(osk, nsk);
2084 #endif
2085 }
2086
/*
 * Allocate the memory for a sock of protocol @prot.
 *
 * The object comes from prot->slab when the protocol has one (allocated
 * without __GFP_ZERO and cleared via sk_prot_clear_nulls() when
 * want_init_on_alloc() says so), otherwise from kmalloc().  On success
 * the LSM blob is allocated and a reference on prot->owner is taken.
 *
 * Returns the new sock, or NULL when any step fails.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct kmem_cache *slab = prot->slab;
	struct sock *sk;

	if (slab) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (sk && want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else {
		sk = kmalloc(prot->obj_size, priority);
	}

	if (!sk)
		return NULL;

	if (security_sk_alloc(sk, family, priority))
		goto out_free;

	if (!try_module_get(prot->owner))
		goto out_free_sec;

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
2122
sk_prot_free(struct proto * prot,struct sock * sk)2123 static void sk_prot_free(struct proto *prot, struct sock *sk)
2124 {
2125 struct kmem_cache *slab;
2126 struct module *owner;
2127
2128 owner = prot->owner;
2129 slab = prot->slab;
2130
2131 cgroup_sk_free(&sk->sk_cgrp_data);
2132 mem_cgroup_sk_free(sk);
2133 trace_android_vh_sk_free(sk);
2134 security_sk_free(sk);
2135 if (slab != NULL)
2136 kmem_cache_free(slab, sk);
2137 else
2138 kfree(sk);
2139 module_put(owner);
2140 }
2141
2142 /**
2143 * sk_alloc - All socket objects are allocated here
2144 * @net: the applicable net namespace
2145 * @family: protocol family
2146 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2147 * @prot: struct proto associated with this new sock instance
2148 * @kern: is this to be a kernel socket?
2149 */
sk_alloc(struct net * net,int family,gfp_t priority,struct proto * prot,int kern)2150 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2151 struct proto *prot, int kern)
2152 {
2153 struct sock *sk;
2154
2155 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2156 if (sk) {
2157 sk->sk_family = family;
2158 /*
2159 * See comment in struct sock definition to understand
2160 * why we need sk_prot_creator -acme
2161 */
2162 sk->sk_prot = sk->sk_prot_creator = prot;
2163 sk->sk_kern_sock = kern;
2164 sock_lock_init(sk);
2165 sk->sk_net_refcnt = kern ? 0 : 1;
2166 if (likely(sk->sk_net_refcnt)) {
2167 get_net_track(net, &sk->ns_tracker, priority);
2168 sock_inuse_add(net, 1);
2169 } else {
2170 __netns_tracker_alloc(net, &sk->ns_tracker,
2171 false, priority);
2172 }
2173
2174 sock_net_set(sk, net);
2175 refcount_set(&sk->sk_wmem_alloc, 1);
2176
2177 mem_cgroup_sk_alloc(sk);
2178 trace_android_vh_sk_alloc(sk);
2179 cgroup_sk_alloc(&sk->sk_cgrp_data);
2180 sock_update_classid(&sk->sk_cgrp_data);
2181 sock_update_netprioidx(&sk->sk_cgrp_data);
2182 sk_tx_queue_clear(sk);
2183 }
2184
2185 return sk;
2186 }
2187 EXPORT_SYMBOL(sk_alloc);
2188
2189 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2190 * grace period. This is the case for UDP sockets and TCP listeners.
2191 */
__sk_destruct(struct rcu_head * head)2192 static void __sk_destruct(struct rcu_head *head)
2193 {
2194 struct sock *sk = container_of(head, struct sock, sk_rcu);
2195 struct sk_filter *filter;
2196
2197 if (sk->sk_destruct)
2198 sk->sk_destruct(sk);
2199
2200 filter = rcu_dereference_check(sk->sk_filter,
2201 refcount_read(&sk->sk_wmem_alloc) == 0);
2202 if (filter) {
2203 sk_filter_uncharge(sk, filter);
2204 RCU_INIT_POINTER(sk->sk_filter, NULL);
2205 }
2206
2207 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2208
2209 #ifdef CONFIG_BPF_SYSCALL
2210 bpf_sk_storage_free(sk);
2211 #endif
2212
2213 if (atomic_read(&sk->sk_omem_alloc))
2214 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2215 __func__, atomic_read(&sk->sk_omem_alloc));
2216
2217 if (sk->sk_frag.page) {
2218 put_page(sk->sk_frag.page);
2219 sk->sk_frag.page = NULL;
2220 }
2221
2222 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2223 put_cred(sk->sk_peer_cred);
2224 put_pid(sk->sk_peer_pid);
2225
2226 if (likely(sk->sk_net_refcnt))
2227 put_net_track(sock_net(sk), &sk->ns_tracker);
2228 else
2229 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2230
2231 sk_prot_free(sk->sk_prot_creator, sk);
2232 }
2233
sk_destruct(struct sock * sk)2234 void sk_destruct(struct sock *sk)
2235 {
2236 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2237
2238 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2239 reuseport_detach_sock(sk);
2240 use_call_rcu = true;
2241 }
2242
2243 if (use_call_rcu)
2244 call_rcu(&sk->sk_rcu, __sk_destruct);
2245 else
2246 __sk_destruct(&sk->sk_rcu);
2247 }
2248
__sk_free(struct sock * sk)2249 static void __sk_free(struct sock *sk)
2250 {
2251 if (likely(sk->sk_net_refcnt))
2252 sock_inuse_add(sock_net(sk), -1);
2253
2254 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2255 sock_diag_broadcast_destroy(sk);
2256 else
2257 sk_destruct(sk);
2258 }
2259
sk_free(struct sock * sk)2260 void sk_free(struct sock *sk)
2261 {
2262 /*
2263 * We subtract one from sk_wmem_alloc and can know if
2264 * some packets are still in some tx queue.
2265 * If not null, sock_wfree() will call __sk_free(sk) later
2266 */
2267 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2268 __sk_free(sk);
2269 }
2270 EXPORT_SYMBOL(sk_free);
2271
sk_init_common(struct sock * sk)2272 static void sk_init_common(struct sock *sk)
2273 {
2274 skb_queue_head_init(&sk->sk_receive_queue);
2275 skb_queue_head_init(&sk->sk_write_queue);
2276 skb_queue_head_init(&sk->sk_error_queue);
2277
2278 rwlock_init(&sk->sk_callback_lock);
2279 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2280 af_rlock_keys + sk->sk_family,
2281 af_family_rlock_key_strings[sk->sk_family]);
2282 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2283 af_wlock_keys + sk->sk_family,
2284 af_family_wlock_key_strings[sk->sk_family]);
2285 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2286 af_elock_keys + sk->sk_family,
2287 af_family_elock_key_strings[sk->sk_family]);
2288 lockdep_set_class_and_name(&sk->sk_callback_lock,
2289 af_callback_keys + sk->sk_family,
2290 af_family_clock_key_strings[sk->sk_family]);
2291 }
2292
2293 /**
2294 * sk_clone_lock - clone a socket, and lock its clone
2295 * @sk: the socket to clone
2296 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2297 *
2298 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2299 */
sk_clone_lock(const struct sock * sk,const gfp_t priority)2300 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2301 {
2302 struct proto *prot = READ_ONCE(sk->sk_prot);
2303 struct sk_filter *filter;
2304 bool is_charged = true;
2305 struct sock *newsk;
2306
2307 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2308 if (!newsk)
2309 goto out;
2310
2311 sock_copy(newsk, sk);
2312 trace_android_vh_sk_clone_lock(newsk);
2313
2314 newsk->sk_prot_creator = prot;
2315
2316 /* SANITY */
2317 if (likely(newsk->sk_net_refcnt)) {
2318 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2319 sock_inuse_add(sock_net(newsk), 1);
2320 } else {
2321 /* Kernel sockets are not elevating the struct net refcount.
2322 * Instead, use a tracker to more easily detect if a layer
2323 * is not properly dismantling its kernel sockets at netns
2324 * destroy time.
2325 */
2326 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2327 false, priority);
2328 }
2329 sk_node_init(&newsk->sk_node);
2330 sock_lock_init(newsk);
2331 bh_lock_sock(newsk);
2332 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2333 newsk->sk_backlog.len = 0;
2334
2335 atomic_set(&newsk->sk_rmem_alloc, 0);
2336
2337 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2338 refcount_set(&newsk->sk_wmem_alloc, 1);
2339
2340 atomic_set(&newsk->sk_omem_alloc, 0);
2341 sk_init_common(newsk);
2342
2343 newsk->sk_dst_cache = NULL;
2344 newsk->sk_dst_pending_confirm = 0;
2345 newsk->sk_wmem_queued = 0;
2346 newsk->sk_forward_alloc = 0;
2347 newsk->sk_reserved_mem = 0;
2348 atomic_set(&newsk->sk_drops, 0);
2349 newsk->sk_send_head = NULL;
2350 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2351 atomic_set(&newsk->sk_zckey, 0);
2352
2353 sock_reset_flag(newsk, SOCK_DONE);
2354
2355 /* sk->sk_memcg will be populated at accept() time */
2356 newsk->sk_memcg = NULL;
2357
2358 cgroup_sk_clone(&newsk->sk_cgrp_data);
2359
2360 rcu_read_lock();
2361 filter = rcu_dereference(sk->sk_filter);
2362 if (filter != NULL)
2363 /* though it's an empty new sock, the charging may fail
2364 * if sysctl_optmem_max was changed between creation of
2365 * original socket and cloning
2366 */
2367 is_charged = sk_filter_charge(newsk, filter);
2368 RCU_INIT_POINTER(newsk->sk_filter, filter);
2369 rcu_read_unlock();
2370
2371 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2372 /* We need to make sure that we don't uncharge the new
2373 * socket if we couldn't charge it in the first place
2374 * as otherwise we uncharge the parent's filter.
2375 */
2376 if (!is_charged)
2377 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2378 sk_free_unlock_clone(newsk);
2379 newsk = NULL;
2380 goto out;
2381 }
2382 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2383
2384 if (bpf_sk_storage_clone(sk, newsk)) {
2385 sk_free_unlock_clone(newsk);
2386 newsk = NULL;
2387 goto out;
2388 }
2389
2390 /* Clear sk_user_data if parent had the pointer tagged
2391 * as not suitable for copying when cloning.
2392 */
2393 if (sk_user_data_is_nocopy(newsk))
2394 newsk->sk_user_data = NULL;
2395
2396 newsk->sk_err = 0;
2397 newsk->sk_err_soft = 0;
2398 newsk->sk_priority = 0;
2399 newsk->sk_incoming_cpu = raw_smp_processor_id();
2400
2401 /* Before updating sk_refcnt, we must commit prior changes to memory
2402 * (Documentation/RCU/rculist_nulls.rst for details)
2403 */
2404 smp_wmb();
2405 refcount_set(&newsk->sk_refcnt, 2);
2406
2407 sk_set_socket(newsk, NULL);
2408 sk_tx_queue_clear(newsk);
2409 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2410
2411 if (newsk->sk_prot->sockets_allocated)
2412 sk_sockets_allocated_inc(newsk);
2413
2414 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2415 net_enable_timestamp();
2416 out:
2417 return newsk;
2418 }
2419 EXPORT_SYMBOL_GPL(sk_clone_lock);
2420
sk_free_unlock_clone(struct sock * sk)2421 void sk_free_unlock_clone(struct sock *sk)
2422 {
2423 /* It is still raw copy of parent, so invalidate
2424 * destructor and make plain sk_free() */
2425 sk->sk_destruct = NULL;
2426 bh_unlock_sock(sk);
2427 sk_free(sk);
2428 }
2429 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2430
sk_dst_gso_max_size(struct sock * sk,struct dst_entry * dst)2431 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2432 {
2433 bool is_ipv6 = false;
2434 u32 max_size;
2435
2436 #if IS_ENABLED(CONFIG_IPV6)
2437 is_ipv6 = (sk->sk_family == AF_INET6 &&
2438 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2439 #endif
2440 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2441 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2442 READ_ONCE(dst->dev->gso_ipv4_max_size);
2443 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2444 max_size = GSO_LEGACY_MAX_SIZE;
2445
2446 return max_size - (MAX_TCP_HEADER + 1);
2447 }
2448
sk_setup_caps(struct sock * sk,struct dst_entry * dst)2449 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2450 {
2451 u32 max_segs = 1;
2452
2453 sk->sk_route_caps = dst->dev->features;
2454 if (sk_is_tcp(sk))
2455 sk->sk_route_caps |= NETIF_F_GSO;
2456 if (sk->sk_route_caps & NETIF_F_GSO)
2457 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2458 if (unlikely(sk->sk_gso_disabled))
2459 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2460 if (sk_can_gso(sk)) {
2461 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2462 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2463 } else {
2464 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2465 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2466 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2467 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2468 }
2469 }
2470 sk->sk_gso_max_segs = max_segs;
2471 sk_dst_set(sk, dst);
2472 }
2473 EXPORT_SYMBOL_GPL(sk_setup_caps);
2474
2475 /*
2476 * Simple resource managers for sockets.
2477 */
2478
2479
2480 /*
2481 * Write buffer destructor automatically called from kfree_skb.
2482 */
sock_wfree(struct sk_buff * skb)2483 void sock_wfree(struct sk_buff *skb)
2484 {
2485 struct sock *sk = skb->sk;
2486 unsigned int len = skb->truesize;
2487 bool free;
2488
2489 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2490 if (sock_flag(sk, SOCK_RCU_FREE) &&
2491 sk->sk_write_space == sock_def_write_space) {
2492 rcu_read_lock();
2493 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2494 sock_def_write_space_wfree(sk);
2495 rcu_read_unlock();
2496 if (unlikely(free))
2497 __sk_free(sk);
2498 return;
2499 }
2500
2501 /*
2502 * Keep a reference on sk_wmem_alloc, this will be released
2503 * after sk_write_space() call
2504 */
2505 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2506 sk->sk_write_space(sk);
2507 len = 1;
2508 }
2509 /*
2510 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2511 * could not do because of in-flight packets
2512 */
2513 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2514 __sk_free(sk);
2515 }
2516 EXPORT_SYMBOL(sock_wfree);
2517
2518 /* This variant of sock_wfree() is used by TCP,
2519 * since it sets SOCK_USE_WRITE_QUEUE.
2520 */
__sock_wfree(struct sk_buff * skb)2521 void __sock_wfree(struct sk_buff *skb)
2522 {
2523 struct sock *sk = skb->sk;
2524
2525 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2526 __sk_free(sk);
2527 }
2528
skb_set_owner_w(struct sk_buff * skb,struct sock * sk)2529 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2530 {
2531 skb_orphan(skb);
2532 skb->sk = sk;
2533 #ifdef CONFIG_INET
2534 if (unlikely(!sk_fullsock(sk))) {
2535 skb->destructor = sock_edemux;
2536 sock_hold(sk);
2537 return;
2538 }
2539 #endif
2540 skb->destructor = sock_wfree;
2541 skb_set_hash_from_sk(skb, sk);
2542 /*
2543 * We used to take a refcount on sk, but following operation
2544 * is enough to guarantee sk_free() wont free this sock until
2545 * all in-flight packets are completed
2546 */
2547 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2548 }
2549 EXPORT_SYMBOL(skb_set_owner_w);
2550
can_skb_orphan_partial(const struct sk_buff * skb)2551 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2552 {
2553 #ifdef CONFIG_TLS_DEVICE
2554 /* Drivers depend on in-order delivery for crypto offload,
2555 * partial orphan breaks out-of-order-OK logic.
2556 */
2557 if (skb->decrypted)
2558 return false;
2559 #endif
2560 return (skb->destructor == sock_wfree ||
2561 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2562 }
2563
2564 /* This helper is used by netem, as it can hold packets in its
2565 * delay queue. We want to allow the owner socket to send more
2566 * packets, as if they were already TX completed by a typical driver.
2567 * But we also want to keep skb->sk set because some packet schedulers
2568 * rely on it (sch_fq for example).
2569 */
skb_orphan_partial(struct sk_buff * skb)2570 void skb_orphan_partial(struct sk_buff *skb)
2571 {
2572 if (skb_is_tcp_pure_ack(skb))
2573 return;
2574
2575 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2576 return;
2577
2578 skb_orphan(skb);
2579 }
2580 EXPORT_SYMBOL(skb_orphan_partial);
2581
2582 /*
2583 * Read buffer destructor automatically called from kfree_skb.
2584 */
sock_rfree(struct sk_buff * skb)2585 void sock_rfree(struct sk_buff *skb)
2586 {
2587 struct sock *sk = skb->sk;
2588 unsigned int len = skb->truesize;
2589
2590 atomic_sub(len, &sk->sk_rmem_alloc);
2591 sk_mem_uncharge(sk, len);
2592 }
2593 EXPORT_SYMBOL(sock_rfree);
2594
2595 /*
2596 * Buffer destructor for skbs that are not used directly in read or write
2597 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2598 */
sock_efree(struct sk_buff * skb)2599 void sock_efree(struct sk_buff *skb)
2600 {
2601 sock_put(skb->sk);
2602 }
2603 EXPORT_SYMBOL(sock_efree);
2604
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The reference is only dropped when sk_is_refcounted() reports that the
 * socket attached to the skb is actually refcounted.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2616
sock_i_uid(struct sock * sk)2617 kuid_t sock_i_uid(struct sock *sk)
2618 {
2619 kuid_t uid;
2620
2621 read_lock_bh(&sk->sk_callback_lock);
2622 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2623 read_unlock_bh(&sk->sk_callback_lock);
2624 return uid;
2625 }
2626 EXPORT_SYMBOL(sock_i_uid);
2627
__sock_i_ino(struct sock * sk)2628 unsigned long __sock_i_ino(struct sock *sk)
2629 {
2630 unsigned long ino;
2631
2632 read_lock(&sk->sk_callback_lock);
2633 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2634 read_unlock(&sk->sk_callback_lock);
2635 return ino;
2636 }
2637 EXPORT_SYMBOL(__sock_i_ino);
2638
/* Wrapper around __sock_i_ino() that runs the lookup with bottom halves
 * disabled. Returns the same value as __sock_i_ino().
 */
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long inode_nr;

	local_bh_disable();
	inode_nr = __sock_i_ino(sk);
	local_bh_enable();

	return inode_nr;
}
EXPORT_SYMBOL(sock_i_ino);
2649
2650 /*
2651 * Allocate a skb from the socket's send buffer.
2652 */
sock_wmalloc(struct sock * sk,unsigned long size,int force,gfp_t priority)2653 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2654 gfp_t priority)
2655 {
2656 if (force ||
2657 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2658 struct sk_buff *skb = alloc_skb(size, priority);
2659
2660 if (skb) {
2661 skb_set_owner_w(skb, sk);
2662 return skb;
2663 }
2664 }
2665 return NULL;
2666 }
2667 EXPORT_SYMBOL(sock_wmalloc);
2668
sock_ofree(struct sk_buff * skb)2669 static void sock_ofree(struct sk_buff *skb)
2670 {
2671 struct sock *sk = skb->sk;
2672
2673 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2674 }
2675
/* Allocate an skb accounted against the socket's option memory.
 *
 * The request is refused when sk_omem_alloc plus the estimated truesize
 * would exceed sysctl_optmem_max. On success the skb's real truesize is
 * added to sk_omem_alloc and sock_ofree() is set as its destructor.
 * Returns the skb, or NULL on refusal or allocation failure.
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb = NULL;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    READ_ONCE(sysctl_optmem_max))
		return NULL;

	skb = alloc_skb(size, priority);
	if (skb) {
		atomic_add(skb->truesize, &sk->sk_omem_alloc);
		skb->sk = sk;
		skb->destructor = sock_ofree;
	}

	return skb;
}
2695
2696 /*
2697 * Allocate a memory block from the socket's option memory buffer.
2698 */
sock_kmalloc(struct sock * sk,int size,gfp_t priority)2699 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2700 {
2701 int optmem_max = READ_ONCE(sysctl_optmem_max);
2702
2703 if ((unsigned int)size <= optmem_max &&
2704 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2705 void *mem;
2706 /* First do the add, to avoid the race if kmalloc
2707 * might sleep.
2708 */
2709 atomic_add(size, &sk->sk_omem_alloc);
2710 mem = kmalloc(size, priority);
2711 if (mem)
2712 return mem;
2713 atomic_sub(size, &sk->sk_omem_alloc);
2714 }
2715 return NULL;
2716 }
2717 EXPORT_SYMBOL(sock_kmalloc);
2718
2719 /* Free an option memory block. Note, we actually want the inline
2720 * here as this allows gcc to detect the nullify and fold away the
2721 * condition entirely.
2722 */
__sock_kfree_s(struct sock * sk,void * mem,int size,const bool nullify)2723 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2724 const bool nullify)
2725 {
2726 if (WARN_ON_ONCE(!mem))
2727 return;
2728 if (nullify)
2729 kfree_sensitive(mem);
2730 else
2731 kfree(mem);
2732 atomic_sub(size, &sk->sk_omem_alloc);
2733 }
2734
sock_kfree_s(struct sock * sk,void * mem,int size)2735 void sock_kfree_s(struct sock *sk, void *mem, int size)
2736 {
2737 __sock_kfree_s(sk, mem, size, false);
2738 }
2739 EXPORT_SYMBOL(sock_kfree_s);
2740
sock_kzfree_s(struct sock * sk,void * mem,int size)2741 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2742 {
2743 __sock_kfree_s(sk, mem, size, true);
2744 }
2745 EXPORT_SYMBOL(sock_kzfree_s);
2746
2747 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2748 I think, these locks should be removed for datagram sockets.
2749 */
sock_wait_for_wmem(struct sock * sk,long timeo)2750 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2751 {
2752 DEFINE_WAIT(wait);
2753
2754 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2755 for (;;) {
2756 if (!timeo)
2757 break;
2758 if (signal_pending(current))
2759 break;
2760 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2761 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2762 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2763 break;
2764 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2765 break;
2766 if (READ_ONCE(sk->sk_err))
2767 break;
2768 timeo = schedule_timeout(timeo);
2769 }
2770 finish_wait(sk_sleep(sk), &wait);
2771 return timeo;
2772 }
2773
2774
2775 /*
2776 * Generic send/receive buffer handlers
2777 */
2778
sock_alloc_send_pskb(struct sock * sk,unsigned long header_len,unsigned long data_len,int noblock,int * errcode,int max_page_order)2779 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2780 unsigned long data_len, int noblock,
2781 int *errcode, int max_page_order)
2782 {
2783 struct sk_buff *skb;
2784 long timeo;
2785 int err;
2786
2787 timeo = sock_sndtimeo(sk, noblock);
2788 for (;;) {
2789 err = sock_error(sk);
2790 if (err != 0)
2791 goto failure;
2792
2793 err = -EPIPE;
2794 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2795 goto failure;
2796
2797 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2798 break;
2799
2800 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2801 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2802 err = -EAGAIN;
2803 if (!timeo)
2804 goto failure;
2805 if (signal_pending(current))
2806 goto interrupted;
2807 timeo = sock_wait_for_wmem(sk, timeo);
2808 }
2809 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2810 errcode, sk->sk_allocation);
2811 if (skb)
2812 skb_set_owner_w(skb, sk);
2813 return skb;
2814
2815 interrupted:
2816 err = sock_intr_errno(timeo);
2817 failure:
2818 *errcode = err;
2819 return NULL;
2820 }
2821 EXPORT_SYMBOL(sock_alloc_send_pskb);
2822
__sock_cmsg_send(struct sock * sk,struct cmsghdr * cmsg,struct sockcm_cookie * sockc)2823 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2824 struct sockcm_cookie *sockc)
2825 {
2826 u32 tsflags;
2827
2828 switch (cmsg->cmsg_type) {
2829 case SO_MARK:
2830 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2831 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2832 return -EPERM;
2833 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2834 return -EINVAL;
2835 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2836 break;
2837 case SO_TIMESTAMPING_OLD:
2838 case SO_TIMESTAMPING_NEW:
2839 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2840 return -EINVAL;
2841
2842 tsflags = *(u32 *)CMSG_DATA(cmsg);
2843 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2844 return -EINVAL;
2845
2846 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2847 sockc->tsflags |= tsflags;
2848 break;
2849 case SCM_TXTIME:
2850 if (!sock_flag(sk, SOCK_TXTIME))
2851 return -EINVAL;
2852 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2853 return -EINVAL;
2854 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2855 break;
2856 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2857 case SCM_RIGHTS:
2858 case SCM_CREDENTIALS:
2859 break;
2860 default:
2861 return -EINVAL;
2862 }
2863 return 0;
2864 }
2865 EXPORT_SYMBOL(__sock_cmsg_send);
2866
sock_cmsg_send(struct sock * sk,struct msghdr * msg,struct sockcm_cookie * sockc)2867 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2868 struct sockcm_cookie *sockc)
2869 {
2870 struct cmsghdr *cmsg;
2871 int ret;
2872
2873 for_each_cmsghdr(cmsg, msg) {
2874 if (!CMSG_OK(msg, cmsg))
2875 return -EINVAL;
2876 if (cmsg->cmsg_level != SOL_SOCKET)
2877 continue;
2878 ret = __sock_cmsg_send(sk, cmsg, sockc);
2879 if (ret)
2880 return ret;
2881 }
2882 return 0;
2883 }
2884 EXPORT_SYMBOL(sock_cmsg_send);
2885
sk_enter_memory_pressure(struct sock * sk)2886 static void sk_enter_memory_pressure(struct sock *sk)
2887 {
2888 if (!sk->sk_prot->enter_memory_pressure)
2889 return;
2890
2891 sk->sk_prot->enter_memory_pressure(sk);
2892 }
2893
sk_leave_memory_pressure(struct sock * sk)2894 static void sk_leave_memory_pressure(struct sock *sk)
2895 {
2896 if (sk->sk_prot->leave_memory_pressure) {
2897 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2898 tcp_leave_memory_pressure, sk);
2899 } else {
2900 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2901
2902 if (memory_pressure && READ_ONCE(*memory_pressure))
2903 WRITE_ONCE(*memory_pressure, 0);
2904 }
2905 }
2906
2907 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2908
2909 /**
2910 * skb_page_frag_refill - check that a page_frag contains enough room
2911 * @sz: minimum size of the fragment we want to get
2912 * @pfrag: pointer to page_frag
2913 * @gfp: priority for memory allocation
2914 *
2915 * Note: While this allocator tries to use high order pages, there is
2916 * no guarantee that allocations succeed. Therefore, @sz MUST be
2917 * less or equal than PAGE_SIZE.
2918 */
skb_page_frag_refill(unsigned int sz,struct page_frag * pfrag,gfp_t gfp)2919 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2920 {
2921 if (pfrag->page) {
2922 if (page_ref_count(pfrag->page) == 1) {
2923 pfrag->offset = 0;
2924 return true;
2925 }
2926 if (pfrag->offset + sz <= pfrag->size)
2927 return true;
2928 put_page(pfrag->page);
2929 }
2930
2931 pfrag->offset = 0;
2932 if (SKB_FRAG_PAGE_ORDER &&
2933 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2934 /* Avoid direct reclaim but allow kswapd to wake */
2935 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2936 __GFP_COMP | __GFP_NOWARN |
2937 __GFP_NORETRY,
2938 SKB_FRAG_PAGE_ORDER);
2939 if (likely(pfrag->page)) {
2940 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2941 return true;
2942 }
2943 }
2944 pfrag->page = alloc_page(gfp);
2945 if (likely(pfrag->page)) {
2946 pfrag->size = PAGE_SIZE;
2947 return true;
2948 }
2949 return false;
2950 }
2951 EXPORT_SYMBOL(skb_page_frag_refill);
2952
sk_page_frag_refill(struct sock * sk,struct page_frag * pfrag)2953 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2954 {
2955 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2956 return true;
2957
2958 sk_enter_memory_pressure(sk);
2959 sk_stream_moderate_sndbuf(sk);
2960 return false;
2961 }
2962 EXPORT_SYMBOL(sk_page_frag_refill);
2963
__lock_sock(struct sock * sk)2964 void __lock_sock(struct sock *sk)
2965 __releases(&sk->sk_lock.slock)
2966 __acquires(&sk->sk_lock.slock)
2967 {
2968 DEFINE_WAIT(wait);
2969
2970 for (;;) {
2971 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2972 TASK_UNINTERRUPTIBLE);
2973 spin_unlock_bh(&sk->sk_lock.slock);
2974 schedule();
2975 spin_lock_bh(&sk->sk_lock.slock);
2976 if (!sock_owned_by_user(sk))
2977 break;
2978 }
2979 finish_wait(&sk->sk_lock.wq, &wait);
2980 }
2981
__release_sock(struct sock * sk)2982 void __release_sock(struct sock *sk)
2983 __releases(&sk->sk_lock.slock)
2984 __acquires(&sk->sk_lock.slock)
2985 {
2986 struct sk_buff *skb, *next;
2987
2988 while ((skb = sk->sk_backlog.head) != NULL) {
2989 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2990
2991 spin_unlock_bh(&sk->sk_lock.slock);
2992
2993 do {
2994 next = skb->next;
2995 prefetch(next);
2996 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2997 skb_mark_not_on_list(skb);
2998 sk_backlog_rcv(sk, skb);
2999
3000 cond_resched();
3001
3002 skb = next;
3003 } while (skb != NULL);
3004
3005 spin_lock_bh(&sk->sk_lock.slock);
3006 }
3007
3008 /*
3009 * Doing the zeroing here guarantee we can not loop forever
3010 * while a wild producer attempts to flood us.
3011 */
3012 sk->sk_backlog.len = 0;
3013 }
3014
__sk_flush_backlog(struct sock * sk)3015 void __sk_flush_backlog(struct sock *sk)
3016 {
3017 spin_lock_bh(&sk->sk_lock.slock);
3018 __release_sock(sk);
3019 spin_unlock_bh(&sk->sk_lock.slock);
3020 }
3021 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3022
3023 /**
3024 * sk_wait_data - wait for data to arrive at sk_receive_queue
3025 * @sk: sock to wait on
3026 * @timeo: for how long
3027 * @skb: last skb seen on sk_receive_queue
3028 *
3029 * Now socket state including sk->sk_err is changed only under lock,
3030 * hence we may omit checks after joining wait queue.
3031 * We check receive queue before schedule() only as optimization;
3032 * it is very likely that release_sock() added new data.
3033 */
sk_wait_data(struct sock * sk,long * timeo,const struct sk_buff * skb)3034 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3035 {
3036 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3037 int rc;
3038
3039 add_wait_queue(sk_sleep(sk), &wait);
3040 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3041 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3042 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3043 remove_wait_queue(sk_sleep(sk), &wait);
3044 return rc;
3045 }
3046 EXPORT_SYMBOL(sk_wait_data);
3047
3048 /**
3049 * __sk_mem_raise_allocated - increase memory_allocated
3050 * @sk: socket
3051 * @size: memory size to allocate
3052 * @amt: pages to allocate
3053 * @kind: allocation type
3054 *
3055 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3056 */
__sk_mem_raise_allocated(struct sock * sk,int size,int amt,int kind)3057 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3058 {
3059 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
3060 struct proto *prot = sk->sk_prot;
3061 bool charged = true;
3062 long allocated;
3063
3064 sk_memory_allocated_add(sk, amt);
3065 allocated = sk_memory_allocated(sk);
3066 if (memcg_charge &&
3067 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3068 gfp_memcg_charge())))
3069 goto suppress_allocation;
3070
3071 /* Under limit. */
3072 if (allocated <= sk_prot_mem_limits(sk, 0)) {
3073 sk_leave_memory_pressure(sk);
3074 return 1;
3075 }
3076
3077 /* Under pressure. */
3078 if (allocated > sk_prot_mem_limits(sk, 1))
3079 sk_enter_memory_pressure(sk);
3080
3081 /* Over hard limit. */
3082 if (allocated > sk_prot_mem_limits(sk, 2))
3083 goto suppress_allocation;
3084
3085 /* guarantee minimum buffer size under pressure */
3086 if (kind == SK_MEM_RECV) {
3087 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3088 return 1;
3089
3090 } else { /* SK_MEM_SEND */
3091 int wmem0 = sk_get_wmem0(sk, prot);
3092
3093 if (sk->sk_type == SOCK_STREAM) {
3094 if (sk->sk_wmem_queued < wmem0)
3095 return 1;
3096 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3097 return 1;
3098 }
3099 }
3100
3101 if (sk_has_memory_pressure(sk)) {
3102 u64 alloc;
3103
3104 if (!sk_under_memory_pressure(sk))
3105 return 1;
3106 alloc = sk_sockets_allocated_read_positive(sk);
3107 if (sk_prot_mem_limits(sk, 2) > alloc *
3108 sk_mem_pages(sk->sk_wmem_queued +
3109 atomic_read(&sk->sk_rmem_alloc) +
3110 sk->sk_forward_alloc))
3111 return 1;
3112 }
3113
3114 suppress_allocation:
3115
3116 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3117 sk_stream_moderate_sndbuf(sk);
3118
3119 /* Fail only if socket is _under_ its sndbuf.
3120 * In this case we cannot block, so that we have to fail.
3121 */
3122 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3123 /* Force charge with __GFP_NOFAIL */
3124 if (memcg_charge && !charged) {
3125 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3126 gfp_memcg_charge() | __GFP_NOFAIL);
3127 }
3128 return 1;
3129 }
3130 }
3131
3132 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3133 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3134
3135 sk_memory_allocated_sub(sk, amt);
3136
3137 if (memcg_charge && charged)
3138 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
3139
3140 return 0;
3141 }
3142
3143 /**
3144 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3145 * @sk: socket
3146 * @size: memory size to allocate
3147 * @kind: allocation type
3148 *
3149 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3150 * rmem allocation. This function assumes that protocols which have
3151 * memory_pressure use sk_wmem_queued as write buffer accounting.
3152 */
__sk_mem_schedule(struct sock * sk,int size,int kind)3153 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3154 {
3155 int ret, amt = sk_mem_pages(size);
3156
3157 sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3158 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3159 if (!ret)
3160 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3161 return ret;
3162 }
3163 EXPORT_SYMBOL(__sk_mem_schedule);
3164
3165 /**
3166 * __sk_mem_reduce_allocated - reclaim memory_allocated
3167 * @sk: socket
3168 * @amount: number of quanta
3169 *
3170 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3171 */
__sk_mem_reduce_allocated(struct sock * sk,int amount)3172 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3173 {
3174 sk_memory_allocated_sub(sk, amount);
3175
3176 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3177 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3178
3179 if (sk_under_global_memory_pressure(sk) &&
3180 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3181 sk_leave_memory_pressure(sk);
3182 }
3183
3184 /**
3185 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3186 * @sk: socket
3187 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3188 */
__sk_mem_reclaim(struct sock * sk,int amount)3189 void __sk_mem_reclaim(struct sock *sk, int amount)
3190 {
3191 amount >>= PAGE_SHIFT;
3192 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3193 __sk_mem_reduce_allocated(sk, amount);
3194 }
3195 EXPORT_SYMBOL(__sk_mem_reclaim);
3196
sk_set_peek_off(struct sock * sk,int val)3197 int sk_set_peek_off(struct sock *sk, int val)
3198 {
3199 WRITE_ONCE(sk->sk_peek_off, val);
3200 return 0;
3201 }
3202 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3203
3204 /*
3205 * Set of default routines for initialising struct proto_ops when
3206 * the protocol does not support a particular function. In certain
3207 * cases where it makes no sense for a protocol to have a "do nothing"
3208 * function, some default processing is provided.
3209 */
3210
sock_no_bind(struct socket * sock,struct sockaddr * saddr,int len)3211 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3212 {
3213 return -EOPNOTSUPP;
3214 }
3215 EXPORT_SYMBOL(sock_no_bind);
3216
sock_no_connect(struct socket * sock,struct sockaddr * saddr,int len,int flags)3217 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3218 int len, int flags)
3219 {
3220 return -EOPNOTSUPP;
3221 }
3222 EXPORT_SYMBOL(sock_no_connect);
3223
sock_no_socketpair(struct socket * sock1,struct socket * sock2)3224 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3225 {
3226 return -EOPNOTSUPP;
3227 }
3228 EXPORT_SYMBOL(sock_no_socketpair);
3229
/* Default accept handler for protocols without accept support: ignores
 * its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock,
		   struct socket *newsock,
		   int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
3236
sock_no_getname(struct socket * sock,struct sockaddr * saddr,int peer)3237 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3238 int peer)
3239 {
3240 return -EOPNOTSUPP;
3241 }
3242 EXPORT_SYMBOL(sock_no_getname);
3243
sock_no_ioctl(struct socket * sock,unsigned int cmd,unsigned long arg)3244 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3245 {
3246 return -EOPNOTSUPP;
3247 }
3248 EXPORT_SYMBOL(sock_no_ioctl);
3249
sock_no_listen(struct socket * sock,int backlog)3250 int sock_no_listen(struct socket *sock, int backlog)
3251 {
3252 return -EOPNOTSUPP;
3253 }
3254 EXPORT_SYMBOL(sock_no_listen);
3255
sock_no_shutdown(struct socket * sock,int how)3256 int sock_no_shutdown(struct socket *sock, int how)
3257 {
3258 return -EOPNOTSUPP;
3259 }
3260 EXPORT_SYMBOL(sock_no_shutdown);
3261
/* Default sendmsg handler for protocols without sendmsg support: ignores
 * its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock,
		    struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
3267
/* Counterpart of sock_no_sendmsg() that takes a struct sock instead of a
 * struct socket: ignores its arguments and always returns -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk,
			   struct msghdr *m,
			   size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
3273
/*
 * Placeholder recvmsg operation: @m is left untouched.
 *
 * Return: always -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
3279 EXPORT_SYMBOL(sock_no_recvmsg);
3280
/*
 * Placeholder mmap operation: no argument is inspected.
 *
 * Return: always -ENODEV (not -EOPNOTSUPP like the other sock_no_*()
 * stubs), mirroring the error code used when an mmap method is missing.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
3286 EXPORT_SYMBOL(sock_no_mmap);
3287
3288 /*
3289 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3290 * various sock-based usage counts.
3291 */
__receive_sock(struct file * file)3292 void __receive_sock(struct file *file)
3293 {
3294 struct socket *sock;
3295
3296 sock = sock_from_file(file);
3297 if (sock) {
3298 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3299 sock_update_classid(&sock->sk->sk_cgrp_data);
3300 }
3301 }
3302
3303 /*
3304 * Default Socket Callbacks
3305 */
3306
sock_def_wakeup(struct sock * sk)3307 static void sock_def_wakeup(struct sock *sk)
3308 {
3309 struct socket_wq *wq;
3310
3311 rcu_read_lock();
3312 wq = rcu_dereference(sk->sk_wq);
3313 if (skwq_has_sleeper(wq))
3314 wake_up_interruptible_all(&wq->wait);
3315 rcu_read_unlock();
3316 }
3317
sock_def_error_report(struct sock * sk)3318 static void sock_def_error_report(struct sock *sk)
3319 {
3320 struct socket_wq *wq;
3321
3322 rcu_read_lock();
3323 wq = rcu_dereference(sk->sk_wq);
3324 if (skwq_has_sleeper(wq))
3325 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3326 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3327 rcu_read_unlock();
3328 }
3329
sock_def_readable(struct sock * sk)3330 void sock_def_readable(struct sock *sk)
3331 {
3332 struct socket_wq *wq;
3333
3334 trace_sk_data_ready(sk);
3335
3336 rcu_read_lock();
3337 wq = rcu_dereference(sk->sk_wq);
3338
3339 if (skwq_has_sleeper(wq)) {
3340 int done = 0;
3341
3342 trace_android_vh_do_wake_up_sync(&wq->wait, &done, sk);
3343 if (done)
3344 goto out;
3345
3346 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3347 EPOLLRDNORM | EPOLLRDBAND);
3348 }
3349
3350 out:
3351 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3352 rcu_read_unlock();
3353 }
3354
sock_def_write_space(struct sock * sk)3355 static void sock_def_write_space(struct sock *sk)
3356 {
3357 struct socket_wq *wq;
3358
3359 rcu_read_lock();
3360
3361 /* Do not wake up a writer until he can make "significant"
3362 * progress. --DaveM
3363 */
3364 if (sock_writeable(sk)) {
3365 wq = rcu_dereference(sk->sk_wq);
3366 if (skwq_has_sleeper(wq))
3367 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3368 EPOLLWRNORM | EPOLLWRBAND);
3369
3370 /* Should agree with poll, otherwise some programs break */
3371 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3372 }
3373
3374 rcu_read_unlock();
3375 }
3376
3377 /* An optimised version of sock_def_write_space(), should only be called
3378 * for SOCK_RCU_FREE sockets under RCU read section and after putting
3379 * ->sk_wmem_alloc.
3380 */
sock_def_write_space_wfree(struct sock * sk)3381 static void sock_def_write_space_wfree(struct sock *sk)
3382 {
3383 /* Do not wake up a writer until he can make "significant"
3384 * progress. --DaveM
3385 */
3386 if (sock_writeable(sk)) {
3387 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3388
3389 /* rely on refcount_sub from sock_wfree() */
3390 smp_mb__after_atomic();
3391 if (wq && waitqueue_active(&wq->wait))
3392 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3393 EPOLLWRNORM | EPOLLWRBAND);
3394
3395 /* Should agree with poll, otherwise some programs break */
3396 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3397 }
3398 }
3399
/*
 * Default ->sk_destruct callback (installed by sock_init_data_uid()).
 * Intentionally empty: there is nothing to tear down by default.
 */
static void sock_def_destruct(struct sock *sk)
{
}
3403
sk_send_sigurg(struct sock * sk)3404 void sk_send_sigurg(struct sock *sk)
3405 {
3406 if (sk->sk_socket && sk->sk_socket->file)
3407 if (send_sigurg(&sk->sk_socket->file->f_owner))
3408 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3409 }
3410 EXPORT_SYMBOL(sk_send_sigurg);
3411
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken with
 * sock_hold() only when mod_timer() returns 0.
 * NOTE(review): a 0 return presumably means the timer was not pending
 * before the call - confirm against the timer API documentation.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
3418 EXPORT_SYMBOL(sk_reset_timer);
3419
/*
 * Cancel @timer with del_timer().  When del_timer() returns non-zero the
 * socket reference is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
3425 EXPORT_SYMBOL(sk_stop_timer);
3426
/*
 * Variant of sk_stop_timer() built on del_timer_sync().  When
 * del_timer_sync() returns non-zero the socket reference is dropped with
 * __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer_sync(timer))
		return;

	__sock_put(sk);
}
3432 EXPORT_SYMBOL(sk_stop_timer_sync);
3433
/*
 * Initialise the generic fields of @sk and, when @sock is non-NULL, tie
 * the two objects together.
 *
 * @sock: socket to attach, may be NULL
 * @sk:   sock to initialise
 * @uid:  value stored in sk->sk_uid
 *
 * The default callbacks (sock_def_*) are installed, the timeouts are set
 * to MAX_SCHEDULE_TIMEOUT, and the reference count is set to 1 at the
 * very end, behind a write barrier.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
	sk_init_common(sk);
	sk->sk_send_head = NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	/* Buffer sizes start from the current sysctl defaults. */
	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
	sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
	sk->sk_state = TCP_CLOSE;
	sk->sk_use_task_frag = true;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	/* Without a socket there is no wait queue to point at. */
	if (!sock) {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
	} else {
		sk->sk_type = sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk = sk;
	}
	sk->sk_uid = uid;

	/* Kernel and non-kernel sockets get distinct lockdep classes,
	 * both indexed by address family.
	 */
	rwlock_init(&sk->sk_callback_lock);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(&sk->sk_callback_lock,
			&af_kern_callback_keys[sk->sk_family],
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(&sk->sk_callback_lock,
			&af_callback_keys[sk->sk_family],
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks. */
	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	spin_lock_init(&sk->sk_peer_lock);

	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);

	/*
	 * All stores above must be committed to memory before sk_refcnt
	 * is updated (see Documentation/RCU/rculist_nulls.rst).
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
3515 EXPORT_SYMBOL(sock_init_data_uid);
3516
sock_init_data(struct socket * sock,struct sock * sk)3517 void sock_init_data(struct socket *sock, struct sock *sk)
3518 {
3519 kuid_t uid = sock ?
3520 SOCK_INODE(sock)->i_uid :
3521 make_kuid(sock_net(sk)->user_ns, 0);
3522
3523 sock_init_data_uid(sock, sk, uid);
3524 }
3525 EXPORT_SYMBOL(sock_init_data);
3526
lock_sock_nested(struct sock * sk,int subclass)3527 void lock_sock_nested(struct sock *sk, int subclass)
3528 {
3529 /* The sk_lock has mutex_lock() semantics here. */
3530 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3531
3532 might_sleep();
3533 spin_lock_bh(&sk->sk_lock.slock);
3534 if (sock_owned_by_user_nocheck(sk))
3535 __lock_sock(sk);
3536 sk->sk_lock.owned = 1;
3537 spin_unlock_bh(&sk->sk_lock.slock);
3538 }
3539 EXPORT_SYMBOL(lock_sock_nested);
3540
release_sock(struct sock * sk)3541 void release_sock(struct sock *sk)
3542 {
3543 spin_lock_bh(&sk->sk_lock.slock);
3544 if (sk->sk_backlog.tail)
3545 __release_sock(sk);
3546
3547 /* Warning : release_cb() might need to release sk ownership,
3548 * ie call sock_release_ownership(sk) before us.
3549 */
3550 if (sk->sk_prot->release_cb)
3551 sk->sk_prot->release_cb(sk);
3552
3553 sock_release_ownership(sk);
3554 if (waitqueue_active(&sk->sk_lock.wq))
3555 wake_up(&sk->sk_lock.wq);
3556 spin_unlock_bh(&sk->sk_lock.slock);
3557 }
3558 EXPORT_SYMBOL(release_sock);
3559
__lock_sock_fast(struct sock * sk)3560 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3561 {
3562 might_sleep();
3563 spin_lock_bh(&sk->sk_lock.slock);
3564
3565 if (!sock_owned_by_user_nocheck(sk)) {
3566 /*
3567 * Fast path return with bottom halves disabled and
3568 * sock::sk_lock.slock held.
3569 *
3570 * The 'mutex' is not contended and holding
3571 * sock::sk_lock.slock prevents all other lockers to
3572 * proceed so the corresponding unlock_sock_fast() can
3573 * avoid the slow path of release_sock() completely and
3574 * just release slock.
3575 *
3576 * From a semantical POV this is equivalent to 'acquiring'
3577 * the 'mutex', hence the corresponding lockdep
3578 * mutex_release() has to happen in the fast path of
3579 * unlock_sock_fast().
3580 */
3581 return false;
3582 }
3583
3584 __lock_sock(sk);
3585 sk->sk_lock.owned = 1;
3586 __acquire(&sk->sk_lock.slock);
3587 spin_unlock_bh(&sk->sk_lock.slock);
3588 return true;
3589 }
3590 EXPORT_SYMBOL(__lock_sock_fast);
3591
/*
 * Copy the socket's timestamp to user space.
 *
 * @sock:      socket to query; SOCK_TIMESTAMP gets enabled on its sock
 * @userstamp: user buffer receiving the result
 * @timeval:   when true, the sub-second part is given in microseconds
 *             (nanoseconds / 1000) instead of nanoseconds
 * @time32:    when true and CONFIG_COMPAT_32BIT_TIME is set, the value is
 *             written with put_old_timespec32()
 *
 * A stored timestamp whose seconds are 0 is replaced by the current real
 * time, which is also written back to the sock.
 *
 * Return: -ENOENT when the stored seconds are -1, otherwise the result of
 * the copy to user space (-EFAULT on the sparc64 path when it fails).
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));

	if (ts.tv_sec == -1)
		return -ENOENT;

	if (ts.tv_sec == 0) {
		ktime_t now = ktime_get_real();

		sock_write_timestamp(sk, now);
		ts = ktime_to_timespec64(now);
	}

	if (timeval)
		ts.tv_nsec = ts.tv_nsec / 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};

		return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
3629 EXPORT_SYMBOL(sock_gettstamp);
3630
sock_enable_timestamp(struct sock * sk,enum sock_flags flag)3631 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3632 {
3633 if (!sock_flag(sk, flag)) {
3634 unsigned long previous_flags = sk->sk_flags;
3635
3636 sock_set_flag(sk, flag);
3637 /*
3638 * we just set one of the two flags which require net
3639 * time stamping, but time stamping might have been on
3640 * already because of the other one
3641 */
3642 if (sock_needs_netstamp(sk) &&
3643 !(previous_flags & SK_FLAGS_TIMESTAMP))
3644 net_enable_timestamp();
3645 }
3646 }
3647
sock_recv_errqueue(struct sock * sk,struct msghdr * msg,int len,int level,int type)3648 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3649 int level, int type)
3650 {
3651 struct sock_exterr_skb *serr;
3652 struct sk_buff *skb;
3653 int copied, err;
3654
3655 err = -EAGAIN;
3656 skb = sock_dequeue_err_skb(sk);
3657 if (skb == NULL)
3658 goto out;
3659
3660 copied = skb->len;
3661 if (copied > len) {
3662 msg->msg_flags |= MSG_TRUNC;
3663 copied = len;
3664 }
3665 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3666 if (err)
3667 goto out_free_skb;
3668
3669 sock_recv_timestamp(msg, sk, skb);
3670
3671 serr = SKB_EXT_ERR(skb);
3672 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3673
3674 msg->msg_flags |= MSG_ERRQUEUE;
3675 err = copied;
3676
3677 out_free_skb:
3678 kfree_skb(skb);
3679 out:
3680 return err;
3681 }
3682 EXPORT_SYMBOL(sock_recv_errqueue);
3683
3684 /*
3685 * Get a socket option on an socket.
3686 *
3687 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3688 * asynchronous errors should be reported by getsockopt. We assume
3689 * this means if you specify SO_ERROR (otherwise whats the point of it).
3690 */
sock_common_getsockopt(struct socket * sock,int level,int optname,char __user * optval,int __user * optlen)3691 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3692 char __user *optval, int __user *optlen)
3693 {
3694 struct sock *sk = sock->sk;
3695
3696 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3697 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3698 }
3699 EXPORT_SYMBOL(sock_common_getsockopt);
3700
/*
 * Forward a receive request to the protocol's ->recvmsg() handler.  When
 * the handler does not fail (result >= 0), the address length it
 * reported is stored in msg->msg_namelen.
 *
 * Return: whatever the protocol handler returned.
 */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int namelen = 0;
	int ret;

	ret = sk->sk_prot->recvmsg(sk, msg, size, flags, &namelen);
	if (ret < 0)
		return ret;

	msg->msg_namelen = namelen;
	return ret;
}
3713 EXPORT_SYMBOL(sock_common_recvmsg);
3714
3715 /*
3716 * Set socket options on an inet socket.
3717 */
sock_common_setsockopt(struct socket * sock,int level,int optname,sockptr_t optval,unsigned int optlen)3718 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3719 sockptr_t optval, unsigned int optlen)
3720 {
3721 struct sock *sk = sock->sk;
3722
3723 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3724 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3725 }
3726 EXPORT_SYMBOL(sock_common_setsockopt);
3727
sk_common_release(struct sock * sk)3728 void sk_common_release(struct sock *sk)
3729 {
3730 if (sk->sk_prot->destroy)
3731 sk->sk_prot->destroy(sk);
3732
3733 /*
3734 * Observation: when sk_common_release is called, processes have
3735 * no access to socket. But net still has.
3736 * Step one, detach it from networking:
3737 *
3738 * A. Remove from hash tables.
3739 */
3740
3741 sk->sk_prot->unhash(sk);
3742
3743 if (sk->sk_socket)
3744 sk->sk_socket->sk = NULL;
3745
3746 /*
3747 * In this point socket cannot receive new packets, but it is possible
3748 * that some packets are in flight because some CPU runs receiver and
3749 * did hash table lookup before we unhashed socket. They will achieve
3750 * receive queue and will be purged by socket destructor.
3751 *
3752 * Also we still have packets pending on receive queue and probably,
3753 * our own packets waiting in device queues. sock_destroy will drain
3754 * receive queue, but transmitted packets will delay socket destruction
3755 * until the last reference will be released.
3756 */
3757
3758 sock_orphan(sk);
3759
3760 xfrm_sk_free_policy(sk);
3761
3762 sock_put(sk);
3763 }
3764 EXPORT_SYMBOL(sk_common_release);
3765
/*
 * Fill @mem, an array of SK_MEMINFO_VARS u32 slots, with a snapshot of
 * the memory accounting counters of @sk.  The array is zeroed first, so
 * any slot not written below reads as 0.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, SK_MEMINFO_VARS * sizeof(mem[0]));

	/* Receive side. */
	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);

	/* Send side. */
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);

	/* Remaining counters. */
	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
3780
3781 #ifdef CONFIG_PROC_FS
3782 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3783
sock_prot_inuse_get(struct net * net,struct proto * prot)3784 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3785 {
3786 int cpu, idx = prot->inuse_idx;
3787 int res = 0;
3788
3789 for_each_possible_cpu(cpu)
3790 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3791
3792 return res >= 0 ? res : 0;
3793 }
3794 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3795
sock_inuse_get(struct net * net)3796 int sock_inuse_get(struct net *net)
3797 {
3798 int cpu, res = 0;
3799
3800 for_each_possible_cpu(cpu)
3801 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3802
3803 return res;
3804 }
3805
3806 EXPORT_SYMBOL_GPL(sock_inuse_get);
3807
sock_inuse_init_net(struct net * net)3808 static int __net_init sock_inuse_init_net(struct net *net)
3809 {
3810 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3811 if (net->core.prot_inuse == NULL)
3812 return -ENOMEM;
3813 return 0;
3814 }
3815
sock_inuse_exit_net(struct net * net)3816 static void __net_exit sock_inuse_exit_net(struct net *net)
3817 {
3818 free_percpu(net->core.prot_inuse);
3819 }
3820
3821 static struct pernet_operations net_inuse_ops = {
3822 .init = sock_inuse_init_net,
3823 .exit = sock_inuse_exit_net,
3824 };
3825
net_inuse_init(void)3826 static __init int net_inuse_init(void)
3827 {
3828 if (register_pernet_subsys(&net_inuse_ops))
3829 panic("Cannot initialize net inuse counters");
3830
3831 return 0;
3832 }
3833
3834 core_initcall(net_inuse_init);
3835
assign_proto_idx(struct proto * prot)3836 static int assign_proto_idx(struct proto *prot)
3837 {
3838 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3839
3840 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3841 pr_err("PROTO_INUSE_NR exhausted\n");
3842 return -ENOSPC;
3843 }
3844
3845 set_bit(prot->inuse_idx, proto_inuse_idx);
3846 return 0;
3847 }
3848
release_proto_idx(struct proto * prot)3849 static void release_proto_idx(struct proto *prot)
3850 {
3851 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3852 clear_bit(prot->inuse_idx, proto_inuse_idx);
3853 }
3854 #else
/* !CONFIG_PROC_FS stub: there is no index to assign; always returns 0. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3859
/* !CONFIG_PROC_FS stub: there is no index to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3863
3864 #endif
3865
tw_prot_cleanup(struct timewait_sock_ops * twsk_prot)3866 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3867 {
3868 if (!twsk_prot)
3869 return;
3870 kfree(twsk_prot->twsk_slab_name);
3871 twsk_prot->twsk_slab_name = NULL;
3872 kmem_cache_destroy(twsk_prot->twsk_slab);
3873 twsk_prot->twsk_slab = NULL;
3874 }
3875
tw_prot_init(const struct proto * prot)3876 static int tw_prot_init(const struct proto *prot)
3877 {
3878 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3879
3880 if (!twsk_prot)
3881 return 0;
3882
3883 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3884 prot->name);
3885 if (!twsk_prot->twsk_slab_name)
3886 return -ENOMEM;
3887
3888 twsk_prot->twsk_slab =
3889 kmem_cache_create(twsk_prot->twsk_slab_name,
3890 twsk_prot->twsk_obj_size, 0,
3891 SLAB_ACCOUNT | prot->slab_flags,
3892 NULL);
3893 if (!twsk_prot->twsk_slab) {
3894 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3895 prot->name);
3896 return -ENOMEM;
3897 }
3898
3899 return 0;
3900 }
3901
req_prot_cleanup(struct request_sock_ops * rsk_prot)3902 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3903 {
3904 if (!rsk_prot)
3905 return;
3906 kfree(rsk_prot->slab_name);
3907 rsk_prot->slab_name = NULL;
3908 kmem_cache_destroy(rsk_prot->slab);
3909 rsk_prot->slab = NULL;
3910 }
3911
req_prot_init(const struct proto * prot)3912 static int req_prot_init(const struct proto *prot)
3913 {
3914 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3915
3916 if (!rsk_prot)
3917 return 0;
3918
3919 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3920 prot->name);
3921 if (!rsk_prot->slab_name)
3922 return -ENOMEM;
3923
3924 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3925 rsk_prot->obj_size, 0,
3926 SLAB_ACCOUNT | prot->slab_flags,
3927 NULL);
3928
3929 if (!rsk_prot->slab) {
3930 pr_crit("%s: Can't create request sock SLAB cache!\n",
3931 prot->name);
3932 return -ENOMEM;
3933 }
3934 return 0;
3935 }
3936
proto_register(struct proto * prot,int alloc_slab)3937 int proto_register(struct proto *prot, int alloc_slab)
3938 {
3939 int ret = -ENOBUFS;
3940
3941 if (prot->memory_allocated && !prot->sysctl_mem) {
3942 pr_err("%s: missing sysctl_mem\n", prot->name);
3943 return -EINVAL;
3944 }
3945 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3946 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3947 return -EINVAL;
3948 }
3949 if (alloc_slab) {
3950 prot->slab = kmem_cache_create_usercopy(prot->name,
3951 prot->obj_size, 0,
3952 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3953 prot->slab_flags,
3954 prot->useroffset, prot->usersize,
3955 NULL);
3956
3957 if (prot->slab == NULL) {
3958 pr_crit("%s: Can't create sock SLAB cache!\n",
3959 prot->name);
3960 goto out;
3961 }
3962
3963 if (req_prot_init(prot))
3964 goto out_free_request_sock_slab;
3965
3966 if (tw_prot_init(prot))
3967 goto out_free_timewait_sock_slab;
3968 }
3969
3970 mutex_lock(&proto_list_mutex);
3971 ret = assign_proto_idx(prot);
3972 if (ret) {
3973 mutex_unlock(&proto_list_mutex);
3974 goto out_free_timewait_sock_slab;
3975 }
3976 list_add(&prot->node, &proto_list);
3977 mutex_unlock(&proto_list_mutex);
3978 return ret;
3979
3980 out_free_timewait_sock_slab:
3981 if (alloc_slab)
3982 tw_prot_cleanup(prot->twsk_prot);
3983 out_free_request_sock_slab:
3984 if (alloc_slab) {
3985 req_prot_cleanup(prot->rsk_prot);
3986
3987 kmem_cache_destroy(prot->slab);
3988 prot->slab = NULL;
3989 }
3990 out:
3991 return ret;
3992 }
3993 EXPORT_SYMBOL(proto_register);
3994
proto_unregister(struct proto * prot)3995 void proto_unregister(struct proto *prot)
3996 {
3997 mutex_lock(&proto_list_mutex);
3998 release_proto_idx(prot);
3999 list_del(&prot->node);
4000 mutex_unlock(&proto_list_mutex);
4001
4002 kmem_cache_destroy(prot->slab);
4003 prot->slab = NULL;
4004
4005 req_prot_cleanup(prot->rsk_prot);
4006 tw_prot_cleanup(prot->twsk_prot);
4007 }
4008 EXPORT_SYMBOL(proto_unregister);
4009
sock_load_diag_module(int family,int protocol)4010 int sock_load_diag_module(int family, int protocol)
4011 {
4012 if (!protocol) {
4013 if (!sock_is_registered(family))
4014 return -ENOENT;
4015
4016 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4017 NETLINK_SOCK_DIAG, family);
4018 }
4019
4020 #ifdef CONFIG_INET
4021 if (family == AF_INET &&
4022 protocol != IPPROTO_RAW &&
4023 protocol < MAX_INET_PROTOS &&
4024 !rcu_access_pointer(inet_protos[protocol]))
4025 return -ENOENT;
4026 #endif
4027
4028 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4029 NETLINK_SOCK_DIAG, family, protocol);
4030 }
4031 EXPORT_SYMBOL(sock_load_diag_module);
4032
4033 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at
 * *@pos within proto_list, the list head itself counting as the first
 * element.  The mutex is dropped again in proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	void *first;

	mutex_lock(&proto_list_mutex);
	first = seq_list_start_head(&proto_list, *pos);

	return first;
}
4040
/* seq_file ->next: advance from @v to the next proto_list element. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *next = seq_list_next(v, &proto_list, pos);

	return next;
}
4045
proto_seq_stop(struct seq_file * seq,void * v)4046 static void proto_seq_stop(struct seq_file *seq, void *v)
4047 __releases(proto_list_mutex)
4048 {
4049 mutex_unlock(&proto_list_mutex);
4050 }
4051
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
sock_prot_memory_allocated(struct proto * proto)4056 static long sock_prot_memory_allocated(struct proto *proto)
4057 {
4058 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4059 }
4060
sock_prot_memory_pressure(struct proto * proto)4061 static const char *sock_prot_memory_pressure(struct proto *proto)
4062 {
4063 return proto->memory_pressure != NULL ?
4064 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4065 }
4066
proto_seq_printf(struct seq_file * seq,struct proto * proto)4067 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4068 {
4069
4070 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4071 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4072 proto->name,
4073 proto->obj_size,
4074 sock_prot_inuse_get(seq_file_net(seq), proto),
4075 sock_prot_memory_allocated(proto),
4076 sock_prot_memory_pressure(proto),
4077 proto->max_header,
4078 proto->slab == NULL ? "no" : "yes",
4079 module_name(proto->owner),
4080 proto_method_implemented(proto->close),
4081 proto_method_implemented(proto->connect),
4082 proto_method_implemented(proto->disconnect),
4083 proto_method_implemented(proto->accept),
4084 proto_method_implemented(proto->ioctl),
4085 proto_method_implemented(proto->init),
4086 proto_method_implemented(proto->destroy),
4087 proto_method_implemented(proto->shutdown),
4088 proto_method_implemented(proto->setsockopt),
4089 proto_method_implemented(proto->getsockopt),
4090 proto_method_implemented(proto->sendmsg),
4091 proto_method_implemented(proto->recvmsg),
4092 proto_method_implemented(proto->bind),
4093 proto_method_implemented(proto->backlog_rcv),
4094 proto_method_implemented(proto->hash),
4095 proto_method_implemented(proto->unhash),
4096 proto_method_implemented(proto->get_port),
4097 proto_method_implemented(proto->enter_memory_pressure));
4098 }
4099
proto_seq_show(struct seq_file * seq,void * v)4100 static int proto_seq_show(struct seq_file *seq, void *v)
4101 {
4102 if (v == &proto_list)
4103 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4104 "protocol",
4105 "size",
4106 "sockets",
4107 "memory",
4108 "press",
4109 "maxhdr",
4110 "slab",
4111 "module",
4112 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4113 else
4114 proto_seq_printf(seq, list_entry(v, struct proto, node));
4115 return 0;
4116 }
4117
4118 static const struct seq_operations proto_seq_ops = {
4119 .start = proto_seq_start,
4120 .next = proto_seq_next,
4121 .stop = proto_seq_stop,
4122 .show = proto_seq_show,
4123 };
4124
proto_init_net(struct net * net)4125 static __net_init int proto_init_net(struct net *net)
4126 {
4127 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4128 sizeof(struct seq_net_private)))
4129 return -ENOMEM;
4130
4131 return 0;
4132 }
4133
proto_exit_net(struct net * net)4134 static __net_exit void proto_exit_net(struct net *net)
4135 {
4136 remove_proc_entry("protocols", net->proc_net);
4137 }
4138
4139
4140 static __net_initdata struct pernet_operations proto_net_ops = {
4141 .init = proto_init_net,
4142 .exit = proto_exit_net,
4143 };
4144
proto_init(void)4145 static int __init proto_init(void)
4146 {
4147 return register_pernet_subsys(&proto_net_ops);
4148 }
4149
4150 subsys_initcall(proto_init);
4151
4152 #endif /* PROC_FS */
4153
4154 #ifdef CONFIG_NET_RX_BUSY_POLL
sk_busy_loop_end(void * p,unsigned long start_time)4155 bool sk_busy_loop_end(void *p, unsigned long start_time)
4156 {
4157 struct sock *sk = p;
4158
4159 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4160 return true;
4161
4162 if (sk_is_udp(sk) &&
4163 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4164 return true;
4165
4166 return sk_busy_loop_timeout(sk, start_time);
4167 }
4168 EXPORT_SYMBOL(sk_busy_loop_end);
4169 #endif /* CONFIG_NET_RX_BUSY_POLL */
4170
sock_bind_add(struct sock * sk,struct sockaddr * addr,int addr_len)4171 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4172 {
4173 if (!sk->sk_prot->bind_add)
4174 return -EOPNOTSUPP;
4175 return sk->sk_prot->bind_add(sk, addr, addr_len);
4176 }
4177 EXPORT_SYMBOL(sock_bind_add);
4178
4179 /* Copy 'size' bytes from userspace and return `size` back to userspace */
sock_ioctl_inout(struct sock * sk,unsigned int cmd,void __user * arg,void * karg,size_t size)4180 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4181 void __user *arg, void *karg, size_t size)
4182 {
4183 int ret;
4184
4185 if (copy_from_user(karg, arg, size))
4186 return -EFAULT;
4187
4188 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4189 if (ret)
4190 return ret;
4191
4192 if (copy_to_user(arg, karg, size))
4193 return -EFAULT;
4194
4195 return 0;
4196 }
4197 EXPORT_SYMBOL(sock_ioctl_inout);
4198
4199 /* This is the most common ioctl prep function, where the result (4 bytes) is
4200 * copied back to userspace if the ioctl() returns successfully. No input is
4201 * copied from userspace as input argument.
4202 */
sock_ioctl_out(struct sock * sk,unsigned int cmd,void __user * arg)4203 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4204 {
4205 int ret, karg = 0;
4206
4207 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4208 if (ret)
4209 return ret;
4210
4211 return put_user(karg, (int __user *)arg);
4212 }
4213
4214 /* A wrapper around sock ioctls, which copies the data from userspace
4215 * (depending on the protocol/ioctl), and copies back the result to userspace.
4216 * The main motivation for this function is to pass kernel memory to the
4217 * protocol ioctl callbacks, instead of userspace memory.
4218 */
sk_ioctl(struct sock * sk,unsigned int cmd,void __user * arg)4219 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4220 {
4221 int rc = 1;
4222
4223 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4224 rc = ipmr_sk_ioctl(sk, cmd, arg);
4225 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4226 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4227 else if (sk_is_phonet(sk))
4228 rc = phonet_sk_ioctl(sk, cmd, arg);
4229
4230 /* If ioctl was processed, returns its value */
4231 if (rc <= 0)
4232 return rc;
4233
4234 /* Otherwise call the default handler */
4235 return sock_ioctl_out(sk, cmd, arg);
4236 }
4237 EXPORT_SYMBOL(sk_ioctl);
4238