• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 #include <net/bpf_sk_storage.h>
140 
141 #include <trace/events/sock.h>
142 #include <trace/hooks/sched.h>
143 #include <trace/hooks/net.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include "dev.h"
152 
/*
 * File-local protocol list head and a mutex declared alongside it.
 * NOTE(review): the users of both are outside this part of the file;
 * presumably the mutex serialises access to the list - confirm.
 */
static LIST_HEAD(proto_list);
static DEFINE_MUTEX(proto_list_mutex);

/* Forward declarations; the definitions come later in this file. */
static void sock_def_write_space(struct sock *sk);
static void sock_def_write_space_wfree(struct sock *sk);
158 
159 /**
160  * sk_ns_capable - General socket capability test
161  * @sk: Socket to use a capability on or through
162  * @user_ns: The user namespace of the capability to use
163  * @cap: The capability to use
164  *
165  * Test to see if the opener of the socket had when the socket was
166  * created and the current process has the capability @cap in the user
167  * namespace @user_ns.
168  */
sk_ns_capable(const struct sock * sk,struct user_namespace * user_ns,int cap)169 bool sk_ns_capable(const struct sock *sk,
170 		   struct user_namespace *user_ns, int cap)
171 {
172 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 		ns_capable(user_ns, cap);
174 }
175 EXPORT_SYMBOL(sk_ns_capable);
176 
177 /**
178  * sk_capable - Socket global capability test
179  * @sk: Socket to use a capability on or through
180  * @cap: The global capability to use
181  *
182  * Test to see if the opener of the socket had when the socket was
183  * created and the current process has the capability @cap in all user
184  * namespaces.
185  */
sk_capable(const struct sock * sk,int cap)186 bool sk_capable(const struct sock *sk, int cap)
187 {
188 	return sk_ns_capable(sk, &init_user_ns, cap);
189 }
190 EXPORT_SYMBOL(sk_capable);
191 
192 /**
193  * sk_net_capable - Network namespace socket capability test
194  * @sk: Socket to use a capability on or through
195  * @cap: The capability to use
196  *
197  * Test to see if the opener of the socket had when the socket was created
198  * and the current process has the capability @cap over the network namespace
199  * the socket is a member of.
200  */
sk_net_capable(const struct sock * sk,int cap)201 bool sk_net_capable(const struct sock *sk, int cap)
202 {
203 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204 }
205 EXPORT_SYMBOL(sk_net_capable);
206 
207 /*
208  * Each address family might have different locking rules, so we have
209  * one slock key per address family and separate keys for internal and
210  * userspace sockets.
211  */
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216 
/*
 * Make lock validator output more readable.  The strings are
 * pre-constructed at build time (adjacent string literals are
 * concatenated by the compiler), so that runtime initialization of
 * socket locks is fast.
 *
 * _sock_locks(prefix) expands to a comma-separated list of string
 * literals, one per address-family slot plus a final "AF_MAX" entry,
 * each with @prefix placed in front of the family name.  @prefix must
 * itself be a string literal.
 */
#define _sock_locks(prefix)					\
	prefix "AF_UNSPEC",	prefix "AF_UNIX",		\
	prefix "AF_INET",	prefix "AF_AX25",		\
	prefix "AF_IPX",	prefix "AF_APPLETALK",		\
	prefix "AF_NETROM",	prefix "AF_BRIDGE",		\
	prefix "AF_ATMPVC",	prefix "AF_X25",		\
	prefix "AF_INET6",	prefix "AF_ROSE",		\
	prefix "AF_DECnet",	prefix "AF_NETBEUI",		\
	prefix "AF_SECURITY",	prefix "AF_KEY",		\
	prefix "AF_NETLINK",	prefix "AF_PACKET",		\
	prefix "AF_ASH",	prefix "AF_ECONET",		\
	prefix "AF_ATMSVC",	prefix "AF_RDS",		\
	prefix "AF_SNA",	prefix "AF_IRDA",		\
	prefix "AF_PPPOX",	prefix "AF_WANPIPE",		\
	prefix "AF_LLC",	prefix "27",			\
	prefix "28",		prefix "AF_CAN",		\
	prefix "AF_TIPC",	prefix "AF_BLUETOOTH",		\
	prefix "IUCV",		prefix "AF_RXRPC",		\
	prefix "AF_ISDN",	prefix "AF_PHONET",		\
	prefix "AF_IEEE802154",	prefix "AF_CAIF",		\
	prefix "AF_ALG",	prefix "AF_NFC",		\
	prefix "AF_VSOCK",	prefix "AF_KCM",		\
	prefix "AF_QIPCRTR",	prefix "AF_SMC",		\
	prefix "AF_XDP",	prefix "AF_MCTP",		\
	prefix "AF_MAX"
241 
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 	_sock_locks("sk_lock-")
244 };
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("slock-")
247 };
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("clock-")
250 };
251 
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-sk_lock-")
254 };
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-slock-")
257 };
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 	_sock_locks("k-clock-")
260 };
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("rlock-")
263 };
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 	_sock_locks("wlock-")
266 };
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 	_sock_locks("elock-")
269 };
270 
271 /*
272  * sk_callback_lock and sk queues locking rules are per-address-family,
273  * so split the lock classes by using a per-AF key:
274  */
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
280 
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288 int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE;
289 
290 /* Maximal space eaten by iovec or ancillary data plus some space */
291 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
292 EXPORT_SYMBOL(sysctl_optmem_max);
293 
294 int sysctl_tstamp_allow_data __read_mostly = 1;
295 
296 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
297 EXPORT_SYMBOL_GPL(memalloc_socks_key);
298 
299 /**
300  * sk_set_memalloc - sets %SOCK_MEMALLOC
301  * @sk: socket to set it on
302  *
303  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
304  * It's the responsibility of the admin to adjust min_free_kbytes
305  * to meet the requirements
306  */
sk_set_memalloc(struct sock * sk)307 void sk_set_memalloc(struct sock *sk)
308 {
309 	sock_set_flag(sk, SOCK_MEMALLOC);
310 	sk->sk_allocation |= __GFP_MEMALLOC;
311 	static_branch_inc(&memalloc_socks_key);
312 }
313 EXPORT_SYMBOL_GPL(sk_set_memalloc);
314 
sk_clear_memalloc(struct sock * sk)315 void sk_clear_memalloc(struct sock *sk)
316 {
317 	sock_reset_flag(sk, SOCK_MEMALLOC);
318 	sk->sk_allocation &= ~__GFP_MEMALLOC;
319 	static_branch_dec(&memalloc_socks_key);
320 
321 	/*
322 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
323 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
324 	 * it has rmem allocations due to the last swapfile being deactivated
325 	 * but there is a risk that the socket is unusable due to exceeding
326 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
327 	 */
328 	sk_mem_reclaim(sk);
329 }
330 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
331 
__sk_backlog_rcv(struct sock * sk,struct sk_buff * skb)332 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
333 {
334 	int ret;
335 	unsigned int noreclaim_flag;
336 
337 	/* these should have been dropped before queueing */
338 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
339 
340 	noreclaim_flag = memalloc_noreclaim_save();
341 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
342 				 tcp_v6_do_rcv,
343 				 tcp_v4_do_rcv,
344 				 sk, skb);
345 	memalloc_noreclaim_restore(noreclaim_flag);
346 
347 	return ret;
348 }
349 EXPORT_SYMBOL(__sk_backlog_rcv);
350 
sk_error_report(struct sock * sk)351 void sk_error_report(struct sock *sk)
352 {
353 	sk->sk_error_report(sk);
354 
355 	switch (sk->sk_family) {
356 	case AF_INET:
357 		fallthrough;
358 	case AF_INET6:
359 		trace_inet_sk_error_report(sk);
360 		break;
361 	default:
362 		break;
363 	}
364 }
365 EXPORT_SYMBOL(sk_error_report);
366 
/**
 * sock_get_timeout - convert a socket timeout into a timeval for getsockopt
 * @timeo: timeout in units of 1/HZ seconds, or %MAX_SCHEDULE_TIMEOUT
 * @optval: destination buffer; must be large enough for the chosen layout
 * @old_timeval: true to use one of the "old" timeval layouts
 *
 * %MAX_SCHEDULE_TIMEOUT is reported as 0 seconds / 0 microseconds.
 * The layout written to @optval is:
 *  - struct __kernel_sock_timeval when @old_timeval is false;
 *  - struct old_timeval32 when @old_timeval is true and the caller is a
 *    compat syscall without 64-bit time;
 *  - struct __kernel_old_timeval otherwise.
 *
 * Return: the size in bytes of the structure stored in @optval.
 */
int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_old_timeval old_tv;
	struct __kernel_sock_timeval tv;

	if (timeo != MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	} else {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	}

	if (!old_timeval) {
		*(struct __kernel_sock_timeval *)optval = tv;
		return sizeof(tv);
	}

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };

		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	old_tv.tv_sec = tv.tv_sec;
	old_tv.tv_usec = tv.tv_usec;
	*(struct __kernel_old_timeval *)optval = old_tv;
	return sizeof(old_tv);
}
EXPORT_SYMBOL(sock_get_timeout);
397 
/**
 * sock_copy_user_timeval - fetch a timeval option value
 * @tv: result, always in struct __kernel_sock_timeval form
 * @optval: source of the option value
 * @optlen: length of the option value
 * @old_timeval: true when the source uses one of the "old" layouts
 *
 * The source layout is chosen exactly as in sock_get_timeout():
 * struct __kernel_sock_timeval when @old_timeval is false, struct
 * old_timeval32 for a compat syscall without 64-bit time, and struct
 * __kernel_old_timeval otherwise.
 *
 * Return: 0 on success, -EINVAL when @optlen is smaller than the
 * selected layout, -EFAULT when the copy fails.
 */
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
			   sockptr_t optval, int optlen, bool old_timeval)
{
	struct __kernel_old_timeval old_tv;

	if (!old_timeval) {
		/* Native layout: copy straight into the result. */
		if (optlen < sizeof(*tv))
			return -EINVAL;
		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
			return -EFAULT;
		return 0;
	}

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;
		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;

		tv->tv_sec = tv32.tv_sec;
		tv->tv_usec = tv32.tv_usec;
		return 0;
	}

	if (optlen < sizeof(old_tv))
		return -EINVAL;
	if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
		return -EFAULT;

	tv->tv_sec = old_tv.tv_sec;
	tv->tv_usec = old_tv.tv_usec;
	return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);
430 
/*
 * Parse a timeval option value and store it in *timeo_p in units of
 * 1/HZ seconds.
 *
 *  - A microsecond part outside [0, USEC_PER_SEC) yields -EDOM.
 *  - A negative seconds part stores 0 and succeeds; a message is
 *    logged for at most ten such attempts, subject to net_ratelimit().
 *  - An all-zero timeval, or a seconds part that is not below
 *    MAX_SCHEDULE_TIMEOUT / HZ - 1, stores MAX_SCHEDULE_TIMEOUT.
 *  - Otherwise the microsecond part is rounded up to whole ticks.
 *
 * Returns 0 on success or the error from sock_copy_user_timeval().
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	long timeo = MAX_SCHEDULE_TIMEOUT;
	struct __kernel_sock_timeval tv;
	int err;

	err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
	if (err)
		return err;

	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		WRITE_ONCE(*timeo_p, 0);
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}

	if (tv.tv_sec || tv.tv_usec) {
		if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
			timeo = tv.tv_sec * HZ +
				DIV_ROUND_UP((unsigned long)tv.tv_usec,
					     USEC_PER_SEC / HZ);
	}

	WRITE_ONCE(*timeo_p, timeo);
	return 0;
}
463 
sock_needs_netstamp(const struct sock * sk)464 static bool sock_needs_netstamp(const struct sock *sk)
465 {
466 	switch (sk->sk_family) {
467 	case AF_UNSPEC:
468 	case AF_UNIX:
469 		return false;
470 	default:
471 		return true;
472 	}
473 }
474 
sock_disable_timestamp(struct sock * sk,unsigned long flags)475 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
476 {
477 	if (sk->sk_flags & flags) {
478 		sk->sk_flags &= ~flags;
479 		if (sock_needs_netstamp(sk) &&
480 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
481 			net_disable_timestamp();
482 	}
483 }
484 
485 
__sock_queue_rcv_skb(struct sock * sk,struct sk_buff * skb)486 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
487 {
488 	unsigned long flags;
489 	struct sk_buff_head *list = &sk->sk_receive_queue;
490 
491 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
492 		atomic_inc(&sk->sk_drops);
493 		trace_sock_rcvqueue_full(sk, skb);
494 		return -ENOMEM;
495 	}
496 
497 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
498 		atomic_inc(&sk->sk_drops);
499 		return -ENOBUFS;
500 	}
501 
502 	skb->dev = NULL;
503 	skb_set_owner_r(skb, sk);
504 
505 	/* we escape from rcu protected region, make sure we dont leak
506 	 * a norefcounted dst
507 	 */
508 	skb_dst_force(skb);
509 
510 	spin_lock_irqsave(&list->lock, flags);
511 	sock_skb_set_dropcount(sk, skb);
512 	__skb_queue_tail(list, skb);
513 	spin_unlock_irqrestore(&list->lock, flags);
514 
515 	if (!sock_flag(sk, SOCK_DEAD))
516 		sk->sk_data_ready(sk);
517 	return 0;
518 }
519 EXPORT_SYMBOL(__sock_queue_rcv_skb);
520 
sock_queue_rcv_skb_reason(struct sock * sk,struct sk_buff * skb,enum skb_drop_reason * reason)521 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
522 			      enum skb_drop_reason *reason)
523 {
524 	enum skb_drop_reason drop_reason;
525 	int err;
526 
527 	err = sk_filter(sk, skb);
528 	if (err) {
529 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
530 		goto out;
531 	}
532 	err = __sock_queue_rcv_skb(sk, skb);
533 	switch (err) {
534 	case -ENOMEM:
535 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
536 		break;
537 	case -ENOBUFS:
538 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
539 		break;
540 	default:
541 		drop_reason = SKB_NOT_DROPPED_YET;
542 		break;
543 	}
544 out:
545 	if (reason)
546 		*reason = drop_reason;
547 	return err;
548 }
549 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
550 
/**
 * __sk_receive_skb - deliver a buffer to a socket or to its backlog
 * @sk: receiving socket
 * @skb: buffer to deliver
 * @nested: non-zero to take the socket spinlock with the nested variant
 * @trim_cap: passed through to sk_filter_trim_cap()
 * @refcounted: when true, a reference on @sk is dropped before returning
 *
 * The buffer is freed without being delivered when the filter rejects
 * it, when sk_rcvqueues_full() reports the queues as full, or when the
 * socket is owned by user context and sk_add_backlog() fails; the last
 * two cases also increment sk_drops.  If the socket is not owned by
 * user context the buffer goes straight to sk_backlog_rcv().
 *
 * Return: %NET_RX_SUCCESS, or the result of sk_backlog_rcv() when it
 * was called.
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto drop;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf)))
		goto drop_counted;

	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);

	if (sock_owned_by_user(sk)) {
		if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
			bh_unlock_sock(sk);
			goto drop_counted;
		}
	} else {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	}

	bh_unlock_sock(sk);
	goto put;

drop_counted:
	atomic_inc(&sk->sk_drops);
drop:
	kfree_skb(skb);
put:
	if (refcounted)
		sock_put(sk);
	return rc;
}
EXPORT_SYMBOL(__sk_receive_skb);
594 
/* Candidate targets for the indirect dst->ops->check call below. */
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));

/**
 * __sk_dst_check - validate the socket's cached dst
 * @sk: socket whose cached dst is examined
 * @cookie: passed to the dst's check operation
 *
 * Uses __sk_dst_get() to fetch the dst.  When the dst is marked
 * obsolete and its check operation returns NULL, the socket's tx
 * queue mapping and sk_dst_pending_confirm are cleared, sk_dst_cache
 * is set to NULL and the dst is released.
 *
 * Return: the cached dst (possibly NULL), or NULL after dropping it.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (!dst || !dst->obsolete)
		return dst;

	if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) != NULL)
		return dst;

	sk_tx_queue_clear(sk);
	WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
	RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(__sk_dst_check);
616 
/**
 * sk_dst_check - validate the socket's cached dst
 * @sk: socket whose cached dst is examined
 * @cookie: passed to the dst's check operation
 *
 * Counterpart of __sk_dst_check() built on sk_dst_get() and
 * sk_dst_reset(): when the dst is marked obsolete and its check
 * operation returns NULL, the socket's dst is reset and the dst
 * obtained here is released.
 *
 * Return: the dst from sk_dst_get() (possibly NULL), or NULL after
 * dropping it.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (!dst || !dst->obsolete)
		return dst;

	if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) != NULL)
		return dst;

	sk_dst_reset(sk);
	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(sk_dst_check);
632 
sock_bindtoindex_locked(struct sock * sk,int ifindex)633 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
634 {
635 	int ret = -ENOPROTOOPT;
636 #ifdef CONFIG_NETDEVICES
637 	struct net *net = sock_net(sk);
638 
639 	/* Sorry... */
640 	ret = -EPERM;
641 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
642 		goto out;
643 
644 	ret = -EINVAL;
645 	if (ifindex < 0)
646 		goto out;
647 
648 	/* Paired with all READ_ONCE() done locklessly. */
649 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
650 
651 	if (sk->sk_prot->rehash)
652 		sk->sk_prot->rehash(sk);
653 	sk_dst_reset(sk);
654 
655 	ret = 0;
656 
657 out:
658 #endif
659 
660 	return ret;
661 }
662 
/**
 * sock_bindtoindex - bind a socket to an interface index
 * @sk: socket to bind
 * @ifindex: interface index to bind to
 * @lock_sk: when true, take and release the socket lock around the bind
 *
 * Return: the result of sock_bindtoindex_locked().
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int ret;

	if (!lock_sk)
		return sock_bindtoindex_locked(sk, ifindex);

	lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, ifindex);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);
676 
/*
 * Bind the socket to the device named in the option value.
 *
 * At most IFNAMSIZ - 1 bytes of the name are used; longer values are
 * silently truncated.  An empty name, or a zero option length, binds
 * to index 0, i.e. the socket is not bound.
 *
 * Returns:
 *   -ENOPROTOOPT  when CONFIG_NETDEVICES is not set;
 *   -EINVAL       when @optlen is negative;
 *   -EFAULT       when the name cannot be copied in;
 *   -ENODEV       when no device of that name exists in the socket's
 *                 network namespace;
 *   otherwise the result of sock_bindtoindex_locked().
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index = 0;
	int ret;

	if (optlen < 0)
		return -EINVAL;

	/*
	 * Clamp the length so that the zero-filled buffer always keeps
	 * a terminating NUL after the copied name.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	if (copy_from_sockptr(devname, optval, optlen))
		return -EFAULT;

	if (devname[0] != '\0') {
		struct net_device *dev;

		/* Only the ifindex is taken out of the RCU section. */
		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();

		if (!dev)
			return -ENODEV;
	}

	sockopt_lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, index);
	sockopt_release_sock(sk);

	return ret;
#else
	return -ENOPROTOOPT;
#endif
}
724 
/*
 * Report the name of the device the socket is bound to.
 *
 * For an unbound socket (sk_bound_dev_if == 0) nothing is written to
 * @optval and a length of 0 is stored through @optlen.  Otherwise the
 * NUL-terminated name is copied to @optval and its length, including
 * the terminator, is stored through @optlen.
 *
 * Returns:
 *   -ENOPROTOOPT  when CONFIG_NETDEVICES is not set;
 *   -EINVAL       when the socket is bound and @len < IFNAMSIZ;
 *   the error from netdev_get_name() when the lookup fails;
 *   -EFAULT       when either copy to the caller fails;
 *   0             on success.
 */
static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
				sockptr_t optlen, int len)
{
#ifdef CONFIG_NETDEVICES
	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int err;

	if (bound_dev_if != 0) {
		if (len < IFNAMSIZ)
			return -EINVAL;

		err = netdev_get_name(net, devname, bound_dev_if);
		if (err)
			return err;

		len = strlen(devname) + 1;

		if (copy_to_sockptr(optval, devname, len))
			return -EFAULT;
	} else {
		len = 0;
	}

	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		return -EFAULT;

	return 0;
#else
	return -ENOPROTOOPT;
#endif
}
765 
sk_mc_loop(struct sock * sk)766 bool sk_mc_loop(struct sock *sk)
767 {
768 	if (dev_recursion_level())
769 		return false;
770 	if (!sk)
771 		return true;
772 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
773 	switch (READ_ONCE(sk->sk_family)) {
774 	case AF_INET:
775 		return inet_test_bit(MC_LOOP, sk);
776 #if IS_ENABLED(CONFIG_IPV6)
777 	case AF_INET6:
778 		return inet6_sk(sk)->mc_loop;
779 #endif
780 	}
781 	WARN_ON_ONCE(1);
782 	return true;
783 }
784 EXPORT_SYMBOL(sk_mc_loop);
785 
sock_set_reuseaddr(struct sock * sk)786 void sock_set_reuseaddr(struct sock *sk)
787 {
788 	lock_sock(sk);
789 	sk->sk_reuse = SK_CAN_REUSE;
790 	release_sock(sk);
791 }
792 EXPORT_SYMBOL(sock_set_reuseaddr);
793 
sock_set_reuseport(struct sock * sk)794 void sock_set_reuseport(struct sock *sk)
795 {
796 	lock_sock(sk);
797 	sk->sk_reuseport = true;
798 	release_sock(sk);
799 }
800 EXPORT_SYMBOL(sock_set_reuseport);
801 
sock_no_linger(struct sock * sk)802 void sock_no_linger(struct sock *sk)
803 {
804 	lock_sock(sk);
805 	WRITE_ONCE(sk->sk_lingertime, 0);
806 	sock_set_flag(sk, SOCK_LINGER);
807 	release_sock(sk);
808 }
809 EXPORT_SYMBOL(sock_no_linger);
810 
/**
 * sock_set_priority - set a socket's priority
 * @sk: socket to modify
 * @priority: value stored in sk->sk_priority
 *
 * The store is done with WRITE_ONCE() while holding the socket lock.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);

	WRITE_ONCE(sk->sk_priority, priority);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
818 
/**
 * sock_set_sndtimeo - set a socket's send timeout
 * @sk: socket to modify
 * @secs: timeout in seconds
 *
 * Stores @secs * HZ in sk->sk_sndtimeo when @secs is non-zero and
 * below MAX_SCHEDULE_TIMEOUT / HZ - 1; otherwise stores
 * %MAX_SCHEDULE_TIMEOUT.  The socket lock is held around the update.
 */
void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	long timeo = MAX_SCHEDULE_TIMEOUT;

	lock_sock(sk);

	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		timeo = secs * HZ;
	WRITE_ONCE(sk->sk_sndtimeo, timeo);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);
829 
/*
 * Switch receive timestamping on or off for @sk.
 *
 * @val false: clear SOCK_RCVTSTAMP and SOCK_RCVTSTAMPNS, nothing else.
 * @val true:  set SOCK_TSTAMP_NEW according to @new and
 *             SOCK_RCVTSTAMPNS according to @ns, set SOCK_RCVTSTAMP
 *             and enable SOCK_TIMESTAMP via sock_enable_timestamp().
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (!val) {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		return;
	}

	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
	sock_set_flag(sk, SOCK_RCVTSTAMP);
	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
}
842 
sock_enable_timestamps(struct sock * sk)843 void sock_enable_timestamps(struct sock *sk)
844 {
845 	lock_sock(sk);
846 	__sock_set_timestamps(sk, true, false, true);
847 	release_sock(sk);
848 }
849 EXPORT_SYMBOL(sock_enable_timestamps);
850 
/**
 * sock_set_timestamp - apply one of the SO_TIMESTAMP* options
 * @sk: socket to modify
 * @optname: %SO_TIMESTAMP_OLD, %SO_TIMESTAMP_NEW, %SO_TIMESTAMPNS_OLD
 *           or %SO_TIMESTAMPNS_NEW
 * @valbool: true to enable, false to disable
 *
 * The *_NEW options select %SOCK_TSTAMP_NEW and the SO_TIMESTAMPNS_*
 * options select %SOCK_RCVTSTAMPNS.  Any other @optname is ignored.
 * No locking is done here.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	bool new, ns;

	switch (optname) {
	case SO_TIMESTAMP_OLD:
		new = false;
		ns = false;
		break;
	case SO_TIMESTAMP_NEW:
		new = true;
		ns = false;
		break;
	case SO_TIMESTAMPNS_OLD:
		new = false;
		ns = true;
		break;
	case SO_TIMESTAMPNS_NEW:
		new = true;
		ns = true;
		break;
	default:
		return;
	}

	__sock_set_timestamps(sk, valbool, new, ns);
}
868 
sock_timestamping_bind_phc(struct sock * sk,int phc_index)869 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
870 {
871 	struct net *net = sock_net(sk);
872 	struct net_device *dev = NULL;
873 	bool match = false;
874 	int *vclock_index;
875 	int i, num;
876 
877 	if (sk->sk_bound_dev_if)
878 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
879 
880 	if (!dev) {
881 		pr_err("%s: sock not bind to device\n", __func__);
882 		return -EOPNOTSUPP;
883 	}
884 
885 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
886 	dev_put(dev);
887 
888 	for (i = 0; i < num; i++) {
889 		if (*(vclock_index + i) == phc_index) {
890 			match = true;
891 			break;
892 		}
893 	}
894 
895 	if (num > 0)
896 		kfree(vclock_index);
897 
898 	if (!match)
899 		return -EINVAL;
900 
901 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
902 
903 	return 0;
904 }
905 
sock_set_timestamping(struct sock * sk,int optname,struct so_timestamping timestamping)906 int sock_set_timestamping(struct sock *sk, int optname,
907 			  struct so_timestamping timestamping)
908 {
909 	int val = timestamping.flags;
910 	int ret;
911 
912 	if (val & ~SOF_TIMESTAMPING_MASK)
913 		return -EINVAL;
914 
915 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
916 	    !(val & SOF_TIMESTAMPING_OPT_ID))
917 		return -EINVAL;
918 
919 	if (val & SOF_TIMESTAMPING_OPT_ID &&
920 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
921 		if (sk_is_tcp(sk)) {
922 			if ((1 << sk->sk_state) &
923 			    (TCPF_CLOSE | TCPF_LISTEN))
924 				return -EINVAL;
925 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
926 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
927 			else
928 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
929 		} else {
930 			atomic_set(&sk->sk_tskey, 0);
931 		}
932 	}
933 
934 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
935 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
936 		return -EINVAL;
937 
938 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
939 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
940 		if (ret)
941 			return ret;
942 	}
943 
944 	WRITE_ONCE(sk->sk_tsflags, val);
945 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
946 
947 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
948 		sock_enable_timestamp(sk,
949 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
950 	else
951 		sock_disable_timestamp(sk,
952 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
953 	return 0;
954 }
955 
sock_set_keepalive(struct sock * sk)956 void sock_set_keepalive(struct sock *sk)
957 {
958 	lock_sock(sk);
959 	if (sk->sk_prot->keepalive)
960 		sk->sk_prot->keepalive(sk, true);
961 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
962 	release_sock(sk);
963 }
964 EXPORT_SYMBOL(sock_set_keepalive);
965 
__sock_set_rcvbuf(struct sock * sk,int val)966 static void __sock_set_rcvbuf(struct sock *sk, int val)
967 {
968 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
969 	 * as a negative value.
970 	 */
971 	val = min_t(int, val, INT_MAX / 2);
972 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
973 
974 	/* We double it on the way in to account for "struct sk_buff" etc.
975 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
976 	 * will allow that much actual data to be received on that socket.
977 	 *
978 	 * Applications are unaware that "struct sk_buff" and other overheads
979 	 * allocate from the receive buffer during socket buffer allocation.
980 	 *
981 	 * And after considering the possible alternatives, returning the value
982 	 * we actually used in getsockopt is the most desirable behavior.
983 	 */
984 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
985 }
986 
/**
 * sock_set_rcvbuf - set a socket's receive buffer size
 * @sk: socket to modify
 * @val: requested size; see __sock_set_rcvbuf() for how it is adjusted
 *
 * Wrapper around __sock_set_rcvbuf() that holds the socket lock.
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
994 
/*
 * Update sk->sk_mark.  Nothing happens when the mark is already @val;
 * otherwise the new mark is stored and the socket's dst is reset via
 * sk_dst_reset().  This helper does no locking of its own.
 */
static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (sk->sk_mark == val)
		return;

	WRITE_ONCE(sk->sk_mark, val);
	sk_dst_reset(sk);
}
1002 
/**
 * sock_set_mark - set the mark of a socket
 * @sk: socket to modify
 * @val: new mark value
 *
 * Locked wrapper: holds the socket lock around __sock_set_mark().
 */
void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
1010 
sock_release_reserved_memory(struct sock * sk,int bytes)1011 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1012 {
1013 	/* Round down bytes to multiple of pages */
1014 	bytes = round_down(bytes, PAGE_SIZE);
1015 
1016 	WARN_ON(bytes > sk->sk_reserved_mem);
1017 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1018 	sk_mem_reclaim(sk);
1019 }
1020 
sock_reserve_memory(struct sock * sk,int bytes)1021 static int sock_reserve_memory(struct sock *sk, int bytes)
1022 {
1023 	long allocated;
1024 	bool charged;
1025 	int pages;
1026 
1027 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1028 		return -EOPNOTSUPP;
1029 
1030 	if (!bytes)
1031 		return 0;
1032 
1033 	pages = sk_mem_pages(bytes);
1034 
1035 	/* pre-charge to memcg */
1036 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1037 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1038 	if (!charged)
1039 		return -ENOMEM;
1040 
1041 	/* pre-charge to forward_alloc */
1042 	sk_memory_allocated_add(sk, pages);
1043 	allocated = sk_memory_allocated(sk);
1044 	/* If the system goes into memory pressure with this
1045 	 * precharge, give up and return error.
1046 	 */
1047 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1048 		sk_memory_allocated_sub(sk, pages);
1049 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1050 		return -ENOMEM;
1051 	}
1052 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1053 
1054 	WRITE_ONCE(sk->sk_reserved_mem,
1055 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1056 
1057 	return 0;
1058 }
1059 
/**
 * sockopt_lock_sock - take the socket lock on behalf of setsockopt
 * @sk: socket to lock
 *
 * When current->bpf_ctx is set, the setsockopt is called from a bpf prog
 * and bpf has ensured the sk lock was acquired before calling
 * setsockopt(), so the lock is only taken when no bpf context is active.
 */
void sockopt_lock_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1072 
/**
 * sockopt_release_sock - counterpart of sockopt_lock_sock()
 * @sk: socket to unlock
 *
 * Releases the socket lock unless has_current_bpf_ctx() is true, i.e. in
 * exactly the case where sockopt_lock_sock() did not take it.
 */
void sockopt_release_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1081 
sockopt_ns_capable(struct user_namespace * ns,int cap)1082 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1083 {
1084 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1085 }
1086 EXPORT_SYMBOL(sockopt_ns_capable);
1087 
sockopt_capable(int cap)1088 bool sockopt_capable(int cap)
1089 {
1090 	return has_current_bpf_ctx() || capable(cap);
1091 }
1092 EXPORT_SYMBOL(sockopt_capable);
1093 
1094 /*
1095  *	This is meant for all protocols to use and covers goings on
1096  *	at the socket level. Everything here is generic.
1097  */
1098 
sk_setsockopt(struct sock * sk,int level,int optname,sockptr_t optval,unsigned int optlen)1099 int sk_setsockopt(struct sock *sk, int level, int optname,
1100 		  sockptr_t optval, unsigned int optlen)
1101 {
1102 	struct so_timestamping timestamping;
1103 	struct socket *sock = sk->sk_socket;
1104 	struct sock_txtime sk_txtime;
1105 	int val;
1106 	int valbool;
1107 	struct linger ling;
1108 	int ret = 0;
1109 
1110 	/*
1111 	 *	Options without arguments
1112 	 */
1113 
1114 	if (optname == SO_BINDTODEVICE)
1115 		return sock_setbindtodevice(sk, optval, optlen);
1116 
1117 	if (optlen < sizeof(int))
1118 		return -EINVAL;
1119 
1120 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1121 		return -EFAULT;
1122 
1123 	valbool = val ? 1 : 0;
1124 
1125 	sockopt_lock_sock(sk);
1126 
1127 	switch (optname) {
1128 	case SO_DEBUG:
1129 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1130 			ret = -EACCES;
1131 		else
1132 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1133 		break;
1134 	case SO_REUSEADDR:
1135 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1136 		break;
1137 	case SO_REUSEPORT:
1138 		sk->sk_reuseport = valbool;
1139 		break;
1140 	case SO_TYPE:
1141 	case SO_PROTOCOL:
1142 	case SO_DOMAIN:
1143 	case SO_ERROR:
1144 		ret = -ENOPROTOOPT;
1145 		break;
1146 	case SO_DONTROUTE:
1147 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1148 		sk_dst_reset(sk);
1149 		break;
1150 	case SO_BROADCAST:
1151 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1152 		break;
1153 	case SO_SNDBUF:
1154 		/* Don't error on this BSD doesn't and if you think
1155 		 * about it this is right. Otherwise apps have to
1156 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1157 		 * are treated in BSD as hints
1158 		 */
1159 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1160 set_sndbuf:
1161 		/* Ensure val * 2 fits into an int, to prevent max_t()
1162 		 * from treating it as a negative value.
1163 		 */
1164 		val = min_t(int, val, INT_MAX / 2);
1165 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1166 		WRITE_ONCE(sk->sk_sndbuf,
1167 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1168 		/* Wake up sending tasks if we upped the value. */
1169 		sk->sk_write_space(sk);
1170 		break;
1171 
1172 	case SO_SNDBUFFORCE:
1173 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1174 			ret = -EPERM;
1175 			break;
1176 		}
1177 
1178 		/* No negative values (to prevent underflow, as val will be
1179 		 * multiplied by 2).
1180 		 */
1181 		if (val < 0)
1182 			val = 0;
1183 		goto set_sndbuf;
1184 
1185 	case SO_RCVBUF:
1186 		/* Don't error on this BSD doesn't and if you think
1187 		 * about it this is right. Otherwise apps have to
1188 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1189 		 * are treated in BSD as hints
1190 		 */
1191 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1192 		break;
1193 
1194 	case SO_RCVBUFFORCE:
1195 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1196 			ret = -EPERM;
1197 			break;
1198 		}
1199 
1200 		/* No negative values (to prevent underflow, as val will be
1201 		 * multiplied by 2).
1202 		 */
1203 		__sock_set_rcvbuf(sk, max(val, 0));
1204 		break;
1205 
1206 	case SO_KEEPALIVE:
1207 		if (sk->sk_prot->keepalive)
1208 			sk->sk_prot->keepalive(sk, valbool);
1209 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1210 		break;
1211 
1212 	case SO_OOBINLINE:
1213 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1214 		break;
1215 
1216 	case SO_NO_CHECK:
1217 		sk->sk_no_check_tx = valbool;
1218 		break;
1219 
1220 	case SO_PRIORITY:
1221 		if ((val >= 0 && val <= 6) ||
1222 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1223 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1224 			WRITE_ONCE(sk->sk_priority, val);
1225 		else
1226 			ret = -EPERM;
1227 		break;
1228 
1229 	case SO_LINGER:
1230 		if (optlen < sizeof(ling)) {
1231 			ret = -EINVAL;	/* 1003.1g */
1232 			break;
1233 		}
1234 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1235 			ret = -EFAULT;
1236 			break;
1237 		}
1238 		if (!ling.l_onoff) {
1239 			sock_reset_flag(sk, SOCK_LINGER);
1240 		} else {
1241 			unsigned long t_sec = ling.l_linger;
1242 
1243 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1244 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1245 			else
1246 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1247 			sock_set_flag(sk, SOCK_LINGER);
1248 		}
1249 		break;
1250 
1251 	case SO_BSDCOMPAT:
1252 		break;
1253 
1254 	case SO_PASSCRED:
1255 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1256 		break;
1257 
1258 	case SO_PASSPIDFD:
1259 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1260 		break;
1261 
1262 	case SO_TIMESTAMP_OLD:
1263 	case SO_TIMESTAMP_NEW:
1264 	case SO_TIMESTAMPNS_OLD:
1265 	case SO_TIMESTAMPNS_NEW:
1266 		sock_set_timestamp(sk, optname, valbool);
1267 		break;
1268 
1269 	case SO_TIMESTAMPING_NEW:
1270 	case SO_TIMESTAMPING_OLD:
1271 		if (optlen == sizeof(timestamping)) {
1272 			if (copy_from_sockptr(&timestamping, optval,
1273 					      sizeof(timestamping))) {
1274 				ret = -EFAULT;
1275 				break;
1276 			}
1277 		} else {
1278 			memset(&timestamping, 0, sizeof(timestamping));
1279 			timestamping.flags = val;
1280 		}
1281 		ret = sock_set_timestamping(sk, optname, timestamping);
1282 		break;
1283 
1284 	case SO_RCVLOWAT:
1285 		{
1286 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1287 
1288 		if (val < 0)
1289 			val = INT_MAX;
1290 		if (sock)
1291 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1292 		if (set_rcvlowat)
1293 			ret = set_rcvlowat(sk, val);
1294 		else
1295 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1296 		break;
1297 		}
1298 	case SO_RCVTIMEO_OLD:
1299 	case SO_RCVTIMEO_NEW:
1300 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1301 				       optlen, optname == SO_RCVTIMEO_OLD);
1302 		break;
1303 
1304 	case SO_SNDTIMEO_OLD:
1305 	case SO_SNDTIMEO_NEW:
1306 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1307 				       optlen, optname == SO_SNDTIMEO_OLD);
1308 		break;
1309 
1310 	case SO_ATTACH_FILTER: {
1311 		struct sock_fprog fprog;
1312 
1313 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1314 		if (!ret)
1315 			ret = sk_attach_filter(&fprog, sk);
1316 		break;
1317 	}
1318 	case SO_ATTACH_BPF:
1319 		ret = -EINVAL;
1320 		if (optlen == sizeof(u32)) {
1321 			u32 ufd;
1322 
1323 			ret = -EFAULT;
1324 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1325 				break;
1326 
1327 			ret = sk_attach_bpf(ufd, sk);
1328 		}
1329 		break;
1330 
1331 	case SO_ATTACH_REUSEPORT_CBPF: {
1332 		struct sock_fprog fprog;
1333 
1334 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1335 		if (!ret)
1336 			ret = sk_reuseport_attach_filter(&fprog, sk);
1337 		break;
1338 	}
1339 	case SO_ATTACH_REUSEPORT_EBPF:
1340 		ret = -EINVAL;
1341 		if (optlen == sizeof(u32)) {
1342 			u32 ufd;
1343 
1344 			ret = -EFAULT;
1345 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1346 				break;
1347 
1348 			ret = sk_reuseport_attach_bpf(ufd, sk);
1349 		}
1350 		break;
1351 
1352 	case SO_DETACH_REUSEPORT_BPF:
1353 		ret = reuseport_detach_prog(sk);
1354 		break;
1355 
1356 	case SO_DETACH_FILTER:
1357 		ret = sk_detach_filter(sk);
1358 		break;
1359 
1360 	case SO_LOCK_FILTER:
1361 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1362 			ret = -EPERM;
1363 		else
1364 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1365 		break;
1366 
1367 	case SO_PASSSEC:
1368 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1369 		break;
1370 	case SO_MARK:
1371 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1372 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1373 			ret = -EPERM;
1374 			break;
1375 		}
1376 
1377 		__sock_set_mark(sk, val);
1378 		break;
1379 	case SO_RCVMARK:
1380 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1381 		break;
1382 
1383 	case SO_RXQ_OVFL:
1384 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1385 		break;
1386 
1387 	case SO_WIFI_STATUS:
1388 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1389 		break;
1390 
1391 	case SO_PEEK_OFF:
1392 		{
1393 		int (*set_peek_off)(struct sock *sk, int val);
1394 
1395 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1396 		if (set_peek_off)
1397 			ret = set_peek_off(sk, val);
1398 		else
1399 			ret = -EOPNOTSUPP;
1400 		break;
1401 		}
1402 
1403 	case SO_NOFCS:
1404 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1405 		break;
1406 
1407 	case SO_SELECT_ERR_QUEUE:
1408 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1409 		break;
1410 
1411 #ifdef CONFIG_NET_RX_BUSY_POLL
1412 	case SO_BUSY_POLL:
1413 		if (val < 0)
1414 			ret = -EINVAL;
1415 		else
1416 			WRITE_ONCE(sk->sk_ll_usec, val);
1417 		break;
1418 	case SO_PREFER_BUSY_POLL:
1419 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1420 			ret = -EPERM;
1421 		else
1422 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1423 		break;
1424 	case SO_BUSY_POLL_BUDGET:
1425 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1426 			ret = -EPERM;
1427 		} else {
1428 			if (val < 0 || val > U16_MAX)
1429 				ret = -EINVAL;
1430 			else
1431 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1432 		}
1433 		break;
1434 #endif
1435 
1436 	case SO_MAX_PACING_RATE:
1437 		{
1438 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1439 
1440 		if (sizeof(ulval) != sizeof(val) &&
1441 		    optlen >= sizeof(ulval) &&
1442 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1443 			ret = -EFAULT;
1444 			break;
1445 		}
1446 		if (ulval != ~0UL)
1447 			cmpxchg(&sk->sk_pacing_status,
1448 				SK_PACING_NONE,
1449 				SK_PACING_NEEDED);
1450 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1451 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1452 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1453 		break;
1454 		}
1455 	case SO_INCOMING_CPU:
1456 		reuseport_update_incoming_cpu(sk, val);
1457 		break;
1458 
1459 	case SO_CNX_ADVICE:
1460 		if (val == 1)
1461 			dst_negative_advice(sk);
1462 		break;
1463 
1464 	case SO_ZEROCOPY:
1465 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1466 			if (!(sk_is_tcp(sk) ||
1467 			      (sk->sk_type == SOCK_DGRAM &&
1468 			       sk->sk_protocol == IPPROTO_UDP)))
1469 				ret = -EOPNOTSUPP;
1470 		} else if (sk->sk_family != PF_RDS) {
1471 			ret = -EOPNOTSUPP;
1472 		}
1473 		if (!ret) {
1474 			if (val < 0 || val > 1)
1475 				ret = -EINVAL;
1476 			else
1477 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1478 		}
1479 		break;
1480 
1481 	case SO_TXTIME:
1482 		if (optlen != sizeof(struct sock_txtime)) {
1483 			ret = -EINVAL;
1484 			break;
1485 		} else if (copy_from_sockptr(&sk_txtime, optval,
1486 			   sizeof(struct sock_txtime))) {
1487 			ret = -EFAULT;
1488 			break;
1489 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1490 			ret = -EINVAL;
1491 			break;
1492 		}
1493 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1494 		 * scheduler has enough safe guards.
1495 		 */
1496 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1497 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1498 			ret = -EPERM;
1499 			break;
1500 		}
1501 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1502 		sk->sk_clockid = sk_txtime.clockid;
1503 		sk->sk_txtime_deadline_mode =
1504 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1505 		sk->sk_txtime_report_errors =
1506 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1507 		break;
1508 
1509 	case SO_BINDTOIFINDEX:
1510 		ret = sock_bindtoindex_locked(sk, val);
1511 		break;
1512 
1513 	case SO_BUF_LOCK:
1514 		if (val & ~SOCK_BUF_LOCK_MASK) {
1515 			ret = -EINVAL;
1516 			break;
1517 		}
1518 		sk->sk_userlocks = val | (sk->sk_userlocks &
1519 					  ~SOCK_BUF_LOCK_MASK);
1520 		break;
1521 
1522 	case SO_RESERVE_MEM:
1523 	{
1524 		int delta;
1525 
1526 		if (val < 0) {
1527 			ret = -EINVAL;
1528 			break;
1529 		}
1530 
1531 		delta = val - sk->sk_reserved_mem;
1532 		if (delta < 0)
1533 			sock_release_reserved_memory(sk, -delta);
1534 		else
1535 			ret = sock_reserve_memory(sk, delta);
1536 		break;
1537 	}
1538 
1539 	case SO_TXREHASH:
1540 		if (val < -1 || val > 1) {
1541 			ret = -EINVAL;
1542 			break;
1543 		}
1544 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1545 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1546 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1547 		 * and sk_getsockopt().
1548 		 */
1549 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1550 		break;
1551 
1552 	default:
1553 		ret = -ENOPROTOOPT;
1554 		break;
1555 	}
1556 	sockopt_release_sock(sk);
1557 	return ret;
1558 }
1559 
/**
 * sock_setsockopt - struct socket front end for sk_setsockopt()
 * @sock: socket whose ->sk is configured
 * @level: forwarded unchanged
 * @optname: forwarded unchanged
 * @optval: forwarded unchanged
 * @optlen: forwarded unchanged
 *
 * Return: whatever sk_setsockopt() returns.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
		    sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_setsockopt);
1567 
sk_get_peer_cred(struct sock * sk)1568 static const struct cred *sk_get_peer_cred(struct sock *sk)
1569 {
1570 	const struct cred *cred;
1571 
1572 	spin_lock(&sk->sk_peer_lock);
1573 	cred = get_cred(sk->sk_peer_cred);
1574 	spin_unlock(&sk->sk_peer_lock);
1575 
1576 	return cred;
1577 }
1578 
cred_to_ucred(struct pid * pid,const struct cred * cred,struct ucred * ucred)1579 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1580 			  struct ucred *ucred)
1581 {
1582 	ucred->pid = pid_vnr(pid);
1583 	ucred->uid = ucred->gid = -1;
1584 	if (cred) {
1585 		struct user_namespace *current_ns = current_user_ns();
1586 
1587 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1588 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1589 	}
1590 }
1591 
/*
 * Copy every group id in @src to @dst as a packed array of gid_t, each
 * mapped into the current user namespace.
 *
 * Returns 0 on success or -EFAULT as soon as one copy fails.
 */
static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
	struct user_namespace *ns = current_user_ns();
	size_t offset = 0;
	int idx;

	for (idx = 0; idx < src->ngroups; idx++, offset += sizeof(gid_t)) {
		gid_t gid = from_kgid_munged(ns, src->gid[idx]);

		if (copy_to_sockptr_offset(dst, offset, &gid, sizeof(gid)))
			return -EFAULT;
	}

	return 0;
}
1606 
/**
 * sk_getsockopt - read a generic socket-level option from a sock
 * @sk: socket being queried
 * @level: option level; not examined by this function
 * @optname: SO_* option selector
 * @optval: destination buffer for the option value
 * @optlen: in/out int: buffer size on entry, bytes written on return
 *
 * Most options store their value in the zero-initialised union @v and set
 * @lv to the value's size; the common tail then copies min(len, lv) bytes
 * to @optval and writes the resulting length back through @optlen.
 * Options that build their own buffer copy it out themselves and jump to
 * the lenout label, and a few return straight from their case.
 *
 * Return: 0 on success or a negative errno (-ENOPROTOOPT for options not
 * handled here).
 */
int sk_getsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, sockptr_t optlen)
{
	struct socket *sock = sk->sk_socket;

	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct  __kernel_sock_timeval stm;
		struct sock_txtime txtime;
		struct so_timestamping timestamping;
	} v;

	int lv = sizeof(int);
	int len;

	if (copy_from_sockptr(&len, optlen, sizeof(int)))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = READ_ONCE(sk->sk_sndbuf);
		break;

	case SO_RCVBUF:
		v.val = READ_ONCE(sk->sk_rcvbuf);
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		/* Fall back to the soft error, clearing it as it is read. */
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = READ_ONCE(sk->sk_priority);
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
		break;

	case SO_BSDCOMPAT:
		/* Reports the zeroed value. */
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		lv = sizeof(v.timestamping);
		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
		 * returning the flags when they were set through the same option.
		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
		 */
		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
		}
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
				      SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
				      SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = READ_ONCE(sk->sk_rcvlowat);
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PASSPIDFD:
		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);

		spin_lock(&sk->sk_peer_lock);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		spin_unlock(&sk->sk_peer_lock);

		if (copy_to_sockptr(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERPIDFD:
	{
		struct pid *peer_pid;
		struct file *pidfd_file = NULL;
		int pidfd;

		if (len > sizeof(pidfd))
			len = sizeof(pidfd);

		spin_lock(&sk->sk_peer_lock);
		peer_pid = get_pid(sk->sk_peer_pid);
		spin_unlock(&sk->sk_peer_lock);

		if (!peer_pid)
			return -ENODATA;

		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
		put_pid(peer_pid);
		if (pidfd < 0)
			return pidfd;

		/* The fd is installed only after both copies succeeded; on
		 * failure the reserved fd number and the file are released.
		 */
		if (copy_to_sockptr(optval, &pidfd, len) ||
		    copy_to_sockptr(optlen, &len, sizeof(int))) {
			put_unused_fd(pidfd);
			fput(pidfd_file);

			return -EFAULT;
		}

		fd_install(pidfd, pidfd_file);
		return 0;
	}

	case SO_PEERGROUPS:
	{
		const struct cred *cred;
		int ret, n;

		cred = sk_get_peer_cred(sk);
		if (!cred)
			return -ENODATA;

		n = cred->group_info->ngroups;
		/* Buffer too small: report the required size via optlen and
		 * fail with -ERANGE.
		 */
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			put_cred(cred);
			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user(optval, cred->group_info);
		put_cred(cred);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		struct sockaddr_storage address;

		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_sockptr(optval, &address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock,
							 optval, optlen, len);

	case SO_MARK:
		v.val = READ_ONCE(sk->sk_mark);
		break;

	case SO_RCVMARK:
		v.val = sock_flag(sk, SOCK_RCVMARK);
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!READ_ONCE(sock->ops)->set_peek_off)
			return -EOPNOTSUPP;

		v.val = READ_ONCE(sk->sk_peek_off);
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_ll_usec);
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, ~0U,
				      READ_ONCE(sk->sk_max_pacing_rate));
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_sockptr(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = READ_ONCE(sk->sk_bound_dev_if);
		break;

	case SO_NETNS_COOKIE:
		lv = sizeof(u64);
		if (len != lv)
			return -EINVAL;
		v.val64 = sock_net(sk)->net_cookie;
		break;

	case SO_BUF_LOCK:
		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
		break;

	case SO_RESERVE_MEM:
		v.val = READ_ONCE(sk->sk_reserved_mem);
		break;

	case SO_TXREHASH:
		/* Paired with WRITE_ONCE() in sk_setsockopt() */
		v.val = READ_ONCE(sk->sk_txrehash);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	/* Common tail: never copy more than the option's own size. */
	if (len > lv)
		len = lv;
	if (copy_to_sockptr(optval, &v, len))
		return -EFAULT;
lenout:
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		return -EFAULT;
	return 0;
}
2023 
/**
 * sock_getsockopt - user-pointer front end for sk_getsockopt()
 * @sock: socket whose ->sk is queried
 * @level: forwarded unchanged
 * @optname: forwarded unchanged
 * @optval: user buffer, wrapped with USER_SOCKPTR()
 * @optlen: user length pointer, wrapped with USER_SOCKPTR()
 *
 * Return: whatever sk_getsockopt() returns.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	sockptr_t val_ptr = USER_SOCKPTR(optval);
	sockptr_t len_ptr = USER_SOCKPTR(optlen);

	return sk_getsockopt(sock->sk, level, optname, val_ptr, len_ptr);
}
2031 
2032 /*
2033  * Initialize an sk_lock.
2034  *
2035  * (We also register the sk_lock with the lock validator.)
2036  */
sock_lock_init(struct sock * sk)2037 static inline void sock_lock_init(struct sock *sk)
2038 {
2039 	if (sk->sk_kern_sock)
2040 		sock_lock_init_class_and_name(
2041 			sk,
2042 			af_family_kern_slock_key_strings[sk->sk_family],
2043 			af_family_kern_slock_keys + sk->sk_family,
2044 			af_family_kern_key_strings[sk->sk_family],
2045 			af_family_kern_keys + sk->sk_family);
2046 	else
2047 		sock_lock_init_class_and_name(
2048 			sk,
2049 			af_family_slock_key_strings[sk->sk_family],
2050 			af_family_slock_keys + sk->sk_family,
2051 			af_family_key_strings[sk->sk_family],
2052 			af_family_keys + sk->sk_family);
2053 }
2054 
2055 /*
2056  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2057  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2058  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2059  */
sock_copy(struct sock * nsk,const struct sock * osk)2060 static void sock_copy(struct sock *nsk, const struct sock *osk)
2061 {
2062 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2063 #ifdef CONFIG_SECURITY_NETWORK
2064 	void *sptr = nsk->sk_security;
2065 #endif
2066 
2067 	/* If we move sk_tx_queue_mapping out of the private section,
2068 	 * we must check if sk_tx_queue_clear() is called after
2069 	 * sock_copy() in sk_clone_lock().
2070 	 */
2071 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2072 		     offsetof(struct sock, sk_dontcopy_begin) ||
2073 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2074 		     offsetof(struct sock, sk_dontcopy_end));
2075 
2076 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2077 
2078 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2079 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2080 
2081 #ifdef CONFIG_SECURITY_NETWORK
2082 	nsk->sk_security = sptr;
2083 	security_sk_clone(osk, nsk);
2084 #endif
2085 }
2086 
/*
 * Allocate the memory for a sock of protocol @prot: from prot->slab when
 * the protocol has one, with kmalloc() of prot->obj_size bytes otherwise.
 * The new object then goes through security_sk_alloc() and a reference is
 * taken on prot->owner.
 *
 * Returns the sock, or NULL if the allocation, the security hook or
 * try_module_get() fails (in which case nothing stays allocated).
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct kmem_cache *cache = prot->slab;
	struct sock *sk;

	if (cache) {
		/* __GFP_ZERO is masked out for the slab allocation; when
		 * init-on-alloc is wanted, sk_prot_clear_nulls() is used.
		 */
		sk = kmem_cache_alloc(cache, priority & ~__GFP_ZERO);
		if (sk && want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else {
		sk = kmalloc(prot->obj_size, priority);
	}

	if (!sk)
		return NULL;

	if (security_sk_alloc(sk, family, priority))
		goto err_free;

	if (!try_module_get(prot->owner)) {
		security_sk_free(sk);
		goto err_free;
	}

	return sk;

err_free:
	if (cache)
		kmem_cache_free(cache, sk);
	else
		kfree(sk);
	return NULL;
}
2122 
sk_prot_free(struct proto * prot,struct sock * sk)2123 static void sk_prot_free(struct proto *prot, struct sock *sk)
2124 {
2125 	struct kmem_cache *slab;
2126 	struct module *owner;
2127 
2128 	owner = prot->owner;
2129 	slab = prot->slab;
2130 
2131 	cgroup_sk_free(&sk->sk_cgrp_data);
2132 	mem_cgroup_sk_free(sk);
2133 	trace_android_vh_sk_free(sk);
2134 	security_sk_free(sk);
2135 	if (slab != NULL)
2136 		kmem_cache_free(slab, sk);
2137 	else
2138 		kfree(sk);
2139 	module_put(owner);
2140 }
2141 
2142 /**
2143  *	sk_alloc - All socket objects are allocated here
2144  *	@net: the applicable net namespace
2145  *	@family: protocol family
2146  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2147  *	@prot: struct proto associated with this new sock instance
2148  *	@kern: is this to be a kernel socket?
2149  */
sk_alloc(struct net * net,int family,gfp_t priority,struct proto * prot,int kern)2150 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2151 		      struct proto *prot, int kern)
2152 {
2153 	struct sock *sk;
2154 
2155 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2156 	if (sk) {
2157 		sk->sk_family = family;
2158 		/*
2159 		 * See comment in struct sock definition to understand
2160 		 * why we need sk_prot_creator -acme
2161 		 */
2162 		sk->sk_prot = sk->sk_prot_creator = prot;
2163 		sk->sk_kern_sock = kern;
2164 		sock_lock_init(sk);
2165 		sk->sk_net_refcnt = kern ? 0 : 1;
2166 		if (likely(sk->sk_net_refcnt)) {
2167 			get_net_track(net, &sk->ns_tracker, priority);
2168 			sock_inuse_add(net, 1);
2169 		} else {
2170 			__netns_tracker_alloc(net, &sk->ns_tracker,
2171 					      false, priority);
2172 		}
2173 
2174 		sock_net_set(sk, net);
2175 		refcount_set(&sk->sk_wmem_alloc, 1);
2176 
2177 		mem_cgroup_sk_alloc(sk);
2178 		trace_android_vh_sk_alloc(sk);
2179 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2180 		sock_update_classid(&sk->sk_cgrp_data);
2181 		sock_update_netprioidx(&sk->sk_cgrp_data);
2182 		sk_tx_queue_clear(sk);
2183 	}
2184 
2185 	return sk;
2186 }
2187 EXPORT_SYMBOL(sk_alloc);
2188 
2189 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2190  * grace period. This is the case for UDP sockets and TCP listeners.
2191  */
__sk_destruct(struct rcu_head * head)2192 static void __sk_destruct(struct rcu_head *head)
2193 {
2194 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2195 	struct sk_filter *filter;
2196 
2197 	if (sk->sk_destruct)
2198 		sk->sk_destruct(sk);
2199 
2200 	filter = rcu_dereference_check(sk->sk_filter,
2201 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2202 	if (filter) {
2203 		sk_filter_uncharge(sk, filter);
2204 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2205 	}
2206 
2207 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2208 
2209 #ifdef CONFIG_BPF_SYSCALL
2210 	bpf_sk_storage_free(sk);
2211 #endif
2212 
2213 	if (atomic_read(&sk->sk_omem_alloc))
2214 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2215 			 __func__, atomic_read(&sk->sk_omem_alloc));
2216 
2217 	if (sk->sk_frag.page) {
2218 		put_page(sk->sk_frag.page);
2219 		sk->sk_frag.page = NULL;
2220 	}
2221 
2222 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2223 	put_cred(sk->sk_peer_cred);
2224 	put_pid(sk->sk_peer_pid);
2225 
2226 	if (likely(sk->sk_net_refcnt))
2227 		put_net_track(sock_net(sk), &sk->ns_tracker);
2228 	else
2229 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2230 
2231 	sk_prot_free(sk->sk_prot_creator, sk);
2232 }
2233 
sk_destruct(struct sock * sk)2234 void sk_destruct(struct sock *sk)
2235 {
2236 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2237 
2238 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2239 		reuseport_detach_sock(sk);
2240 		use_call_rcu = true;
2241 	}
2242 
2243 	if (use_call_rcu)
2244 		call_rcu(&sk->sk_rcu, __sk_destruct);
2245 	else
2246 		__sk_destruct(&sk->sk_rcu);
2247 }
2248 
__sk_free(struct sock * sk)2249 static void __sk_free(struct sock *sk)
2250 {
2251 	if (likely(sk->sk_net_refcnt))
2252 		sock_inuse_add(sock_net(sk), -1);
2253 
2254 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2255 		sock_diag_broadcast_destroy(sk);
2256 	else
2257 		sk_destruct(sk);
2258 }
2259 
sk_free(struct sock * sk)2260 void sk_free(struct sock *sk)
2261 {
2262 	/*
2263 	 * We subtract one from sk_wmem_alloc and can know if
2264 	 * some packets are still in some tx queue.
2265 	 * If not null, sock_wfree() will call __sk_free(sk) later
2266 	 */
2267 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2268 		__sk_free(sk);
2269 }
2270 EXPORT_SYMBOL(sk_free);
2271 
sk_init_common(struct sock * sk)2272 static void sk_init_common(struct sock *sk)
2273 {
2274 	skb_queue_head_init(&sk->sk_receive_queue);
2275 	skb_queue_head_init(&sk->sk_write_queue);
2276 	skb_queue_head_init(&sk->sk_error_queue);
2277 
2278 	rwlock_init(&sk->sk_callback_lock);
2279 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2280 			af_rlock_keys + sk->sk_family,
2281 			af_family_rlock_key_strings[sk->sk_family]);
2282 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2283 			af_wlock_keys + sk->sk_family,
2284 			af_family_wlock_key_strings[sk->sk_family]);
2285 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2286 			af_elock_keys + sk->sk_family,
2287 			af_family_elock_key_strings[sk->sk_family]);
2288 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2289 			af_callback_keys + sk->sk_family,
2290 			af_family_clock_key_strings[sk->sk_family]);
2291 }
2292 
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	The clone starts as a copy of @sk made by sock_copy(); every piece of
 *	per-socket state that must not be shared with the parent (locks,
 *	queues, memory accounting, cached dst, error state, ...) is then
 *	reset below.
 *
 *	Return: the new socket, or NULL.  A non-NULL clone is returned with
 *	bh_lock_sock() held and sk_refcnt set to 2; the caller must release
 *	the lock with bh_unlock_sock(newsk), including on the caller's own
 *	error paths.  Failures inside this function are cleaned up through
 *	sk_free_unlock_clone() before NULL is returned.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);
	trace_android_vh_sk_clone_lock(newsk);

	/* Remember which proto allocated the clone; used when freeing it. */
	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt)) {
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
	} else {
		/* Kernel sockets are not elevating the struct net refcount.
		 * Instead, use a tracker to more easily detect if a layer
		 * is not properly dismantling its kernel sockets at netns
		 * destroy time.
		 */
		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
				      false, priority);
	}
	/* Fresh node, lock and backlog: none of these are inherited. */
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	/* Reset cached route and memory accounting copied from the parent. */
	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	newsk->sk_reserved_mem  = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	/* All user locks are inherited except SOCK_BINDPORT_LOCK. */
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	/* Share the parent's filter, charging it to the clone as well. */
	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	/* The clone does not join the parent's reuseport group. */
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* The clone is not attached to any struct socket or wait queue yet. */
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	/* Timestamp flags were copied from the parent; enable net
	 * timestamping on behalf of the clone when they are set.
	 */
	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
2420 
sk_free_unlock_clone(struct sock * sk)2421 void sk_free_unlock_clone(struct sock *sk)
2422 {
2423 	/* It is still raw copy of parent, so invalidate
2424 	 * destructor and make plain sk_free() */
2425 	sk->sk_destruct = NULL;
2426 	bh_unlock_sock(sk);
2427 	sk_free(sk);
2428 }
2429 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2430 
sk_dst_gso_max_size(struct sock * sk,struct dst_entry * dst)2431 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2432 {
2433 	bool is_ipv6 = false;
2434 	u32 max_size;
2435 
2436 #if IS_ENABLED(CONFIG_IPV6)
2437 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2438 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2439 #endif
2440 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2441 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2442 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2443 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2444 		max_size = GSO_LEGACY_MAX_SIZE;
2445 
2446 	return max_size - (MAX_TCP_HEADER + 1);
2447 }
2448 
sk_setup_caps(struct sock * sk,struct dst_entry * dst)2449 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2450 {
2451 	u32 max_segs = 1;
2452 
2453 	sk->sk_route_caps = dst->dev->features;
2454 	if (sk_is_tcp(sk))
2455 		sk->sk_route_caps |= NETIF_F_GSO;
2456 	if (sk->sk_route_caps & NETIF_F_GSO)
2457 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2458 	if (unlikely(sk->sk_gso_disabled))
2459 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2460 	if (sk_can_gso(sk)) {
2461 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2462 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2463 		} else {
2464 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2465 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2466 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2467 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2468 		}
2469 	}
2470 	sk->sk_gso_max_segs = max_segs;
2471 	sk_dst_set(sk, dst);
2472 }
2473 EXPORT_SYMBOL_GPL(sk_setup_caps);
2474 
2475 /*
2476  *	Simple resource managers for sockets.
2477  */
2478 
2479 
2480 /*
2481  * Write buffer destructor automatically called from kfree_skb.
2482  */
sock_wfree(struct sk_buff * skb)2483 void sock_wfree(struct sk_buff *skb)
2484 {
2485 	struct sock *sk = skb->sk;
2486 	unsigned int len = skb->truesize;
2487 	bool free;
2488 
2489 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2490 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2491 		    sk->sk_write_space == sock_def_write_space) {
2492 			rcu_read_lock();
2493 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2494 			sock_def_write_space_wfree(sk);
2495 			rcu_read_unlock();
2496 			if (unlikely(free))
2497 				__sk_free(sk);
2498 			return;
2499 		}
2500 
2501 		/*
2502 		 * Keep a reference on sk_wmem_alloc, this will be released
2503 		 * after sk_write_space() call
2504 		 */
2505 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2506 		sk->sk_write_space(sk);
2507 		len = 1;
2508 	}
2509 	/*
2510 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2511 	 * could not do because of in-flight packets
2512 	 */
2513 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2514 		__sk_free(sk);
2515 }
2516 EXPORT_SYMBOL(sock_wfree);
2517 
2518 /* This variant of sock_wfree() is used by TCP,
2519  * since it sets SOCK_USE_WRITE_QUEUE.
2520  */
__sock_wfree(struct sk_buff * skb)2521 void __sock_wfree(struct sk_buff *skb)
2522 {
2523 	struct sock *sk = skb->sk;
2524 
2525 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2526 		__sk_free(sk);
2527 }
2528 
skb_set_owner_w(struct sk_buff * skb,struct sock * sk)2529 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2530 {
2531 	skb_orphan(skb);
2532 	skb->sk = sk;
2533 #ifdef CONFIG_INET
2534 	if (unlikely(!sk_fullsock(sk))) {
2535 		skb->destructor = sock_edemux;
2536 		sock_hold(sk);
2537 		return;
2538 	}
2539 #endif
2540 	skb->destructor = sock_wfree;
2541 	skb_set_hash_from_sk(skb, sk);
2542 	/*
2543 	 * We used to take a refcount on sk, but following operation
2544 	 * is enough to guarantee sk_free() wont free this sock until
2545 	 * all in-flight packets are completed
2546 	 */
2547 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2548 }
2549 EXPORT_SYMBOL(skb_set_owner_w);
2550 
can_skb_orphan_partial(const struct sk_buff * skb)2551 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2552 {
2553 #ifdef CONFIG_TLS_DEVICE
2554 	/* Drivers depend on in-order delivery for crypto offload,
2555 	 * partial orphan breaks out-of-order-OK logic.
2556 	 */
2557 	if (skb->decrypted)
2558 		return false;
2559 #endif
2560 	return (skb->destructor == sock_wfree ||
2561 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2562 }
2563 
2564 /* This helper is used by netem, as it can hold packets in its
2565  * delay queue. We want to allow the owner socket to send more
2566  * packets, as if they were already TX completed by a typical driver.
2567  * But we also want to keep skb->sk set because some packet schedulers
2568  * rely on it (sch_fq for example).
2569  */
skb_orphan_partial(struct sk_buff * skb)2570 void skb_orphan_partial(struct sk_buff *skb)
2571 {
2572 	if (skb_is_tcp_pure_ack(skb))
2573 		return;
2574 
2575 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2576 		return;
2577 
2578 	skb_orphan(skb);
2579 }
2580 EXPORT_SYMBOL(skb_orphan_partial);
2581 
2582 /*
2583  * Read buffer destructor automatically called from kfree_skb.
2584  */
sock_rfree(struct sk_buff * skb)2585 void sock_rfree(struct sk_buff *skb)
2586 {
2587 	struct sock *sk = skb->sk;
2588 	unsigned int len = skb->truesize;
2589 
2590 	atomic_sub(len, &sk->sk_rmem_alloc);
2591 	sk_mem_uncharge(sk, len);
2592 }
2593 EXPORT_SYMBOL(sock_rfree);
2594 
2595 /*
2596  * Buffer destructor for skbs that are not used directly in read or write
2597  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2598  */
sock_efree(struct sk_buff * skb)2599 void sock_efree(struct sk_buff *skb)
2600 {
2601 	sock_put(skb->sk);
2602 }
2603 EXPORT_SYMBOL(sock_efree);
2604 
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The reference is only dropped when sk_is_refcounted() says one is held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2616 
sock_i_uid(struct sock * sk)2617 kuid_t sock_i_uid(struct sock *sk)
2618 {
2619 	kuid_t uid;
2620 
2621 	read_lock_bh(&sk->sk_callback_lock);
2622 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2623 	read_unlock_bh(&sk->sk_callback_lock);
2624 	return uid;
2625 }
2626 EXPORT_SYMBOL(sock_i_uid);
2627 
__sock_i_ino(struct sock * sk)2628 unsigned long __sock_i_ino(struct sock *sk)
2629 {
2630 	unsigned long ino;
2631 
2632 	read_lock(&sk->sk_callback_lock);
2633 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2634 	read_unlock(&sk->sk_callback_lock);
2635 	return ino;
2636 }
2637 EXPORT_SYMBOL(__sock_i_ino);
2638 
/*
 * BH-safe wrapper around __sock_i_ino(): bottom halves are disabled for
 * the duration of the lookup.
 */
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long inode_nr;

	local_bh_disable();
	inode_nr = __sock_i_ino(sk);
	local_bh_enable();

	return inode_nr;
}
EXPORT_SYMBOL(sock_i_ino);
2649 
2650 /*
2651  * Allocate a skb from the socket's send buffer.
2652  */
sock_wmalloc(struct sock * sk,unsigned long size,int force,gfp_t priority)2653 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2654 			     gfp_t priority)
2655 {
2656 	if (force ||
2657 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2658 		struct sk_buff *skb = alloc_skb(size, priority);
2659 
2660 		if (skb) {
2661 			skb_set_owner_w(skb, sk);
2662 			return skb;
2663 		}
2664 	}
2665 	return NULL;
2666 }
2667 EXPORT_SYMBOL(sock_wmalloc);
2668 
sock_ofree(struct sk_buff * skb)2669 static void sock_ofree(struct sk_buff *skb)
2670 {
2671 	struct sock *sk = skb->sk;
2672 
2673 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2674 }
2675 
/*
 * Allocate a skb accounted against the socket's option memory
 * (sk_omem_alloc), bounded by sysctl_optmem_max.
 *
 * Returns NULL when the limit would be exceeded or alloc_skb() fails.
 * The skb is released through sock_ofree().
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    READ_ONCE(sysctl_optmem_max))
		return NULL;

	skb = alloc_skb(size, priority);
	if (skb) {
		atomic_add(skb->truesize, &sk->sk_omem_alloc);
		skb->sk = sk;
		skb->destructor = sock_ofree;
	}

	return skb;
}
2695 
2696 /*
2697  * Allocate a memory block from the socket's option memory buffer.
2698  */
sock_kmalloc(struct sock * sk,int size,gfp_t priority)2699 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2700 {
2701 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2702 
2703 	if ((unsigned int)size <= optmem_max &&
2704 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2705 		void *mem;
2706 		/* First do the add, to avoid the race if kmalloc
2707 		 * might sleep.
2708 		 */
2709 		atomic_add(size, &sk->sk_omem_alloc);
2710 		mem = kmalloc(size, priority);
2711 		if (mem)
2712 			return mem;
2713 		atomic_sub(size, &sk->sk_omem_alloc);
2714 	}
2715 	return NULL;
2716 }
2717 EXPORT_SYMBOL(sock_kmalloc);
2718 
2719 /* Free an option memory block. Note, we actually want the inline
2720  * here as this allows gcc to detect the nullify and fold away the
2721  * condition entirely.
2722  */
__sock_kfree_s(struct sock * sk,void * mem,int size,const bool nullify)2723 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2724 				  const bool nullify)
2725 {
2726 	if (WARN_ON_ONCE(!mem))
2727 		return;
2728 	if (nullify)
2729 		kfree_sensitive(mem);
2730 	else
2731 		kfree(mem);
2732 	atomic_sub(size, &sk->sk_omem_alloc);
2733 }
2734 
sock_kfree_s(struct sock * sk,void * mem,int size)2735 void sock_kfree_s(struct sock *sk, void *mem, int size)
2736 {
2737 	__sock_kfree_s(sk, mem, size, false);
2738 }
2739 EXPORT_SYMBOL(sock_kfree_s);
2740 
sock_kzfree_s(struct sock * sk,void * mem,int size)2741 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2742 {
2743 	__sock_kfree_s(sk, mem, size, true);
2744 }
2745 EXPORT_SYMBOL(sock_kzfree_s);
2746 
2747 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2748    I think, these locks should be removed for datagram sockets.
2749  */
sock_wait_for_wmem(struct sock * sk,long timeo)2750 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2751 {
2752 	DEFINE_WAIT(wait);
2753 
2754 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2755 	for (;;) {
2756 		if (!timeo)
2757 			break;
2758 		if (signal_pending(current))
2759 			break;
2760 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2761 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2762 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2763 			break;
2764 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2765 			break;
2766 		if (READ_ONCE(sk->sk_err))
2767 			break;
2768 		timeo = schedule_timeout(timeo);
2769 	}
2770 	finish_wait(sk_sleep(sk), &wait);
2771 	return timeo;
2772 }
2773 
2774 
2775 /*
2776  *	Generic send/receive buffer handlers
2777  */
2778 
sock_alloc_send_pskb(struct sock * sk,unsigned long header_len,unsigned long data_len,int noblock,int * errcode,int max_page_order)2779 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2780 				     unsigned long data_len, int noblock,
2781 				     int *errcode, int max_page_order)
2782 {
2783 	struct sk_buff *skb;
2784 	long timeo;
2785 	int err;
2786 
2787 	timeo = sock_sndtimeo(sk, noblock);
2788 	for (;;) {
2789 		err = sock_error(sk);
2790 		if (err != 0)
2791 			goto failure;
2792 
2793 		err = -EPIPE;
2794 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2795 			goto failure;
2796 
2797 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2798 			break;
2799 
2800 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2801 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2802 		err = -EAGAIN;
2803 		if (!timeo)
2804 			goto failure;
2805 		if (signal_pending(current))
2806 			goto interrupted;
2807 		timeo = sock_wait_for_wmem(sk, timeo);
2808 	}
2809 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2810 				   errcode, sk->sk_allocation);
2811 	if (skb)
2812 		skb_set_owner_w(skb, sk);
2813 	return skb;
2814 
2815 interrupted:
2816 	err = sock_intr_errno(timeo);
2817 failure:
2818 	*errcode = err;
2819 	return NULL;
2820 }
2821 EXPORT_SYMBOL(sock_alloc_send_pskb);
2822 
__sock_cmsg_send(struct sock * sk,struct cmsghdr * cmsg,struct sockcm_cookie * sockc)2823 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2824 		     struct sockcm_cookie *sockc)
2825 {
2826 	u32 tsflags;
2827 
2828 	switch (cmsg->cmsg_type) {
2829 	case SO_MARK:
2830 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2831 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2832 			return -EPERM;
2833 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2834 			return -EINVAL;
2835 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2836 		break;
2837 	case SO_TIMESTAMPING_OLD:
2838 	case SO_TIMESTAMPING_NEW:
2839 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2840 			return -EINVAL;
2841 
2842 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2843 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2844 			return -EINVAL;
2845 
2846 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2847 		sockc->tsflags |= tsflags;
2848 		break;
2849 	case SCM_TXTIME:
2850 		if (!sock_flag(sk, SOCK_TXTIME))
2851 			return -EINVAL;
2852 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2853 			return -EINVAL;
2854 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2855 		break;
2856 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2857 	case SCM_RIGHTS:
2858 	case SCM_CREDENTIALS:
2859 		break;
2860 	default:
2861 		return -EINVAL;
2862 	}
2863 	return 0;
2864 }
2865 EXPORT_SYMBOL(__sock_cmsg_send);
2866 
sock_cmsg_send(struct sock * sk,struct msghdr * msg,struct sockcm_cookie * sockc)2867 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2868 		   struct sockcm_cookie *sockc)
2869 {
2870 	struct cmsghdr *cmsg;
2871 	int ret;
2872 
2873 	for_each_cmsghdr(cmsg, msg) {
2874 		if (!CMSG_OK(msg, cmsg))
2875 			return -EINVAL;
2876 		if (cmsg->cmsg_level != SOL_SOCKET)
2877 			continue;
2878 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2879 		if (ret)
2880 			return ret;
2881 	}
2882 	return 0;
2883 }
2884 EXPORT_SYMBOL(sock_cmsg_send);
2885 
sk_enter_memory_pressure(struct sock * sk)2886 static void sk_enter_memory_pressure(struct sock *sk)
2887 {
2888 	if (!sk->sk_prot->enter_memory_pressure)
2889 		return;
2890 
2891 	sk->sk_prot->enter_memory_pressure(sk);
2892 }
2893 
sk_leave_memory_pressure(struct sock * sk)2894 static void sk_leave_memory_pressure(struct sock *sk)
2895 {
2896 	if (sk->sk_prot->leave_memory_pressure) {
2897 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2898 				     tcp_leave_memory_pressure, sk);
2899 	} else {
2900 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2901 
2902 		if (memory_pressure && READ_ONCE(*memory_pressure))
2903 			WRITE_ONCE(*memory_pressure, 0);
2904 	}
2905 }
2906 
2907 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2908 
2909 /**
2910  * skb_page_frag_refill - check that a page_frag contains enough room
2911  * @sz: minimum size of the fragment we want to get
2912  * @pfrag: pointer to page_frag
2913  * @gfp: priority for memory allocation
2914  *
2915  * Note: While this allocator tries to use high order pages, there is
2916  * no guarantee that allocations succeed. Therefore, @sz MUST be
2917  * less or equal than PAGE_SIZE.
2918  */
skb_page_frag_refill(unsigned int sz,struct page_frag * pfrag,gfp_t gfp)2919 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2920 {
2921 	if (pfrag->page) {
2922 		if (page_ref_count(pfrag->page) == 1) {
2923 			pfrag->offset = 0;
2924 			return true;
2925 		}
2926 		if (pfrag->offset + sz <= pfrag->size)
2927 			return true;
2928 		put_page(pfrag->page);
2929 	}
2930 
2931 	pfrag->offset = 0;
2932 	if (SKB_FRAG_PAGE_ORDER &&
2933 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2934 		/* Avoid direct reclaim but allow kswapd to wake */
2935 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2936 					  __GFP_COMP | __GFP_NOWARN |
2937 					  __GFP_NORETRY,
2938 					  SKB_FRAG_PAGE_ORDER);
2939 		if (likely(pfrag->page)) {
2940 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2941 			return true;
2942 		}
2943 	}
2944 	pfrag->page = alloc_page(gfp);
2945 	if (likely(pfrag->page)) {
2946 		pfrag->size = PAGE_SIZE;
2947 		return true;
2948 	}
2949 	return false;
2950 }
2951 EXPORT_SYMBOL(skb_page_frag_refill);
2952 
sk_page_frag_refill(struct sock * sk,struct page_frag * pfrag)2953 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2954 {
2955 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2956 		return true;
2957 
2958 	sk_enter_memory_pressure(sk);
2959 	sk_stream_moderate_sndbuf(sk);
2960 	return false;
2961 }
2962 EXPORT_SYMBOL(sk_page_frag_refill);
2963 
__lock_sock(struct sock * sk)2964 void __lock_sock(struct sock *sk)
2965 	__releases(&sk->sk_lock.slock)
2966 	__acquires(&sk->sk_lock.slock)
2967 {
2968 	DEFINE_WAIT(wait);
2969 
2970 	for (;;) {
2971 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2972 					TASK_UNINTERRUPTIBLE);
2973 		spin_unlock_bh(&sk->sk_lock.slock);
2974 		schedule();
2975 		spin_lock_bh(&sk->sk_lock.slock);
2976 		if (!sock_owned_by_user(sk))
2977 			break;
2978 	}
2979 	finish_wait(&sk->sk_lock.wq, &wait);
2980 }
2981 
__release_sock(struct sock * sk)2982 void __release_sock(struct sock *sk)
2983 	__releases(&sk->sk_lock.slock)
2984 	__acquires(&sk->sk_lock.slock)
2985 {
2986 	struct sk_buff *skb, *next;
2987 
2988 	while ((skb = sk->sk_backlog.head) != NULL) {
2989 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2990 
2991 		spin_unlock_bh(&sk->sk_lock.slock);
2992 
2993 		do {
2994 			next = skb->next;
2995 			prefetch(next);
2996 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2997 			skb_mark_not_on_list(skb);
2998 			sk_backlog_rcv(sk, skb);
2999 
3000 			cond_resched();
3001 
3002 			skb = next;
3003 		} while (skb != NULL);
3004 
3005 		spin_lock_bh(&sk->sk_lock.slock);
3006 	}
3007 
3008 	/*
3009 	 * Doing the zeroing here guarantee we can not loop forever
3010 	 * while a wild producer attempts to flood us.
3011 	 */
3012 	sk->sk_backlog.len = 0;
3013 }
3014 
__sk_flush_backlog(struct sock * sk)3015 void __sk_flush_backlog(struct sock *sk)
3016 {
3017 	spin_lock_bh(&sk->sk_lock.slock);
3018 	__release_sock(sk);
3019 	spin_unlock_bh(&sk->sk_lock.slock);
3020 }
3021 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3022 
3023 /**
3024  * sk_wait_data - wait for data to arrive at sk_receive_queue
3025  * @sk:    sock to wait on
3026  * @timeo: for how long
3027  * @skb:   last skb seen on sk_receive_queue
3028  *
3029  * Now socket state including sk->sk_err is changed only under lock,
3030  * hence we may omit checks after joining wait queue.
3031  * We check receive queue before schedule() only as optimization;
3032  * it is very likely that release_sock() added new data.
3033  */
sk_wait_data(struct sock * sk,long * timeo,const struct sk_buff * skb)3034 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3035 {
3036 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3037 	int rc;
3038 
3039 	add_wait_queue(sk_sleep(sk), &wait);
3040 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3041 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3042 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3043 	remove_wait_queue(sk_sleep(sk), &wait);
3044 	return rc;
3045 }
3046 EXPORT_SYMBOL(sk_wait_data);
3047 
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *	@amt is added to the protocol's memory_allocated counter (and charged
 *	to the socket's memcg when there is one) before the limits are
 *	checked.  If the allocation is refused both are undone.
 *
 *	Return: 1 when the allocation is allowed, 0 when it is suppressed.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
	struct proto *prot = sk->sk_prot;
	bool charged = true;
	long allocated;

	/* Account optimistically, then decide whether to keep it. */
	sk_memory_allocated_add(sk, amt);
	allocated = sk_memory_allocated(sk);
	if (memcg_charge &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
						gfp_memcg_charge())))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		/* Stream sockets are measured by sk_wmem_queued, the others
		 * by sk_wmem_alloc.
		 */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		/* Allow the allocation while the hard limit still exceeds
		 * (number of sockets) * (pages used by this socket).
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg_charge && !charged) {
				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	/* Trace every refused send; refused receives only when the memcg
	 * charge was not what failed.
	 */
	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Roll back the optimistic accounting done at the top. */
	sk_memory_allocated_sub(sk, amt);

	if (memcg_charge && charged)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
3142 
3143 /**
3144  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3145  *	@sk: socket
3146  *	@size: memory size to allocate
3147  *	@kind: allocation type
3148  *
3149  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3150  *	rmem allocation. This function assumes that protocols which have
3151  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3152  */
__sk_mem_schedule(struct sock * sk,int size,int kind)3153 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3154 {
3155 	int ret, amt = sk_mem_pages(size);
3156 
3157 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3158 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3159 	if (!ret)
3160 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3161 	return ret;
3162 }
3163 EXPORT_SYMBOL(__sk_mem_schedule);
3164 
3165 /**
3166  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3167  *	@sk: socket
3168  *	@amount: number of quanta
3169  *
3170  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3171  */
__sk_mem_reduce_allocated(struct sock * sk,int amount)3172 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3173 {
3174 	sk_memory_allocated_sub(sk, amount);
3175 
3176 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3177 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3178 
3179 	if (sk_under_global_memory_pressure(sk) &&
3180 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3181 		sk_leave_memory_pressure(sk);
3182 }
3183 
3184 /**
3185  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3186  *	@sk: socket
3187  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3188  */
__sk_mem_reclaim(struct sock * sk,int amount)3189 void __sk_mem_reclaim(struct sock *sk, int amount)
3190 {
3191 	amount >>= PAGE_SHIFT;
3192 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3193 	__sk_mem_reduce_allocated(sk, amount);
3194 }
3195 EXPORT_SYMBOL(__sk_mem_reclaim);
3196 
sk_set_peek_off(struct sock * sk,int val)3197 int sk_set_peek_off(struct sock *sk, int val)
3198 {
3199 	WRITE_ONCE(sk->sk_peek_off, val);
3200 	return 0;
3201 }
3202 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3203 
3204 /*
3205  * Set of default routines for initialising struct proto_ops when
3206  * the protocol does not support a particular function. In certain
3207  * cases where it makes no sense for a protocol to have a "do nothing"
3208  * function, some default processing is provided.
3209  */
3210 
sock_no_bind(struct socket * sock,struct sockaddr * saddr,int len)3211 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3212 {
3213 	return -EOPNOTSUPP;
3214 }
3215 EXPORT_SYMBOL(sock_no_bind);
3216 
sock_no_connect(struct socket * sock,struct sockaddr * saddr,int len,int flags)3217 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3218 		    int len, int flags)
3219 {
3220 	return -EOPNOTSUPP;
3221 }
3222 EXPORT_SYMBOL(sock_no_connect);
3223 
sock_no_socketpair(struct socket * sock1,struct socket * sock2)3224 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3225 {
3226 	return -EOPNOTSUPP;
3227 }
3228 EXPORT_SYMBOL(sock_no_socketpair);
3229 
/* Default accept handler for protocols without accept support; arguments are ignored. */
int sock_no_accept(struct socket *sock, struct socket *newsock,
		   int flags, bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
3236 
sock_no_getname(struct socket * sock,struct sockaddr * saddr,int peer)3237 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3238 		    int peer)
3239 {
3240 	return -EOPNOTSUPP;
3241 }
3242 EXPORT_SYMBOL(sock_no_getname);
3243 
sock_no_ioctl(struct socket * sock,unsigned int cmd,unsigned long arg)3244 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3245 {
3246 	return -EOPNOTSUPP;
3247 }
3248 EXPORT_SYMBOL(sock_no_ioctl);
3249 
sock_no_listen(struct socket * sock,int backlog)3250 int sock_no_listen(struct socket *sock, int backlog)
3251 {
3252 	return -EOPNOTSUPP;
3253 }
3254 EXPORT_SYMBOL(sock_no_listen);
3255 
sock_no_shutdown(struct socket * sock,int how)3256 int sock_no_shutdown(struct socket *sock, int how)
3257 {
3258 	return -EOPNOTSUPP;
3259 }
3260 EXPORT_SYMBOL(sock_no_shutdown);
3261 
/* Default sendmsg handler for protocols without sendmsg support; arguments are ignored. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
3267 
/*
 * Variant of sock_no_sendmsg() that takes a struct sock instead of a
 * struct socket; it likewise ignores its arguments.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
3273 
/* Default recvmsg handler for protocols without recvmsg support; arguments are ignored. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
3280 
sock_no_mmap(struct file * file,struct socket * sock,struct vm_area_struct * vma)3281 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3282 {
3283 	/* Mirror missing mmap method error code */
3284 	return -ENODEV;
3285 }
3286 EXPORT_SYMBOL(sock_no_mmap);
3287 
3288 /*
3289  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3290  * various sock-based usage counts.
3291  */
__receive_sock(struct file * file)3292 void __receive_sock(struct file *file)
3293 {
3294 	struct socket *sock;
3295 
3296 	sock = sock_from_file(file);
3297 	if (sock) {
3298 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3299 		sock_update_classid(&sock->sk->sk_cgrp_data);
3300 	}
3301 }
3302 
3303 /*
3304  *	Default Socket Callbacks
3305  */
3306 
sock_def_wakeup(struct sock * sk)3307 static void sock_def_wakeup(struct sock *sk)
3308 {
3309 	struct socket_wq *wq;
3310 
3311 	rcu_read_lock();
3312 	wq = rcu_dereference(sk->sk_wq);
3313 	if (skwq_has_sleeper(wq))
3314 		wake_up_interruptible_all(&wq->wait);
3315 	rcu_read_unlock();
3316 }
3317 
sock_def_error_report(struct sock * sk)3318 static void sock_def_error_report(struct sock *sk)
3319 {
3320 	struct socket_wq *wq;
3321 
3322 	rcu_read_lock();
3323 	wq = rcu_dereference(sk->sk_wq);
3324 	if (skwq_has_sleeper(wq))
3325 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3326 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3327 	rcu_read_unlock();
3328 }
3329 
sock_def_readable(struct sock * sk)3330 void sock_def_readable(struct sock *sk)
3331 {
3332 	struct socket_wq *wq;
3333 
3334 	trace_sk_data_ready(sk);
3335 
3336 	rcu_read_lock();
3337 	wq = rcu_dereference(sk->sk_wq);
3338 
3339 	if (skwq_has_sleeper(wq)) {
3340 		int done = 0;
3341 
3342 		trace_android_vh_do_wake_up_sync(&wq->wait, &done, sk);
3343 		if (done)
3344 			goto out;
3345 
3346 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3347 						EPOLLRDNORM | EPOLLRDBAND);
3348 	}
3349 
3350 out:
3351 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3352 	rcu_read_unlock();
3353 }
3354 
sock_def_write_space(struct sock * sk)3355 static void sock_def_write_space(struct sock *sk)
3356 {
3357 	struct socket_wq *wq;
3358 
3359 	rcu_read_lock();
3360 
3361 	/* Do not wake up a writer until he can make "significant"
3362 	 * progress.  --DaveM
3363 	 */
3364 	if (sock_writeable(sk)) {
3365 		wq = rcu_dereference(sk->sk_wq);
3366 		if (skwq_has_sleeper(wq))
3367 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3368 						EPOLLWRNORM | EPOLLWRBAND);
3369 
3370 		/* Should agree with poll, otherwise some programs break */
3371 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3372 	}
3373 
3374 	rcu_read_unlock();
3375 }
3376 
3377 /* An optimised version of sock_def_write_space(), should only be called
3378  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3379  * ->sk_wmem_alloc.
3380  */
/*
 * Differences from sock_def_write_space() visible here: this variant does
 * not take rcu_read_lock() itself, and it tests the wait queue with
 * waitqueue_active() after an explicit smp_mb__after_atomic() instead of
 * calling skwq_has_sleeper().
 */
sock_def_write_space_wfree(struct sock * sk)3381 static void sock_def_write_space_wfree(struct sock *sk)
3382 {
3383 	/* Do not wake up a writer until he can make "significant"
3384 	 * progress.  --DaveM
3385 	 */
3386 	if (sock_writeable(sk)) {
3387 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3388 
3389 		/* rely on refcount_sub from sock_wfree() */
3390 		smp_mb__after_atomic();
		/* wq is NULL-checked here before its wait queue is inspected */
3391 		if (wq && waitqueue_active(&wq->wait))
3392 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3393 						EPOLLWRNORM | EPOLLWRBAND);
3394 
3395 		/* Should agree with poll, otherwise some programs break */
3396 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3397 	}
3398 }
3399 
/* Default ->sk_destruct callback installed by sock_init_data_uid(); a no-op. */
static void sock_def_destruct(struct sock *sk)
{
	/* nothing to tear down by default */
}
3403 
sk_send_sigurg(struct sock * sk)3404 void sk_send_sigurg(struct sock *sk)
3405 {
3406 	if (sk->sk_socket && sk->sk_socket->file)
3407 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3408 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3409 }
3410 EXPORT_SYMBOL(sk_send_sigurg);
3411 
/**
 * sk_reset_timer - (re)arm a socket timer
 * @sk: socket
 * @timer: timer to arm
 * @expires: expiry passed to mod_timer()
 *
 * Takes a reference on @sk with sock_hold() when mod_timer() returns
 * zero; no reference is taken otherwise.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3419 
/**
 * sk_stop_timer - cancel a socket timer
 * @sk: socket
 * @timer: timer to cancel
 *
 * Drops a reference on @sk with __sock_put() when del_timer() returns
 * non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3426 
/**
 * sk_stop_timer_sync - cancel a socket timer using del_timer_sync()
 * @sk: socket
 * @timer: timer to cancel
 *
 * Same as sk_stop_timer() except that the timer is removed with
 * del_timer_sync(). Drops a reference on @sk with __sock_put() when that
 * call returns non-zero.
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer_sync(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3433 
/**
 * sock_init_data_uid - initialise the generic part of a struct sock
 * @sock: owning socket, may be NULL
 * @sk: sock to initialise
 * @uid: value stored in sk->sk_uid
 *
 * Fills in default buffer sizes, timeouts, callbacks and bookkeeping
 * fields. When @sock is non-NULL the two objects are linked to each
 * other and @sk uses @sock's wait queue; otherwise sk->sk_wq is NULL.
 * The reference count is set to 1 last, after a write barrier.
 */
sock_init_data_uid(struct socket * sock,struct sock * sk,kuid_t uid)3434 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3435 {
3436 	sk_init_common(sk);
3437 	sk->sk_send_head	=	NULL;
3438 
3439 	timer_setup(&sk->sk_timer, NULL, 0);
3440 
3441 	sk->sk_allocation	=	GFP_KERNEL;
3442 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3443 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3444 	sk->sk_state		=	TCP_CLOSE;
3445 	sk->sk_use_task_frag	=	true;
3446 	sk_set_socket(sk, sock);
3447 
3448 	sock_set_flag(sk, SOCK_ZAPPED);
3449 
	/* Link sk and sock in both directions when a socket is supplied. */
3450 	if (sock) {
3451 		sk->sk_type	=	sock->type;
3452 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3453 		sock->sk	=	sk;
3454 	} else {
3455 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3456 	}
3457 	sk->sk_uid	=	uid;
3458 
	/*
	 * The lockdep class of sk_callback_lock is chosen per address family,
	 * from a separate key table for kernel sockets (sk_kern_sock).
	 */
3459 	rwlock_init(&sk->sk_callback_lock);
3460 	if (sk->sk_kern_sock)
3461 		lockdep_set_class_and_name(
3462 			&sk->sk_callback_lock,
3463 			af_kern_callback_keys + sk->sk_family,
3464 			af_family_kern_clock_key_strings[sk->sk_family]);
3465 	else
3466 		lockdep_set_class_and_name(
3467 			&sk->sk_callback_lock,
3468 			af_callback_keys + sk->sk_family,
3469 			af_family_clock_key_strings[sk->sk_family]);
3470 
	/* Install the default callbacks defined earlier in this file. */
3471 	sk->sk_state_change	=	sock_def_wakeup;
3472 	sk->sk_data_ready	=	sock_def_readable;
3473 	sk->sk_write_space	=	sock_def_write_space;
3474 	sk->sk_error_report	=	sock_def_error_report;
3475 	sk->sk_destruct		=	sock_def_destruct;
3476 
3477 	sk->sk_frag.page	=	NULL;
3478 	sk->sk_frag.offset	=	0;
3479 	sk->sk_peek_off		=	-1;
3480 
3481 	sk->sk_peer_pid 	=	NULL;
3482 	sk->sk_peer_cred	=	NULL;
3483 	spin_lock_init(&sk->sk_peer_lock);
3484 
3485 	sk->sk_write_pending	=	0;
3486 	sk->sk_rcvlowat		=	1;
3487 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3488 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3489 
3490 	sk->sk_stamp = SK_DEFAULT_STAMP;
3491 #if BITS_PER_LONG==32
3492 	seqlock_init(&sk->sk_stamp_seq);
3493 #endif
3494 	atomic_set(&sk->sk_zckey, 0);
3495 
3496 #ifdef CONFIG_NET_RX_BUSY_POLL
3497 	sk->sk_napi_id		=	0;
3498 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3499 #endif
3500 
3501 	sk->sk_max_pacing_rate = ~0UL;
3502 	sk->sk_pacing_rate = ~0UL;
3503 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3504 	sk->sk_incoming_cpu = -1;
3505 
3506 	sk_rx_queue_clear(sk);
3507 	/*
3508 	 * Before updating sk_refcnt, we must commit prior changes to memory
3509 	 * (Documentation/RCU/rculist_nulls.rst for details)
3510 	 */
3511 	smp_wmb();
3512 	refcount_set(&sk->sk_refcnt, 1);
3513 	atomic_set(&sk->sk_drops, 0);
3514 }
3515 EXPORT_SYMBOL(sock_init_data_uid);
3516 
sock_init_data(struct socket * sock,struct sock * sk)3517 void sock_init_data(struct socket *sock, struct sock *sk)
3518 {
3519 	kuid_t uid = sock ?
3520 		SOCK_INODE(sock)->i_uid :
3521 		make_kuid(sock_net(sk)->user_ns, 0);
3522 
3523 	sock_init_data_uid(sock, sk, uid);
3524 }
3525 EXPORT_SYMBOL(sock_init_data);
3526 
/**
 * lock_sock_nested - take ownership of a socket (may sleep)
 * @sk: socket
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * Under sk_lock.slock with bottom halves disabled: if the socket is
 * already owned, __lock_sock() is called first (presumably waiting for
 * the current owner - confirm against __lock_sock()); then
 * sk_lock.owned is set to 1. The spinlock is dropped before returning.
 */
lock_sock_nested(struct sock * sk,int subclass)3527 void lock_sock_nested(struct sock *sk, int subclass)
3528 {
3529 	/* The sk_lock has mutex_lock() semantics here. */
3530 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3531 
3532 	might_sleep();
3533 	spin_lock_bh(&sk->sk_lock.slock);
3534 	if (sock_owned_by_user_nocheck(sk))
3535 		__lock_sock(sk);
3536 	sk->sk_lock.owned = 1;
3537 	spin_unlock_bh(&sk->sk_lock.slock);
3538 }
3539 EXPORT_SYMBOL(lock_sock_nested);
3540 
/**
 * release_sock - give up ownership of a socket
 * @sk: socket
 *
 * With sk_lock.slock held and bottom halves disabled: runs
 * __release_sock() when the backlog has a tail entry, invokes the
 * protocol's release_cb() if it has one, releases ownership and wakes
 * sk_lock.wq if that wait queue is active.
 */
release_sock(struct sock * sk)3541 void release_sock(struct sock *sk)
3542 {
3543 	spin_lock_bh(&sk->sk_lock.slock);
3544 	if (sk->sk_backlog.tail)
3545 		__release_sock(sk);
3546 
3547 	/* Warning : release_cb() might need to release sk ownership,
3548 	 * ie call sock_release_ownership(sk) before us.
3549 	 */
3550 	if (sk->sk_prot->release_cb)
3551 		sk->sk_prot->release_cb(sk);
3552 
3553 	sock_release_ownership(sk);
3554 	if (waitqueue_active(&sk->sk_lock.wq))
3555 		wake_up(&sk->sk_lock.wq);
3556 	spin_unlock_bh(&sk->sk_lock.slock);
3557 }
3558 EXPORT_SYMBOL(release_sock);
3559 
/**
 * __lock_sock_fast - lock a socket, preferring the spinlock-only path
 * @sk: socket
 *
 * Return: false when the socket was not owned; in that case the function
 * returns with sk_lock.slock still held and bottom halves disabled.
 * true when the slow path was taken: __lock_sock() was called,
 * sk_lock.owned was set to 1 and the spinlock was released.
 */
__lock_sock_fast(struct sock * sk)3560 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3561 {
3562 	might_sleep();
3563 	spin_lock_bh(&sk->sk_lock.slock);
3564 
3565 	if (!sock_owned_by_user_nocheck(sk)) {
3566 		/*
3567 		 * Fast path return with bottom halves disabled and
3568 		 * sock::sk_lock.slock held.
3569 		 *
3570 		 * The 'mutex' is not contended and holding
3571 		 * sock::sk_lock.slock prevents all other lockers to
3572 		 * proceed so the corresponding unlock_sock_fast() can
3573 		 * avoid the slow path of release_sock() completely and
3574 		 * just release slock.
3575 		 *
3576 		 * From a semantical POV this is equivalent to 'acquiring'
3577 		 * the 'mutex', hence the corresponding lockdep
3578 		 * mutex_release() has to happen in the fast path of
3579 		 * unlock_sock_fast().
3580 		 */
3581 		return false;
3582 	}
3583 
3584 	__lock_sock(sk);
3585 	sk->sk_lock.owned = 1;
	/*
	 * NOTE(review): __acquire() looks like a static-checker annotation
	 * that balances the __acquires() on the signature - confirm.
	 */
3586 	__acquire(&sk->sk_lock.slock);
3587 	spin_unlock_bh(&sk->sk_lock.slock);
3588 	return true;
3589 }
3590 EXPORT_SYMBOL(__lock_sock_fast);
3591 
/**
 * sock_gettstamp - copy the socket's timestamp to user space
 * @sock: socket
 * @userstamp: user buffer receiving the timestamp
 * @timeval: report microseconds instead of nanoseconds in the sub-second field
 * @time32: use the old 32-bit layout (only with CONFIG_COMPAT_32BIT_TIME)
 *
 * Enables SOCK_TIMESTAMP on the socket as a side effect.
 *
 * Return: -ENOENT when the stored stamp has tv_sec == -1, -EFAULT when
 * the sparc64 copy_to_user() fails, otherwise the result of the
 * put_old_timespec32()/put_timespec64() helper (0 on the sparc64 path).
 */
sock_gettstamp(struct socket * sock,void __user * userstamp,bool timeval,bool time32)3592 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3593 		   bool timeval, bool time32)
3594 {
3595 	struct sock *sk = sock->sk;
3596 	struct timespec64 ts;
3597 
3598 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3599 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3600 	if (ts.tv_sec == -1)
3601 		return -ENOENT;
	/* A zero stamp is replaced by the current real time and stored back. */
3602 	if (ts.tv_sec == 0) {
3603 		ktime_t kt = ktime_get_real();
3604 		sock_write_timestamp(sk, kt);
3605 		ts = ktime_to_timespec64(kt);
3606 	}
3607 
	/* From here on tv_nsec carries microseconds when @timeval is set. */
3608 	if (timeval)
3609 		ts.tv_nsec /= 1000;
3610 
3611 #ifdef CONFIG_COMPAT_32BIT_TIME
3612 	if (time32)
3613 		return put_old_timespec32(&ts, userstamp);
3614 #endif
3615 #ifdef CONFIG_SPARC64
3616 	/* beware of padding in sparc64 timeval */
3617 	if (timeval && !in_compat_syscall()) {
3618 		struct __kernel_old_timeval __user tv = {
3619 			.tv_sec = ts.tv_sec,
3620 			.tv_usec = ts.tv_nsec,
3621 		};
3622 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3623 			return -EFAULT;
3624 		return 0;
3625 	}
3626 #endif
3627 	return put_timespec64(&ts, userstamp);
3628 }
3629 EXPORT_SYMBOL(sock_gettstamp);
3630 
sock_enable_timestamp(struct sock * sk,enum sock_flags flag)3631 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3632 {
3633 	if (!sock_flag(sk, flag)) {
3634 		unsigned long previous_flags = sk->sk_flags;
3635 
3636 		sock_set_flag(sk, flag);
3637 		/*
3638 		 * we just set one of the two flags which require net
3639 		 * time stamping, but time stamping might have been on
3640 		 * already because of the other one
3641 		 */
3642 		if (sock_needs_netstamp(sk) &&
3643 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3644 			net_enable_timestamp();
3645 	}
3646 }
3647 
sock_recv_errqueue(struct sock * sk,struct msghdr * msg,int len,int level,int type)3648 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3649 		       int level, int type)
3650 {
3651 	struct sock_exterr_skb *serr;
3652 	struct sk_buff *skb;
3653 	int copied, err;
3654 
3655 	err = -EAGAIN;
3656 	skb = sock_dequeue_err_skb(sk);
3657 	if (skb == NULL)
3658 		goto out;
3659 
3660 	copied = skb->len;
3661 	if (copied > len) {
3662 		msg->msg_flags |= MSG_TRUNC;
3663 		copied = len;
3664 	}
3665 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3666 	if (err)
3667 		goto out_free_skb;
3668 
3669 	sock_recv_timestamp(msg, sk, skb);
3670 
3671 	serr = SKB_EXT_ERR(skb);
3672 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3673 
3674 	msg->msg_flags |= MSG_ERRQUEUE;
3675 	err = copied;
3676 
3677 out_free_skb:
3678 	kfree_skb(skb);
3679 out:
3680 	return err;
3681 }
3682 EXPORT_SYMBOL(sock_recv_errqueue);
3683 
3684 /*
3685  *	Get a socket option on an socket.
3686  *
3687  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3688  *	asynchronous errors should be reported by getsockopt. We assume
3689  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3690  */
sock_common_getsockopt(struct socket * sock,int level,int optname,char __user * optval,int __user * optlen)3691 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3692 			   char __user *optval, int __user *optlen)
3693 {
3694 	struct sock *sk = sock->sk;
3695 
3696 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3697 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3698 }
3699 EXPORT_SYMBOL(sock_common_getsockopt);
3700 
/**
 * sock_common_recvmsg - forward recvmsg to the protocol handler
 * @sock: socket
 * @msg: message to fill
 * @size: buffer size
 * @flags: receive flags
 *
 * Calls the protocol's recvmsg() and, on a non-negative result, stores
 * the address length it reported in msg->msg_namelen.
 *
 * Return: whatever the protocol's recvmsg() returned.
 */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int ret;

	ret = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
	if (ret < 0)
		return ret;

	msg->msg_namelen = addr_len;
	return ret;
}
EXPORT_SYMBOL(sock_common_recvmsg);
3714 
3715 /*
3716  *	Set socket options on an inet socket.
3717  */
sock_common_setsockopt(struct socket * sock,int level,int optname,sockptr_t optval,unsigned int optlen)3718 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3719 			   sockptr_t optval, unsigned int optlen)
3720 {
3721 	struct sock *sk = sock->sk;
3722 
3723 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3724 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3725 }
3726 EXPORT_SYMBOL(sock_common_setsockopt);
3727 
sk_common_release(struct sock * sk)3728 void sk_common_release(struct sock *sk)
3729 {
3730 	if (sk->sk_prot->destroy)
3731 		sk->sk_prot->destroy(sk);
3732 
3733 	/*
3734 	 * Observation: when sk_common_release is called, processes have
3735 	 * no access to socket. But net still has.
3736 	 * Step one, detach it from networking:
3737 	 *
3738 	 * A. Remove from hash tables.
3739 	 */
3740 
3741 	sk->sk_prot->unhash(sk);
3742 
3743 	if (sk->sk_socket)
3744 		sk->sk_socket->sk = NULL;
3745 
3746 	/*
3747 	 * In this point socket cannot receive new packets, but it is possible
3748 	 * that some packets are in flight because some CPU runs receiver and
3749 	 * did hash table lookup before we unhashed socket. They will achieve
3750 	 * receive queue and will be purged by socket destructor.
3751 	 *
3752 	 * Also we still have packets pending on receive queue and probably,
3753 	 * our own packets waiting in device queues. sock_destroy will drain
3754 	 * receive queue, but transmitted packets will delay socket destruction
3755 	 * until the last reference will be released.
3756 	 */
3757 
3758 	sock_orphan(sk);
3759 
3760 	xfrm_sk_free_policy(sk);
3761 
3762 	sock_put(sk);
3763 }
3764 EXPORT_SYMBOL(sk_common_release);
3765 
/**
 * sk_get_meminfo - snapshot a socket's memory counters
 * @sk: socket to read
 * @mem: array of SK_MEMINFO_VARS u32 entries, indexed by SK_MEMINFO_*
 *
 * The array is zeroed first so entries not written below read as 0.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, SK_MEMINFO_VARS * sizeof(mem[0]));

	/* receive side */
	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);

	/* send side */
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);

	/* remaining counters */
	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
3780 
3781 #ifdef CONFIG_PROC_FS
/* One bit per protocol inuse index handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3783 
sock_prot_inuse_get(struct net * net,struct proto * prot)3784 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3785 {
3786 	int cpu, idx = prot->inuse_idx;
3787 	int res = 0;
3788 
3789 	for_each_possible_cpu(cpu)
3790 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3791 
3792 	return res >= 0 ? res : 0;
3793 }
3794 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3795 
sock_inuse_get(struct net * net)3796 int sock_inuse_get(struct net *net)
3797 {
3798 	int cpu, res = 0;
3799 
3800 	for_each_possible_cpu(cpu)
3801 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3802 
3803 	return res;
3804 }
3805 
3806 EXPORT_SYMBOL_GPL(sock_inuse_get);
3807 
sock_inuse_init_net(struct net * net)3808 static int __net_init sock_inuse_init_net(struct net *net)
3809 {
3810 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3811 	if (net->core.prot_inuse == NULL)
3812 		return -ENOMEM;
3813 	return 0;
3814 }
3815 
sock_inuse_exit_net(struct net * net)3816 static void __net_exit sock_inuse_exit_net(struct net *net)
3817 {
3818 	free_percpu(net->core.prot_inuse);
3819 }
3820 
3821 static struct pernet_operations net_inuse_ops = {
3822 	.init = sock_inuse_init_net,
3823 	.exit = sock_inuse_exit_net,
3824 };
3825 
net_inuse_init(void)3826 static __init int net_inuse_init(void)
3827 {
3828 	if (register_pernet_subsys(&net_inuse_ops))
3829 		panic("Cannot initialize net inuse counters");
3830 
3831 	return 0;
3832 }
3833 
3834 core_initcall(net_inuse_init);
3835 
assign_proto_idx(struct proto * prot)3836 static int assign_proto_idx(struct proto *prot)
3837 {
3838 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3839 
3840 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3841 		pr_err("PROTO_INUSE_NR exhausted\n");
3842 		return -ENOSPC;
3843 	}
3844 
3845 	set_bit(prot->inuse_idx, proto_inuse_idx);
3846 	return 0;
3847 }
3848 
release_proto_idx(struct proto * prot)3849 static void release_proto_idx(struct proto *prot)
3850 {
3851 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3852 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3853 }
3854 #else
/* !CONFIG_PROC_FS stub: there is no index bookkeeping, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3859 
/* !CONFIG_PROC_FS stub: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3863 
3864 #endif
3865 
tw_prot_cleanup(struct timewait_sock_ops * twsk_prot)3866 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3867 {
3868 	if (!twsk_prot)
3869 		return;
3870 	kfree(twsk_prot->twsk_slab_name);
3871 	twsk_prot->twsk_slab_name = NULL;
3872 	kmem_cache_destroy(twsk_prot->twsk_slab);
3873 	twsk_prot->twsk_slab = NULL;
3874 }
3875 
tw_prot_init(const struct proto * prot)3876 static int tw_prot_init(const struct proto *prot)
3877 {
3878 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3879 
3880 	if (!twsk_prot)
3881 		return 0;
3882 
3883 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3884 					      prot->name);
3885 	if (!twsk_prot->twsk_slab_name)
3886 		return -ENOMEM;
3887 
3888 	twsk_prot->twsk_slab =
3889 		kmem_cache_create(twsk_prot->twsk_slab_name,
3890 				  twsk_prot->twsk_obj_size, 0,
3891 				  SLAB_ACCOUNT | prot->slab_flags,
3892 				  NULL);
3893 	if (!twsk_prot->twsk_slab) {
3894 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3895 			prot->name);
3896 		return -ENOMEM;
3897 	}
3898 
3899 	return 0;
3900 }
3901 
req_prot_cleanup(struct request_sock_ops * rsk_prot)3902 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3903 {
3904 	if (!rsk_prot)
3905 		return;
3906 	kfree(rsk_prot->slab_name);
3907 	rsk_prot->slab_name = NULL;
3908 	kmem_cache_destroy(rsk_prot->slab);
3909 	rsk_prot->slab = NULL;
3910 }
3911 
req_prot_init(const struct proto * prot)3912 static int req_prot_init(const struct proto *prot)
3913 {
3914 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3915 
3916 	if (!rsk_prot)
3917 		return 0;
3918 
3919 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3920 					prot->name);
3921 	if (!rsk_prot->slab_name)
3922 		return -ENOMEM;
3923 
3924 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3925 					   rsk_prot->obj_size, 0,
3926 					   SLAB_ACCOUNT | prot->slab_flags,
3927 					   NULL);
3928 
3929 	if (!rsk_prot->slab) {
3930 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3931 			prot->name);
3932 		return -ENOMEM;
3933 	}
3934 	return 0;
3935 }
3936 
/**
 * proto_register - register a protocol with the socket layer
 * @prot: protocol to register
 * @alloc_slab: non-zero to create the sock, request-sock and timewait
 *              slab caches for @prot
 *
 * Return: 0 on success; -EINVAL when @prot has memory_allocated but lacks
 * sysctl_mem or per_cpu_fw_alloc; -ENOBUFS when any slab cache cannot be
 * created; or the error from assign_proto_idx(). On failure every cache
 * created here is destroyed again.
 */
proto_register(struct proto * prot,int alloc_slab)3937 int proto_register(struct proto *prot, int alloc_slab)
3938 {
	/* Stays -ENOBUFS for all slab-related failures below. */
3939 	int ret = -ENOBUFS;
3940 
3941 	if (prot->memory_allocated && !prot->sysctl_mem) {
3942 		pr_err("%s: missing sysctl_mem\n", prot->name);
3943 		return -EINVAL;
3944 	}
3945 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3946 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3947 		return -EINVAL;
3948 	}
3949 	if (alloc_slab) {
3950 		prot->slab = kmem_cache_create_usercopy(prot->name,
3951 					prot->obj_size, 0,
3952 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3953 					prot->slab_flags,
3954 					prot->useroffset, prot->usersize,
3955 					NULL);
3956 
3957 		if (prot->slab == NULL) {
3958 			pr_crit("%s: Can't create sock SLAB cache!\n",
3959 				prot->name);
3960 			goto out;
3961 		}
3962 
		/*
		 * The *_init() helpers may leave a name allocated on failure;
		 * the matching *_cleanup() reached via the labels frees it.
		 */
3963 		if (req_prot_init(prot))
3964 			goto out_free_request_sock_slab;
3965 
3966 		if (tw_prot_init(prot))
3967 			goto out_free_timewait_sock_slab;
3968 	}
3969 
	/* Index assignment and list insertion happen under proto_list_mutex. */
3970 	mutex_lock(&proto_list_mutex);
3971 	ret = assign_proto_idx(prot);
3972 	if (ret) {
3973 		mutex_unlock(&proto_list_mutex);
3974 		goto out_free_timewait_sock_slab;
3975 	}
3976 	list_add(&prot->node, &proto_list);
3977 	mutex_unlock(&proto_list_mutex);
3978 	return ret;
3979 
	/* Unwind in reverse order of creation; each label falls through. */
3980 out_free_timewait_sock_slab:
3981 	if (alloc_slab)
3982 		tw_prot_cleanup(prot->twsk_prot);
3983 out_free_request_sock_slab:
3984 	if (alloc_slab) {
3985 		req_prot_cleanup(prot->rsk_prot);
3986 
3987 		kmem_cache_destroy(prot->slab);
3988 		prot->slab = NULL;
3989 	}
3990 out:
3991 	return ret;
3992 }
3993 EXPORT_SYMBOL(proto_register);
3994 
proto_unregister(struct proto * prot)3995 void proto_unregister(struct proto *prot)
3996 {
3997 	mutex_lock(&proto_list_mutex);
3998 	release_proto_idx(prot);
3999 	list_del(&prot->node);
4000 	mutex_unlock(&proto_list_mutex);
4001 
4002 	kmem_cache_destroy(prot->slab);
4003 	prot->slab = NULL;
4004 
4005 	req_prot_cleanup(prot->rsk_prot);
4006 	tw_prot_cleanup(prot->twsk_prot);
4007 }
4008 EXPORT_SYMBOL(proto_unregister);
4009 
sock_load_diag_module(int family,int protocol)4010 int sock_load_diag_module(int family, int protocol)
4011 {
4012 	if (!protocol) {
4013 		if (!sock_is_registered(family))
4014 			return -ENOENT;
4015 
4016 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4017 				      NETLINK_SOCK_DIAG, family);
4018 	}
4019 
4020 #ifdef CONFIG_INET
4021 	if (family == AF_INET &&
4022 	    protocol != IPPROTO_RAW &&
4023 	    protocol < MAX_INET_PROTOS &&
4024 	    !rcu_access_pointer(inet_protos[protocol]))
4025 		return -ENOENT;
4026 #endif
4027 
4028 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4029 			      NETLINK_SOCK_DIAG, family, protocol);
4030 }
4031 EXPORT_SYMBOL(sock_load_diag_module);
4032 
4033 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at
 * *pos in proto_list (list head included). The mutex is dropped in
 * proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);

	return seq_list_start_head(&proto_list, *pos);
}
4040 
/* seq_file ->next: advance to the entry after @v in proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
4045 
proto_seq_stop(struct seq_file * seq,void * v)4046 static void proto_seq_stop(struct seq_file *seq, void *v)
4047 	__releases(proto_list_mutex)
4048 {
4049 	mutex_unlock(&proto_list_mutex);
4050 }
4051 
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
sock_prot_memory_allocated(struct proto * proto)4056 static long sock_prot_memory_allocated(struct proto *proto)
4057 {
4058 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4059 }
4060 
sock_prot_memory_pressure(struct proto * proto)4061 static const char *sock_prot_memory_pressure(struct proto *proto)
4062 {
4063 	return proto->memory_pressure != NULL ?
4064 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4065 }
4066 
/*
 * Print one row for @proto: name, object size, sockets in use in the
 * seq_file's network namespace, allocated memory, memory-pressure state,
 * max header, whether a slab cache exists, owning module, and one y/n
 * column per protocol method. The method columns are positional and
 * correspond to the two-letter header codes printed by proto_seq_show().
 */
proto_seq_printf(struct seq_file * seq,struct proto * proto)4067 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4068 {
4069 
4070 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4071 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4072 		   proto->name,
4073 		   proto->obj_size,
4074 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4075 		   sock_prot_memory_allocated(proto),
4076 		   sock_prot_memory_pressure(proto),
4077 		   proto->max_header,
4078 		   proto->slab == NULL ? "no" : "yes",
4079 		   module_name(proto->owner),
		   /* 18 method flags, in header order: cl co di ac io in de sh ss gs se re bi br ha uh gp em */
4080 		   proto_method_implemented(proto->close),
4081 		   proto_method_implemented(proto->connect),
4082 		   proto_method_implemented(proto->disconnect),
4083 		   proto_method_implemented(proto->accept),
4084 		   proto_method_implemented(proto->ioctl),
4085 		   proto_method_implemented(proto->init),
4086 		   proto_method_implemented(proto->destroy),
4087 		   proto_method_implemented(proto->shutdown),
4088 		   proto_method_implemented(proto->setsockopt),
4089 		   proto_method_implemented(proto->getsockopt),
4090 		   proto_method_implemented(proto->sendmsg),
4091 		   proto_method_implemented(proto->recvmsg),
4092 		   proto_method_implemented(proto->bind),
4093 		   proto_method_implemented(proto->backlog_rcv),
4094 		   proto_method_implemented(proto->hash),
4095 		   proto_method_implemented(proto->unhash),
4096 		   proto_method_implemented(proto->get_port),
4097 		   proto_method_implemented(proto->enter_memory_pressure));
4098 }
4099 
proto_seq_show(struct seq_file * seq,void * v)4100 static int proto_seq_show(struct seq_file *seq, void *v)
4101 {
4102 	if (v == &proto_list)
4103 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4104 			   "protocol",
4105 			   "size",
4106 			   "sockets",
4107 			   "memory",
4108 			   "press",
4109 			   "maxhdr",
4110 			   "slab",
4111 			   "module",
4112 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4113 	else
4114 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4115 	return 0;
4116 }
4117 
4118 static const struct seq_operations proto_seq_ops = {
4119 	.start  = proto_seq_start,
4120 	.next   = proto_seq_next,
4121 	.stop   = proto_seq_stop,
4122 	.show   = proto_seq_show,
4123 };
4124 
proto_init_net(struct net * net)4125 static __net_init int proto_init_net(struct net *net)
4126 {
4127 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4128 			sizeof(struct seq_net_private)))
4129 		return -ENOMEM;
4130 
4131 	return 0;
4132 }
4133 
proto_exit_net(struct net * net)4134 static __net_exit void proto_exit_net(struct net *net)
4135 {
4136 	remove_proc_entry("protocols", net->proc_net);
4137 }
4138 
4139 
4140 static __net_initdata struct pernet_operations proto_net_ops = {
4141 	.init = proto_init_net,
4142 	.exit = proto_exit_net,
4143 };
4144 
proto_init(void)4145 static int __init proto_init(void)
4146 {
4147 	return register_pernet_subsys(&proto_net_ops);
4148 }
4149 
4150 subsys_initcall(proto_init);
4151 
4152 #endif /* PROC_FS */
4153 
#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Decide whether a busy-poll loop on socket @p should stop.
 *
 * Returns true as soon as the socket's receive queue is non-empty, or,
 * for UDP sockets, the UDP reader_queue is non-empty. Otherwise the answer
 * is whatever sk_busy_loop_timeout() reports for @start_time.
 * Both queue checks use the lockless emptiness test.
 */
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
	    (sk_is_udp(sk) &&
	     !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)))
		return true;

	return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
4170 
sock_bind_add(struct sock * sk,struct sockaddr * addr,int addr_len)4171 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4172 {
4173 	if (!sk->sk_prot->bind_add)
4174 		return -EOPNOTSUPP;
4175 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4176 }
4177 EXPORT_SYMBOL(sock_bind_add);
4178 
4179 /* Copy 'size' bytes from userspace and return `size` back to userspace */
sock_ioctl_inout(struct sock * sk,unsigned int cmd,void __user * arg,void * karg,size_t size)4180 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4181 		     void __user *arg, void *karg, size_t size)
4182 {
4183 	int ret;
4184 
4185 	if (copy_from_user(karg, arg, size))
4186 		return -EFAULT;
4187 
4188 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4189 	if (ret)
4190 		return ret;
4191 
4192 	if (copy_to_user(arg, karg, size))
4193 		return -EFAULT;
4194 
4195 	return 0;
4196 }
4197 EXPORT_SYMBOL(sock_ioctl_inout);
4198 
4199 /* This is the most common ioctl prep function, where the result (4 bytes) is
4200  * copied back to userspace if the ioctl() returns successfully. No input is
4201  * copied from userspace as input argument.
4202  */
sock_ioctl_out(struct sock * sk,unsigned int cmd,void __user * arg)4203 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4204 {
4205 	int ret, karg = 0;
4206 
4207 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4208 	if (ret)
4209 		return ret;
4210 
4211 	return put_user(karg, (int __user *)arg);
4212 }
4213 
4214 /* A wrapper around sock ioctls, which copies the data from userspace
4215  * (depending on the protocol/ioctl), and copies back the result to userspace.
4216  * The main motivation for this function is to pass kernel memory to the
4217  * protocol ioctl callbacks, instead of userspace memory.
4218  */
sk_ioctl(struct sock * sk,unsigned int cmd,void __user * arg)4219 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4220 {
4221 	int rc = 1;
4222 
4223 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4224 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4225 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4226 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4227 	else if (sk_is_phonet(sk))
4228 		rc = phonet_sk_ioctl(sk, cmd, arg);
4229 
4230 	/* If ioctl was processed, returns its value */
4231 	if (rc <= 0)
4232 		return rc;
4233 
4234 	/* Otherwise call the default handler */
4235 	return sock_ioctl_out(sk, cmd, arg);
4236 }
4237 EXPORT_SYMBOL(sk_ioctl);
4238