1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 #include <trace/hooks/sched.h>
145 #include <trace/hooks/net.h>
146 
147 #include <net/tcp.h>
148 #include <net/busy_poll.h>
149 #include <net/phonet/phonet.h>
150 
151 #include <linux/ethtool.h>
152 
153 #include "dev.h"
154 
155 static DEFINE_MUTEX(proto_list_mutex);
156 static LIST_HEAD(proto_list);
157 
158 static void sock_def_write_space_wfree(struct sock *sk);
159 static void sock_def_write_space(struct sock *sk);
160 
161 /**
162  * sk_ns_capable - General socket capability test
163  * @sk: Socket to use a capability on or through
164  * @user_ns: The user namespace of the capability to use
165  * @cap: The capability to use
166  *
167  * Test to see if the opener of the socket had the capability @cap when
168  * the socket was created and the current process has the capability
169  * @cap in the user namespace @user_ns.
170  */
171 bool sk_ns_capable(const struct sock *sk,
172 		   struct user_namespace *user_ns, int cap)
173 {
174 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 		ns_capable(user_ns, cap);
176 }
177 EXPORT_SYMBOL(sk_ns_capable);
178 
179 /**
180  * sk_capable - Socket global capability test
181  * @sk: Socket to use a capability on or through
182  * @cap: The global capability to use
183  *
184  * Test to see if the opener of the socket had the capability @cap when
185  * the socket was created and the current process has the capability
186  * @cap in all user namespaces.
187  */
188 bool sk_capable(const struct sock *sk, int cap)
189 {
190 	return sk_ns_capable(sk, &init_user_ns, cap);
191 }
192 EXPORT_SYMBOL(sk_capable);
193 
194 /**
195  * sk_net_capable - Network namespace socket capability test
196  * @sk: Socket to use a capability on or through
197  * @cap: The capability to use
198  *
199  * Test to see if the opener of the socket had the capability @cap when the
200  * socket was created and the current process has the capability @cap over
201  * the network namespace the socket is a member of.
202  */
203 bool sk_net_capable(const struct sock *sk, int cap)
204 {
205 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206 }
207 EXPORT_SYMBOL(sk_net_capable);
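
/*
 * Illustrative sketch (not used by this file): a protocol handler that
 * wants to restrict a privileged option would typically gate it with
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * which honours both the credentials the socket's opener had at creation
 * time and the user namespace owning the socket's network namespace.
 */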
208 
209 /*
210  * Each address family might have different locking rules, so we have
211  * one slock key per address family and separate keys for internal and
212  * userspace sockets.
213  */
214 static struct lock_class_key af_family_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_keys[AF_MAX];
216 static struct lock_class_key af_family_slock_keys[AF_MAX];
217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218 
219 /*
220  * Make lock validator output more readable. (we pre-construct these
221  * strings at build time, so that runtime initialization of socket
222  * locks is fast):
223  */
224 
225 #define _sock_locks(x)						  \
226   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
227   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
228   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
229   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
230   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
231   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
232   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
233   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
234   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
235   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
236   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
237   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
238   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
239   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
240   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
241   x "AF_MCTP"  , \
242   x "AF_MAX"
243 
244 static const char *const af_family_key_strings[AF_MAX+1] = {
245 	_sock_locks("sk_lock-")
246 };
247 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
248 	_sock_locks("slock-")
249 };
250 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
251 	_sock_locks("clock-")
252 };
253 
254 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
255 	_sock_locks("k-sk_lock-")
256 };
257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
258 	_sock_locks("k-slock-")
259 };
260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
261 	_sock_locks("k-clock-")
262 };
263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
264 	_sock_locks("rlock-")
265 };
266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
267 	_sock_locks("wlock-")
268 };
269 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
270 	_sock_locks("elock-")
271 };
272 
273 /*
274  * sk_callback_lock and sk queues locking rules are per-address-family,
275  * so split the lock classes by using a per-AF key:
276  */
277 static struct lock_class_key af_callback_keys[AF_MAX];
278 static struct lock_class_key af_rlock_keys[AF_MAX];
279 static struct lock_class_key af_wlock_keys[AF_MAX];
280 static struct lock_class_key af_elock_keys[AF_MAX];
281 static struct lock_class_key af_kern_callback_keys[AF_MAX];
282 
283 /* Run time adjustable parameters. */
284 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
285 EXPORT_SYMBOL(sysctl_wmem_max);
286 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
287 EXPORT_SYMBOL(sysctl_rmem_max);
288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
290 
291 int sysctl_tstamp_allow_data __read_mostly = 1;
292 
293 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
294 EXPORT_SYMBOL_GPL(memalloc_socks_key);
295 
296 /**
297  * sk_set_memalloc - sets %SOCK_MEMALLOC
298  * @sk: socket to set it on
299  *
300  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
301  * It's the responsibility of the admin to adjust min_free_kbytes
302  * to meet the requirements.
303  */
304 void sk_set_memalloc(struct sock *sk)
305 {
306 	sock_set_flag(sk, SOCK_MEMALLOC);
307 	sk->sk_allocation |= __GFP_MEMALLOC;
308 	static_branch_inc(&memalloc_socks_key);
309 }
310 EXPORT_SYMBOL_GPL(sk_set_memalloc);
311 
312 void sk_clear_memalloc(struct sock *sk)
313 {
314 	sock_reset_flag(sk, SOCK_MEMALLOC);
315 	sk->sk_allocation &= ~__GFP_MEMALLOC;
316 	static_branch_dec(&memalloc_socks_key);
317 
318 	/*
319 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
320 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
321 	 * it has rmem allocations due to the last swapfile being deactivated
322 	 * but there is a risk that the socket is unusable due to exceeding
323 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
324 	 */
325 	sk_mem_reclaim(sk);
326 }
327 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
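
/*
 * Illustrative pairing (sketch; assumes a swap-over-network style user,
 * e.g. a network block device backing swap): the transport socket is
 * flagged while it backs swap, allowing it to dip into the emergency
 * reserves, and unflagged once the last swapfile goes away:
 *
 *	sk_set_memalloc(sock->sk);
 *	...
 *	sk_clear_memalloc(sock->sk);
 */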
328 
329 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
330 {
331 	int ret;
332 	unsigned int noreclaim_flag;
333 
334 	/* these should have been dropped before queueing */
335 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
336 
337 	noreclaim_flag = memalloc_noreclaim_save();
338 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
339 				 tcp_v6_do_rcv,
340 				 tcp_v4_do_rcv,
341 				 sk, skb);
342 	memalloc_noreclaim_restore(noreclaim_flag);
343 
344 	return ret;
345 }
346 EXPORT_SYMBOL(__sk_backlog_rcv);
347 
348 void sk_error_report(struct sock *sk)
349 {
350 	sk->sk_error_report(sk);
351 
352 	switch (sk->sk_family) {
353 	case AF_INET:
354 		fallthrough;
355 	case AF_INET6:
356 		trace_inet_sk_error_report(sk);
357 		break;
358 	default:
359 		break;
360 	}
361 }
362 EXPORT_SYMBOL(sk_error_report);
363 
364 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
365 {
366 	struct __kernel_sock_timeval tv;
367 
368 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
369 		tv.tv_sec = 0;
370 		tv.tv_usec = 0;
371 	} else {
372 		tv.tv_sec = timeo / HZ;
373 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
374 	}
375 
376 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
377 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
378 		*(struct old_timeval32 *)optval = tv32;
379 		return sizeof(tv32);
380 	}
381 
382 	if (old_timeval) {
383 		struct __kernel_old_timeval old_tv;
384 		old_tv.tv_sec = tv.tv_sec;
385 		old_tv.tv_usec = tv.tv_usec;
386 		*(struct __kernel_old_timeval *)optval = old_tv;
387 		return sizeof(old_tv);
388 	}
389 
390 	*(struct __kernel_sock_timeval *)optval = tv;
391 	return sizeof(tv);
392 }
393 EXPORT_SYMBOL(sock_get_timeout);
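
/*
 * Worked example (assuming HZ == 1000): a timeout of 2500 jiffies is
 * reported as tv_sec = 2, tv_usec = 500000, while MAX_SCHEDULE_TIMEOUT
 * is reported as an all-zero timeval, meaning "wait forever".
 */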
394 
395 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
396 			   sockptr_t optval, int optlen, bool old_timeval)
397 {
398 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
399 		struct old_timeval32 tv32;
400 
401 		if (optlen < sizeof(tv32))
402 			return -EINVAL;
403 
404 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
405 			return -EFAULT;
406 		tv->tv_sec = tv32.tv_sec;
407 		tv->tv_usec = tv32.tv_usec;
408 	} else if (old_timeval) {
409 		struct __kernel_old_timeval old_tv;
410 
411 		if (optlen < sizeof(old_tv))
412 			return -EINVAL;
413 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
414 			return -EFAULT;
415 		tv->tv_sec = old_tv.tv_sec;
416 		tv->tv_usec = old_tv.tv_usec;
417 	} else {
418 		if (optlen < sizeof(*tv))
419 			return -EINVAL;
420 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
421 			return -EFAULT;
422 	}
423 
424 	return 0;
425 }
426 EXPORT_SYMBOL(sock_copy_user_timeval);
427 
428 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
429 			    bool old_timeval)
430 {
431 	struct __kernel_sock_timeval tv;
432 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
433 	long val;
434 
435 	if (err)
436 		return err;
437 
438 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
439 		return -EDOM;
440 
441 	if (tv.tv_sec < 0) {
442 		static int warned __read_mostly;
443 
444 		WRITE_ONCE(*timeo_p, 0);
445 		if (warned < 10 && net_ratelimit()) {
446 			warned++;
447 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
448 				__func__, current->comm, task_pid_nr(current));
449 		}
450 		return 0;
451 	}
452 	val = MAX_SCHEDULE_TIMEOUT;
453 	if ((tv.tv_sec || tv.tv_usec) &&
454 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
455 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
456 						    USEC_PER_SEC / HZ);
457 	WRITE_ONCE(*timeo_p, val);
458 	return 0;
459 }
460 
461 static bool sock_needs_netstamp(const struct sock *sk)
462 {
463 	switch (sk->sk_family) {
464 	case AF_UNSPEC:
465 	case AF_UNIX:
466 		return false;
467 	default:
468 		return true;
469 	}
470 }
471 
472 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
473 {
474 	if (sk->sk_flags & flags) {
475 		sk->sk_flags &= ~flags;
476 		if (sock_needs_netstamp(sk) &&
477 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
478 			net_disable_timestamp();
479 	}
480 }
481 
482 
483 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
484 {
485 	unsigned long flags;
486 	struct sk_buff_head *list = &sk->sk_receive_queue;
487 
488 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
489 		atomic_inc(&sk->sk_drops);
490 		trace_sock_rcvqueue_full(sk, skb);
491 		return -ENOMEM;
492 	}
493 
494 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
495 		atomic_inc(&sk->sk_drops);
496 		return -ENOBUFS;
497 	}
498 
499 	skb->dev = NULL;
500 	skb_set_owner_r(skb, sk);
501 
502 	/* we escape from the RCU-protected region, make sure we don't leak
503 	 * a norefcounted dst
504 	 */
505 	skb_dst_force(skb);
506 
507 	spin_lock_irqsave(&list->lock, flags);
508 	sock_skb_set_dropcount(sk, skb);
509 	__skb_queue_tail(list, skb);
510 	spin_unlock_irqrestore(&list->lock, flags);
511 
512 	if (!sock_flag(sk, SOCK_DEAD))
513 		sk->sk_data_ready(sk);
514 	return 0;
515 }
516 EXPORT_SYMBOL(__sock_queue_rcv_skb);
517 
518 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
519 			      enum skb_drop_reason *reason)
520 {
521 	enum skb_drop_reason drop_reason;
522 	int err;
523 
524 	err = sk_filter(sk, skb);
525 	if (err) {
526 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
527 		goto out;
528 	}
529 	err = __sock_queue_rcv_skb(sk, skb);
530 	switch (err) {
531 	case -ENOMEM:
532 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
533 		break;
534 	case -ENOBUFS:
535 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
536 		break;
537 	default:
538 		drop_reason = SKB_NOT_DROPPED_YET;
539 		break;
540 	}
541 out:
542 	if (reason)
543 		*reason = drop_reason;
544 	return err;
545 }
546 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
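
/*
 * Illustrative caller sketch (not tied to any one protocol): a receive
 * path that wants precise drop accounting can free the skb with the
 * reason reported above:
 *
 *	enum skb_drop_reason reason;
 *
 *	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
 *		kfree_skb_reason(skb, reason);
 */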
547 
548 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
549 		     const int nested, unsigned int trim_cap, bool refcounted)
550 {
551 	int rc = NET_RX_SUCCESS;
552 
553 	if (sk_filter_trim_cap(sk, skb, trim_cap))
554 		goto discard_and_relse;
555 
556 	skb->dev = NULL;
557 
558 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
559 		atomic_inc(&sk->sk_drops);
560 		goto discard_and_relse;
561 	}
562 	if (nested)
563 		bh_lock_sock_nested(sk);
564 	else
565 		bh_lock_sock(sk);
566 	if (!sock_owned_by_user(sk)) {
567 		/*
568 		 * trylock + unlock semantics:
569 		 */
570 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
571 
572 		rc = sk_backlog_rcv(sk, skb);
573 
574 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
575 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
576 		bh_unlock_sock(sk);
577 		atomic_inc(&sk->sk_drops);
578 		goto discard_and_relse;
579 	}
580 
581 	bh_unlock_sock(sk);
582 out:
583 	if (refcounted)
584 		sock_put(sk);
585 	return rc;
586 discard_and_relse:
587 	kfree_skb(skb);
588 	goto out;
589 }
590 EXPORT_SYMBOL(__sk_receive_skb);
591 
592 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
593 							  u32));
594 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
595 							   u32));
596 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
597 {
598 	struct dst_entry *dst = __sk_dst_get(sk);
599 
600 	if (dst && dst->obsolete &&
601 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
602 			       dst, cookie) == NULL) {
603 		sk_tx_queue_clear(sk);
604 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
605 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
606 		dst_release(dst);
607 		return NULL;
608 	}
609 
610 	return dst;
611 }
612 EXPORT_SYMBOL(__sk_dst_check);
613 
614 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
615 {
616 	struct dst_entry *dst = sk_dst_get(sk);
617 
618 	if (dst && dst->obsolete &&
619 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
620 			       dst, cookie) == NULL) {
621 		sk_dst_reset(sk);
622 		dst_release(dst);
623 		return NULL;
624 	}
625 
626 	return dst;
627 }
628 EXPORT_SYMBOL(sk_dst_check);
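
/*
 * Typical output-path pattern (sketch): revalidate the cached route and
 * fall back to a fresh lookup when the cookie check invalidates it:
 *
 *	dst = sk_dst_check(sk, cookie);
 *	if (!dst) {
 *		... do a protocol-specific route lookup ...
 *		sk_dst_set(sk, dst);
 *	}
 *
 * "cookie" here is whatever per-socket route validity token the protocol
 * keeps (for example the IPv6 destination cache cookie).
 */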
629 
630 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
631 {
632 	int ret = -ENOPROTOOPT;
633 #ifdef CONFIG_NETDEVICES
634 	struct net *net = sock_net(sk);
635 
636 	/* Sorry... */
637 	ret = -EPERM;
638 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
639 		goto out;
640 
641 	ret = -EINVAL;
642 	if (ifindex < 0)
643 		goto out;
644 
645 	/* Paired with all READ_ONCE() done locklessly. */
646 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
647 
648 	if (sk->sk_prot->rehash)
649 		sk->sk_prot->rehash(sk);
650 	sk_dst_reset(sk);
651 
652 	ret = 0;
653 
654 out:
655 #endif
656 
657 	return ret;
658 }
659 
660 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
661 {
662 	int ret;
663 
664 	if (lock_sk)
665 		lock_sock(sk);
666 	ret = sock_bindtoindex_locked(sk, ifindex);
667 	if (lock_sk)
668 		release_sock(sk);
669 
670 	return ret;
671 }
672 EXPORT_SYMBOL(sock_bindtoindex);
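
/*
 * Sketch of an in-kernel caller (e.g. a tunnel or kernel socket user that
 * must pin its traffic to one interface):
 *
 *	err = sock_bindtoindex(sock->sk, dev->ifindex, true);
 *
 * Passing lock_sk == true makes the helper take the socket lock itself;
 * pass false when the caller already holds it.
 */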
673 
674 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
675 {
676 	int ret = -ENOPROTOOPT;
677 #ifdef CONFIG_NETDEVICES
678 	struct net *net = sock_net(sk);
679 	char devname[IFNAMSIZ];
680 	int index;
681 
682 	ret = -EINVAL;
683 	if (optlen < 0)
684 		goto out;
685 
686 	/* Bind this socket to a particular device like "eth0",
687 	 * as specified in the passed interface name. If the
688 	 * name is "" or the option length is zero the socket
689 	 * is not bound.
690 	 */
691 	if (optlen > IFNAMSIZ - 1)
692 		optlen = IFNAMSIZ - 1;
693 	memset(devname, 0, sizeof(devname));
694 
695 	ret = -EFAULT;
696 	if (copy_from_sockptr(devname, optval, optlen))
697 		goto out;
698 
699 	index = 0;
700 	if (devname[0] != '\0') {
701 		struct net_device *dev;
702 
703 		rcu_read_lock();
704 		dev = dev_get_by_name_rcu(net, devname);
705 		if (dev)
706 			index = dev->ifindex;
707 		rcu_read_unlock();
708 		ret = -ENODEV;
709 		if (!dev)
710 			goto out;
711 	}
712 
713 	sockopt_lock_sock(sk);
714 	ret = sock_bindtoindex_locked(sk, index);
715 	sockopt_release_sock(sk);
716 out:
717 #endif
718 
719 	return ret;
720 }
721 
722 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
723 				sockptr_t optlen, int len)
724 {
725 	int ret = -ENOPROTOOPT;
726 #ifdef CONFIG_NETDEVICES
727 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
728 	struct net *net = sock_net(sk);
729 	char devname[IFNAMSIZ];
730 
731 	if (bound_dev_if == 0) {
732 		len = 0;
733 		goto zero;
734 	}
735 
736 	ret = -EINVAL;
737 	if (len < IFNAMSIZ)
738 		goto out;
739 
740 	ret = netdev_get_name(net, devname, bound_dev_if);
741 	if (ret)
742 		goto out;
743 
744 	len = strlen(devname) + 1;
745 
746 	ret = -EFAULT;
747 	if (copy_to_sockptr(optval, devname, len))
748 		goto out;
749 
750 zero:
751 	ret = -EFAULT;
752 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
753 		goto out;
754 
755 	ret = 0;
756 
757 out:
758 #endif
759 
760 	return ret;
761 }
762 
763 bool sk_mc_loop(const struct sock *sk)
764 {
765 	if (dev_recursion_level())
766 		return false;
767 	if (!sk)
768 		return true;
769 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
770 	switch (READ_ONCE(sk->sk_family)) {
771 	case AF_INET:
772 		return inet_test_bit(MC_LOOP, sk);
773 #if IS_ENABLED(CONFIG_IPV6)
774 	case AF_INET6:
775 		return inet6_test_bit(MC6_LOOP, sk);
776 #endif
777 	}
778 	WARN_ON_ONCE(1);
779 	return true;
780 }
781 EXPORT_SYMBOL(sk_mc_loop);
782 
783 void sock_set_reuseaddr(struct sock *sk)
784 {
785 	lock_sock(sk);
786 	sk->sk_reuse = SK_CAN_REUSE;
787 	release_sock(sk);
788 }
789 EXPORT_SYMBOL(sock_set_reuseaddr);
790 
791 void sock_set_reuseport(struct sock *sk)
792 {
793 	lock_sock(sk);
794 	sk->sk_reuseport = true;
795 	release_sock(sk);
796 }
797 EXPORT_SYMBOL(sock_set_reuseport);
798 
799 void sock_no_linger(struct sock *sk)
800 {
801 	lock_sock(sk);
802 	WRITE_ONCE(sk->sk_lingertime, 0);
803 	sock_set_flag(sk, SOCK_LINGER);
804 	release_sock(sk);
805 }
806 EXPORT_SYMBOL(sock_no_linger);
807 
808 void sock_set_priority(struct sock *sk, u32 priority)
809 {
810 	WRITE_ONCE(sk->sk_priority, priority);
811 }
812 EXPORT_SYMBOL(sock_set_priority);
813 
814 void sock_set_sndtimeo(struct sock *sk, s64 secs)
815 {
816 	lock_sock(sk);
817 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
818 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
819 	else
820 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
821 	release_sock(sk);
822 }
823 EXPORT_SYMBOL(sock_set_sndtimeo);
824 
825 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
826 {
827 	if (val)  {
828 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
829 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
830 		sock_set_flag(sk, SOCK_RCVTSTAMP);
831 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
832 	} else {
833 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
834 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
835 	}
836 }
837 
838 void sock_enable_timestamps(struct sock *sk)
839 {
840 	lock_sock(sk);
841 	__sock_set_timestamps(sk, true, false, true);
842 	release_sock(sk);
843 }
844 EXPORT_SYMBOL(sock_enable_timestamps);
845 
846 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
847 {
848 	switch (optname) {
849 	case SO_TIMESTAMP_OLD:
850 		__sock_set_timestamps(sk, valbool, false, false);
851 		break;
852 	case SO_TIMESTAMP_NEW:
853 		__sock_set_timestamps(sk, valbool, true, false);
854 		break;
855 	case SO_TIMESTAMPNS_OLD:
856 		__sock_set_timestamps(sk, valbool, false, true);
857 		break;
858 	case SO_TIMESTAMPNS_NEW:
859 		__sock_set_timestamps(sk, valbool, true, true);
860 		break;
861 	}
862 }
863 
864 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
865 {
866 	struct net *net = sock_net(sk);
867 	struct net_device *dev = NULL;
868 	bool match = false;
869 	int *vclock_index;
870 	int i, num;
871 
872 	if (sk->sk_bound_dev_if)
873 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
874 
875 	if (!dev) {
876 		pr_err("%s: socket not bound to a device\n", __func__);
877 		return -EOPNOTSUPP;
878 	}
879 
880 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
881 	dev_put(dev);
882 
883 	for (i = 0; i < num; i++) {
884 		if (*(vclock_index + i) == phc_index) {
885 			match = true;
886 			break;
887 		}
888 	}
889 
890 	if (num > 0)
891 		kfree(vclock_index);
892 
893 	if (!match)
894 		return -EINVAL;
895 
896 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
897 
898 	return 0;
899 }
900 
901 int sock_set_timestamping(struct sock *sk, int optname,
902 			  struct so_timestamping timestamping)
903 {
904 	int val = timestamping.flags;
905 	int ret;
906 
907 	if (val & ~SOF_TIMESTAMPING_MASK)
908 		return -EINVAL;
909 
910 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
911 	    !(val & SOF_TIMESTAMPING_OPT_ID))
912 		return -EINVAL;
913 
914 	if (val & SOF_TIMESTAMPING_OPT_ID &&
915 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
916 		if (sk_is_tcp(sk)) {
917 			if ((1 << sk->sk_state) &
918 			    (TCPF_CLOSE | TCPF_LISTEN))
919 				return -EINVAL;
920 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
922 			else
923 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
924 		} else {
925 			atomic_set(&sk->sk_tskey, 0);
926 		}
927 	}
928 
929 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
930 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
931 		return -EINVAL;
932 
933 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
934 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
935 		if (ret)
936 			return ret;
937 	}
938 
939 	WRITE_ONCE(sk->sk_tsflags, val);
940 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
941 
942 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
943 		sock_enable_timestamp(sk,
944 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
945 	else
946 		sock_disable_timestamp(sk,
947 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
948 	return 0;
949 }
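
/*
 * Userspace view (illustrative sketch): SOF_TIMESTAMPING_BIND_PHC is
 * requested by passing the extended struct so_timestamping, e.g.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = phc_index,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * where phc_index must name a PHC vclock of the device the socket is
 * bound to, otherwise sock_timestamping_bind_phc() above fails.
 */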
950 
951 void sock_set_keepalive(struct sock *sk)
952 {
953 	lock_sock(sk);
954 	if (sk->sk_prot->keepalive)
955 		sk->sk_prot->keepalive(sk, true);
956 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
957 	release_sock(sk);
958 }
959 EXPORT_SYMBOL(sock_set_keepalive);
960 
961 static void __sock_set_rcvbuf(struct sock *sk, int val)
962 {
963 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
964 	 * as a negative value.
965 	 */
966 	val = min_t(int, val, INT_MAX / 2);
967 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
968 
969 	/* We double it on the way in to account for "struct sk_buff" etc.
970 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
971 	 * will allow that much actual data to be received on that socket.
972 	 *
973 	 * Applications are unaware that "struct sk_buff" and other overheads
974 	 * allocate from the receive buffer during socket buffer allocation.
975 	 *
976 	 * And after considering the possible alternatives, returning the value
977 	 * we actually used in getsockopt is the most desirable behavior.
978 	 */
979 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
980 }
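
/*
 * Worked example: a userspace setsockopt(SO_RCVBUF) request of 65536
 * bytes (assuming it is below net.core.rmem_max) is stored as
 * sk_rcvbuf == 131072 because of the doubling above, and that doubled
 * figure is what getsockopt(SO_RCVBUF) later reports.
 */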
981 
982 void sock_set_rcvbuf(struct sock *sk, int val)
983 {
984 	lock_sock(sk);
985 	__sock_set_rcvbuf(sk, val);
986 	release_sock(sk);
987 }
988 EXPORT_SYMBOL(sock_set_rcvbuf);
989 
990 static void __sock_set_mark(struct sock *sk, u32 val)
991 {
992 	if (val != sk->sk_mark) {
993 		WRITE_ONCE(sk->sk_mark, val);
994 		sk_dst_reset(sk);
995 	}
996 }
997 
998 void sock_set_mark(struct sock *sk, u32 val)
999 {
1000 	lock_sock(sk);
1001 	__sock_set_mark(sk, val);
1002 	release_sock(sk);
1003 }
1004 EXPORT_SYMBOL(sock_set_mark);
1005 
1006 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1007 {
1008 	/* Round down bytes to multiple of pages */
1009 	bytes = round_down(bytes, PAGE_SIZE);
1010 
1011 	WARN_ON(bytes > sk->sk_reserved_mem);
1012 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1013 	sk_mem_reclaim(sk);
1014 }
1015 
1016 static int sock_reserve_memory(struct sock *sk, int bytes)
1017 {
1018 	long allocated;
1019 	bool charged;
1020 	int pages;
1021 
1022 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1023 		return -EOPNOTSUPP;
1024 
1025 	if (!bytes)
1026 		return 0;
1027 
1028 	pages = sk_mem_pages(bytes);
1029 
1030 	/* pre-charge to memcg */
1031 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1032 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1033 	if (!charged)
1034 		return -ENOMEM;
1035 
1036 	/* pre-charge to forward_alloc */
1037 	sk_memory_allocated_add(sk, pages);
1038 	allocated = sk_memory_allocated(sk);
1039 	/* If the system goes into memory pressure with this
1040 	 * precharge, give up and return error.
1041 	 */
1042 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1043 		sk_memory_allocated_sub(sk, pages);
1044 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1045 		return -ENOMEM;
1046 	}
1047 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1048 
1049 	WRITE_ONCE(sk->sk_reserved_mem,
1050 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1051 
1052 	return 0;
1053 }
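
/*
 * Worked example (assuming 4 KiB pages): SO_RESERVE_MEM with val == 10000
 * pre-charges sk_mem_pages(10000) == 3 pages, so forward_alloc grows by
 * 12288 bytes and sk_reserved_mem records the same rounded-up amount.
 */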
1054 
1055 #ifdef CONFIG_PAGE_POOL
1056 
1057 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1058  * in one syscall. The limit exists to bound the amount of memory the kernel
1059  * allocates to copy these tokens, and to prevent looping over the frags for
1060  * too long.
1061  */
1062 #define MAX_DONTNEED_TOKENS 128
1063 #define MAX_DONTNEED_FRAGS 1024
1064 
1065 static noinline_for_stack int
1066 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1067 {
1068 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1069 	struct dmabuf_token *tokens;
1070 	int ret = 0, num_frags = 0;
1071 	netmem_ref netmems[16];
1072 
1073 	if (!sk_is_tcp(sk))
1074 		return -EBADF;
1075 
1076 	if (optlen % sizeof(*tokens) ||
1077 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1078 		return -EINVAL;
1079 
1080 	num_tokens = optlen / sizeof(*tokens);
1081 	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1082 	if (!tokens)
1083 		return -ENOMEM;
1084 
1085 	if (copy_from_sockptr(tokens, optval, optlen)) {
1086 		kvfree(tokens);
1087 		return -EFAULT;
1088 	}
1089 
1090 	xa_lock_bh(&sk->sk_user_frags);
1091 	for (i = 0; i < num_tokens; i++) {
1092 		for (j = 0; j < tokens[i].token_count; j++) {
1093 			if (++num_frags > MAX_DONTNEED_FRAGS)
1094 				goto frag_limit_reached;
1095 
1096 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1097 				&sk->sk_user_frags, tokens[i].token_start + j);
1098 
1099 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1100 				continue;
1101 
1102 			netmems[netmem_num++] = netmem;
1103 			if (netmem_num == ARRAY_SIZE(netmems)) {
1104 				xa_unlock_bh(&sk->sk_user_frags);
1105 				for (k = 0; k < netmem_num; k++)
1106 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1107 				netmem_num = 0;
1108 				xa_lock_bh(&sk->sk_user_frags);
1109 			}
1110 			ret++;
1111 		}
1112 	}
1113 
1114 frag_limit_reached:
1115 	xa_unlock_bh(&sk->sk_user_frags);
1116 	for (k = 0; k < netmem_num; k++)
1117 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1118 
1119 	kvfree(tokens);
1120 	return ret;
1121 }
1122 #endif
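
/*
 * Userspace sketch (illustrative): tokens handed out by the devmem TCP
 * receive interface are returned once their payload has been consumed,
 * for example one token at a time:
 *
 *	struct dmabuf_token tok = {
 *		.token_start = token_id,
 *		.token_count = 1,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &tok, sizeof(tok));
 *
 * The positive return value reflects how many frags were actually
 * released, which can be lower than requested once MAX_DONTNEED_FRAGS is
 * reached.
 */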
1123 
1124 void sockopt_lock_sock(struct sock *sk)
1125 {
1126 	/* When current->bpf_ctx is set, the setsockopt is called from
1127 	 * a bpf prog.  bpf has ensured the sk lock has been
1128 	 * acquired before calling setsockopt().
1129 	 */
1130 	if (has_current_bpf_ctx())
1131 		return;
1132 
1133 	lock_sock(sk);
1134 }
1135 EXPORT_SYMBOL(sockopt_lock_sock);
1136 
1137 void sockopt_release_sock(struct sock *sk)
1138 {
1139 	if (has_current_bpf_ctx())
1140 		return;
1141 
1142 	release_sock(sk);
1143 }
1144 EXPORT_SYMBOL(sockopt_release_sock);
1145 
1146 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1147 {
1148 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1149 }
1150 EXPORT_SYMBOL(sockopt_ns_capable);
1151 
1152 bool sockopt_capable(int cap)
1153 {
1154 	return has_current_bpf_ctx() || capable(cap);
1155 }
1156 EXPORT_SYMBOL(sockopt_capable);
1157 
1158 static int sockopt_validate_clockid(__kernel_clockid_t value)
1159 {
1160 	switch (value) {
1161 	case CLOCK_REALTIME:
1162 	case CLOCK_MONOTONIC:
1163 	case CLOCK_TAI:
1164 		return 0;
1165 	}
1166 	return -EINVAL;
1167 }
1168 
1169 /*
1170  *	This is meant for all protocols to use and covers goings on
1171  *	at the socket level. Everything here is generic.
1172  */
1173 
1174 int sk_setsockopt(struct sock *sk, int level, int optname,
1175 		  sockptr_t optval, unsigned int optlen)
1176 {
1177 	struct so_timestamping timestamping;
1178 	struct socket *sock = sk->sk_socket;
1179 	struct sock_txtime sk_txtime;
1180 	int val;
1181 	int valbool;
1182 	struct linger ling;
1183 	int ret = 0;
1184 
1185 	/*
1186 	 *	Options without arguments
1187 	 */
1188 
1189 	if (optname == SO_BINDTODEVICE)
1190 		return sock_setbindtodevice(sk, optval, optlen);
1191 
1192 	if (optlen < sizeof(int))
1193 		return -EINVAL;
1194 
1195 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1196 		return -EFAULT;
1197 
1198 	valbool = val ? 1 : 0;
1199 
1200 	/* handle options which do not require locking the socket. */
1201 	switch (optname) {
1202 	case SO_PRIORITY:
1203 		if ((val >= 0 && val <= 6) ||
1204 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1205 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1206 			sock_set_priority(sk, val);
1207 			return 0;
1208 		}
1209 		return -EPERM;
1210 	case SO_PASSSEC:
1211 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1212 		return 0;
1213 	case SO_PASSCRED:
1214 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1215 		return 0;
1216 	case SO_PASSPIDFD:
1217 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1218 		return 0;
1219 	case SO_TYPE:
1220 	case SO_PROTOCOL:
1221 	case SO_DOMAIN:
1222 	case SO_ERROR:
1223 		return -ENOPROTOOPT;
1224 #ifdef CONFIG_NET_RX_BUSY_POLL
1225 	case SO_BUSY_POLL:
1226 		if (val < 0)
1227 			return -EINVAL;
1228 		WRITE_ONCE(sk->sk_ll_usec, val);
1229 		return 0;
1230 	case SO_PREFER_BUSY_POLL:
1231 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1232 			return -EPERM;
1233 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1234 		return 0;
1235 	case SO_BUSY_POLL_BUDGET:
1236 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1237 		    !sockopt_capable(CAP_NET_ADMIN))
1238 			return -EPERM;
1239 		if (val < 0 || val > U16_MAX)
1240 			return -EINVAL;
1241 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1242 		return 0;
1243 #endif
1244 	case SO_MAX_PACING_RATE:
1245 		{
1246 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1247 		unsigned long pacing_rate;
1248 
1249 		if (sizeof(ulval) != sizeof(val) &&
1250 		    optlen >= sizeof(ulval) &&
1251 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1252 			return -EFAULT;
1253 		}
1254 		if (ulval != ~0UL)
1255 			cmpxchg(&sk->sk_pacing_status,
1256 				SK_PACING_NONE,
1257 				SK_PACING_NEEDED);
1258 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1259 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1260 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1261 		if (ulval < pacing_rate)
1262 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1263 		return 0;
1264 		}
1265 	case SO_TXREHASH:
1266 		if (val < -1 || val > 1)
1267 			return -EINVAL;
1268 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1269 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1270 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1271 		 * and sk_getsockopt().
1272 		 */
1273 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1274 		return 0;
1275 	case SO_PEEK_OFF:
1276 		{
1277 		int (*set_peek_off)(struct sock *sk, int val);
1278 
1279 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1280 		if (set_peek_off)
1281 			ret = set_peek_off(sk, val);
1282 		else
1283 			ret = -EOPNOTSUPP;
1284 		return ret;
1285 		}
1286 #ifdef CONFIG_PAGE_POOL
1287 	case SO_DEVMEM_DONTNEED:
1288 		return sock_devmem_dontneed(sk, optval, optlen);
1289 #endif
1290 	}
1291 
1292 	sockopt_lock_sock(sk);
1293 
1294 	switch (optname) {
1295 	case SO_DEBUG:
1296 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1297 			ret = -EACCES;
1298 		else
1299 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1300 		break;
1301 	case SO_REUSEADDR:
1302 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1303 		break;
1304 	case SO_REUSEPORT:
1305 		if (valbool && !sk_is_inet(sk))
1306 			ret = -EOPNOTSUPP;
1307 		else
1308 			sk->sk_reuseport = valbool;
1309 		break;
1310 	case SO_DONTROUTE:
1311 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1312 		sk_dst_reset(sk);
1313 		break;
1314 	case SO_BROADCAST:
1315 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1316 		break;
1317 	case SO_SNDBUF:
1318 		/* Don't error on this; BSD doesn't, and if you think
1319 		 * about it, this is right. Otherwise apps have to
1320 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1321 		 * are treated in BSD as hints.
1322 		 */
1323 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1324 set_sndbuf:
1325 		/* Ensure val * 2 fits into an int, to prevent max_t()
1326 		 * from treating it as a negative value.
1327 		 */
1328 		val = min_t(int, val, INT_MAX / 2);
1329 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1330 		WRITE_ONCE(sk->sk_sndbuf,
1331 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1332 		/* Wake up sending tasks if we upped the value. */
1333 		sk->sk_write_space(sk);
1334 		break;
1335 
1336 	case SO_SNDBUFFORCE:
1337 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1338 			ret = -EPERM;
1339 			break;
1340 		}
1341 
1342 		/* No negative values (to prevent underflow, as val will be
1343 		 * multiplied by 2).
1344 		 */
1345 		if (val < 0)
1346 			val = 0;
1347 		goto set_sndbuf;
1348 
1349 	case SO_RCVBUF:
1350 		/* Don't error on this; BSD doesn't, and if you think
1351 		 * about it, this is right. Otherwise apps have to
1352 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1353 		 * are treated in BSD as hints.
1354 		 */
1355 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1356 		break;
1357 
1358 	case SO_RCVBUFFORCE:
1359 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1360 			ret = -EPERM;
1361 			break;
1362 		}
1363 
1364 		/* No negative values (to prevent underflow, as val will be
1365 		 * multiplied by 2).
1366 		 */
1367 		__sock_set_rcvbuf(sk, max(val, 0));
1368 		break;
1369 
1370 	case SO_KEEPALIVE:
1371 		if (sk->sk_prot->keepalive)
1372 			sk->sk_prot->keepalive(sk, valbool);
1373 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1374 		break;
1375 
1376 	case SO_OOBINLINE:
1377 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1378 		break;
1379 
1380 	case SO_NO_CHECK:
1381 		sk->sk_no_check_tx = valbool;
1382 		break;
1383 
1384 	case SO_LINGER:
1385 		if (optlen < sizeof(ling)) {
1386 			ret = -EINVAL;	/* 1003.1g */
1387 			break;
1388 		}
1389 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1390 			ret = -EFAULT;
1391 			break;
1392 		}
1393 		if (!ling.l_onoff) {
1394 			sock_reset_flag(sk, SOCK_LINGER);
1395 		} else {
1396 			unsigned long t_sec = ling.l_linger;
1397 
1398 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1399 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1400 			else
1401 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1402 			sock_set_flag(sk, SOCK_LINGER);
1403 		}
1404 		break;
1405 
1406 	case SO_BSDCOMPAT:
1407 		break;
1408 
1409 	case SO_TIMESTAMP_OLD:
1410 	case SO_TIMESTAMP_NEW:
1411 	case SO_TIMESTAMPNS_OLD:
1412 	case SO_TIMESTAMPNS_NEW:
1413 		sock_set_timestamp(sk, optname, valbool);
1414 		break;
1415 
1416 	case SO_TIMESTAMPING_NEW:
1417 	case SO_TIMESTAMPING_OLD:
1418 		if (optlen == sizeof(timestamping)) {
1419 			if (copy_from_sockptr(&timestamping, optval,
1420 					      sizeof(timestamping))) {
1421 				ret = -EFAULT;
1422 				break;
1423 			}
1424 		} else {
1425 			memset(&timestamping, 0, sizeof(timestamping));
1426 			timestamping.flags = val;
1427 		}
1428 		ret = sock_set_timestamping(sk, optname, timestamping);
1429 		break;
1430 
1431 	case SO_RCVLOWAT:
1432 		{
1433 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1434 
1435 		if (val < 0)
1436 			val = INT_MAX;
1437 		if (sock)
1438 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1439 		if (set_rcvlowat)
1440 			ret = set_rcvlowat(sk, val);
1441 		else
1442 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1443 		break;
1444 		}
1445 	case SO_RCVTIMEO_OLD:
1446 	case SO_RCVTIMEO_NEW:
1447 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1448 				       optlen, optname == SO_RCVTIMEO_OLD);
1449 		break;
1450 
1451 	case SO_SNDTIMEO_OLD:
1452 	case SO_SNDTIMEO_NEW:
1453 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1454 				       optlen, optname == SO_SNDTIMEO_OLD);
1455 		break;
1456 
1457 	case SO_ATTACH_FILTER: {
1458 		struct sock_fprog fprog;
1459 
1460 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1461 		if (!ret)
1462 			ret = sk_attach_filter(&fprog, sk);
1463 		break;
1464 	}
1465 	case SO_ATTACH_BPF:
1466 		ret = -EINVAL;
1467 		if (optlen == sizeof(u32)) {
1468 			u32 ufd;
1469 
1470 			ret = -EFAULT;
1471 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1472 				break;
1473 
1474 			ret = sk_attach_bpf(ufd, sk);
1475 		}
1476 		break;
1477 
1478 	case SO_ATTACH_REUSEPORT_CBPF: {
1479 		struct sock_fprog fprog;
1480 
1481 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1482 		if (!ret)
1483 			ret = sk_reuseport_attach_filter(&fprog, sk);
1484 		break;
1485 	}
1486 	case SO_ATTACH_REUSEPORT_EBPF:
1487 		ret = -EINVAL;
1488 		if (optlen == sizeof(u32)) {
1489 			u32 ufd;
1490 
1491 			ret = -EFAULT;
1492 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1493 				break;
1494 
1495 			ret = sk_reuseport_attach_bpf(ufd, sk);
1496 		}
1497 		break;
1498 
1499 	case SO_DETACH_REUSEPORT_BPF:
1500 		ret = reuseport_detach_prog(sk);
1501 		break;
1502 
1503 	case SO_DETACH_FILTER:
1504 		ret = sk_detach_filter(sk);
1505 		break;
1506 
1507 	case SO_LOCK_FILTER:
1508 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1509 			ret = -EPERM;
1510 		else
1511 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1512 		break;
1513 
1514 	case SO_MARK:
1515 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1516 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1517 			ret = -EPERM;
1518 			break;
1519 		}
1520 
1521 		__sock_set_mark(sk, val);
1522 		break;
1523 	case SO_RCVMARK:
1524 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1525 		break;
1526 
1527 	case SO_RXQ_OVFL:
1528 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1529 		break;
1530 
1531 	case SO_WIFI_STATUS:
1532 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1533 		break;
1534 
1535 	case SO_NOFCS:
1536 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1537 		break;
1538 
1539 	case SO_SELECT_ERR_QUEUE:
1540 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1541 		break;
1542 
1543 
1544 	case SO_INCOMING_CPU:
1545 		reuseport_update_incoming_cpu(sk, val);
1546 		break;
1547 
1548 	case SO_CNX_ADVICE:
1549 		if (val == 1)
1550 			dst_negative_advice(sk);
1551 		break;
1552 
1553 	case SO_ZEROCOPY:
1554 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1555 			if (!(sk_is_tcp(sk) ||
1556 			      (sk->sk_type == SOCK_DGRAM &&
1557 			       sk->sk_protocol == IPPROTO_UDP)))
1558 				ret = -EOPNOTSUPP;
1559 		} else if (sk->sk_family != PF_RDS) {
1560 			ret = -EOPNOTSUPP;
1561 		}
1562 		if (!ret) {
1563 			if (val < 0 || val > 1)
1564 				ret = -EINVAL;
1565 			else
1566 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1567 		}
1568 		break;
1569 
1570 	case SO_TXTIME:
1571 		if (optlen != sizeof(struct sock_txtime)) {
1572 			ret = -EINVAL;
1573 			break;
1574 		} else if (copy_from_sockptr(&sk_txtime, optval,
1575 			   sizeof(struct sock_txtime))) {
1576 			ret = -EFAULT;
1577 			break;
1578 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1579 			ret = -EINVAL;
1580 			break;
1581 		}
1582 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1583 		 * scheduler has enough safeguards.
1584 		 */
1585 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1586 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1587 			ret = -EPERM;
1588 			break;
1589 		}
1590 
1591 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1592 		if (ret)
1593 			break;
1594 
1595 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1596 		sk->sk_clockid = sk_txtime.clockid;
1597 		sk->sk_txtime_deadline_mode =
1598 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1599 		sk->sk_txtime_report_errors =
1600 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1601 		break;
1602 
1603 	case SO_BINDTOIFINDEX:
1604 		ret = sock_bindtoindex_locked(sk, val);
1605 		break;
1606 
1607 	case SO_BUF_LOCK:
1608 		if (val & ~SOCK_BUF_LOCK_MASK) {
1609 			ret = -EINVAL;
1610 			break;
1611 		}
1612 		sk->sk_userlocks = val | (sk->sk_userlocks &
1613 					  ~SOCK_BUF_LOCK_MASK);
1614 		break;
1615 
1616 	case SO_RESERVE_MEM:
1617 	{
1618 		int delta;
1619 
1620 		if (val < 0) {
1621 			ret = -EINVAL;
1622 			break;
1623 		}
1624 
1625 		delta = val - sk->sk_reserved_mem;
1626 		if (delta < 0)
1627 			sock_release_reserved_memory(sk, -delta);
1628 		else
1629 			ret = sock_reserve_memory(sk, delta);
1630 		break;
1631 	}
1632 
1633 	default:
1634 		ret = -ENOPROTOOPT;
1635 		break;
1636 	}
1637 	sockopt_release_sock(sk);
1638 	return ret;
1639 }
1640 
1641 int sock_setsockopt(struct socket *sock, int level, int optname,
1642 		    sockptr_t optval, unsigned int optlen)
1643 {
1644 	return sk_setsockopt(sock->sk, level, optname,
1645 			     optval, optlen);
1646 }
1647 EXPORT_SYMBOL(sock_setsockopt);
1648 
1649 static const struct cred *sk_get_peer_cred(struct sock *sk)
1650 {
1651 	const struct cred *cred;
1652 
1653 	spin_lock(&sk->sk_peer_lock);
1654 	cred = get_cred(sk->sk_peer_cred);
1655 	spin_unlock(&sk->sk_peer_lock);
1656 
1657 	return cred;
1658 }
1659 
1660 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1661 			  struct ucred *ucred)
1662 {
1663 	ucred->pid = pid_vnr(pid);
1664 	ucred->uid = ucred->gid = -1;
1665 	if (cred) {
1666 		struct user_namespace *current_ns = current_user_ns();
1667 
1668 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1669 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1670 	}
1671 }
1672 
1673 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1674 {
1675 	struct user_namespace *user_ns = current_user_ns();
1676 	int i;
1677 
1678 	for (i = 0; i < src->ngroups; i++) {
1679 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1680 
1681 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1682 			return -EFAULT;
1683 	}
1684 
1685 	return 0;
1686 }
1687 
1688 int sk_getsockopt(struct sock *sk, int level, int optname,
1689 		  sockptr_t optval, sockptr_t optlen)
1690 {
1691 	struct socket *sock = sk->sk_socket;
1692 
1693 	union {
1694 		int val;
1695 		u64 val64;
1696 		unsigned long ulval;
1697 		struct linger ling;
1698 		struct old_timeval32 tm32;
1699 		struct __kernel_old_timeval tm;
1700 		struct  __kernel_sock_timeval stm;
1701 		struct sock_txtime txtime;
1702 		struct so_timestamping timestamping;
1703 	} v;
1704 
1705 	int lv = sizeof(int);
1706 	int len;
1707 
1708 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1709 		return -EFAULT;
1710 	if (len < 0)
1711 		return -EINVAL;
1712 
1713 	memset(&v, 0, sizeof(v));
1714 
1715 	switch (optname) {
1716 	case SO_DEBUG:
1717 		v.val = sock_flag(sk, SOCK_DBG);
1718 		break;
1719 
1720 	case SO_DONTROUTE:
1721 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1722 		break;
1723 
1724 	case SO_BROADCAST:
1725 		v.val = sock_flag(sk, SOCK_BROADCAST);
1726 		break;
1727 
1728 	case SO_SNDBUF:
1729 		v.val = READ_ONCE(sk->sk_sndbuf);
1730 		break;
1731 
1732 	case SO_RCVBUF:
1733 		v.val = READ_ONCE(sk->sk_rcvbuf);
1734 		break;
1735 
1736 	case SO_REUSEADDR:
1737 		v.val = sk->sk_reuse;
1738 		break;
1739 
1740 	case SO_REUSEPORT:
1741 		v.val = sk->sk_reuseport;
1742 		break;
1743 
1744 	case SO_KEEPALIVE:
1745 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1746 		break;
1747 
1748 	case SO_TYPE:
1749 		v.val = sk->sk_type;
1750 		break;
1751 
1752 	case SO_PROTOCOL:
1753 		v.val = sk->sk_protocol;
1754 		break;
1755 
1756 	case SO_DOMAIN:
1757 		v.val = sk->sk_family;
1758 		break;
1759 
1760 	case SO_ERROR:
1761 		v.val = -sock_error(sk);
1762 		if (v.val == 0)
1763 			v.val = xchg(&sk->sk_err_soft, 0);
1764 		break;
1765 
1766 	case SO_OOBINLINE:
1767 		v.val = sock_flag(sk, SOCK_URGINLINE);
1768 		break;
1769 
1770 	case SO_NO_CHECK:
1771 		v.val = sk->sk_no_check_tx;
1772 		break;
1773 
1774 	case SO_PRIORITY:
1775 		v.val = READ_ONCE(sk->sk_priority);
1776 		break;
1777 
1778 	case SO_LINGER:
1779 		lv		= sizeof(v.ling);
1780 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1781 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1782 		break;
1783 
1784 	case SO_BSDCOMPAT:
1785 		break;
1786 
1787 	case SO_TIMESTAMP_OLD:
1788 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1789 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1790 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1791 		break;
1792 
1793 	case SO_TIMESTAMPNS_OLD:
1794 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1795 		break;
1796 
1797 	case SO_TIMESTAMP_NEW:
1798 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1799 		break;
1800 
1801 	case SO_TIMESTAMPNS_NEW:
1802 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1803 		break;
1804 
1805 	case SO_TIMESTAMPING_OLD:
1806 	case SO_TIMESTAMPING_NEW:
1807 		lv = sizeof(v.timestamping);
1808 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1809 		 * returning the flags when they were set through the same option.
1810 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1811 		 */
1812 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1813 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1814 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1815 		}
1816 		break;
1817 
1818 	case SO_RCVTIMEO_OLD:
1819 	case SO_RCVTIMEO_NEW:
1820 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1821 				      SO_RCVTIMEO_OLD == optname);
1822 		break;
1823 
1824 	case SO_SNDTIMEO_OLD:
1825 	case SO_SNDTIMEO_NEW:
1826 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1827 				      SO_SNDTIMEO_OLD == optname);
1828 		break;
1829 
1830 	case SO_RCVLOWAT:
1831 		v.val = READ_ONCE(sk->sk_rcvlowat);
1832 		break;
1833 
1834 	case SO_SNDLOWAT:
1835 		v.val = 1;
1836 		break;
1837 
1838 	case SO_PASSCRED:
1839 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1840 		break;
1841 
1842 	case SO_PASSPIDFD:
1843 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1844 		break;
1845 
1846 	case SO_PEERCRED:
1847 	{
1848 		struct ucred peercred;
1849 		if (len > sizeof(peercred))
1850 			len = sizeof(peercred);
1851 
1852 		spin_lock(&sk->sk_peer_lock);
1853 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1854 		spin_unlock(&sk->sk_peer_lock);
1855 
1856 		if (copy_to_sockptr(optval, &peercred, len))
1857 			return -EFAULT;
1858 		goto lenout;
1859 	}
1860 
1861 	case SO_PEERPIDFD:
1862 	{
1863 		struct pid *peer_pid;
1864 		struct file *pidfd_file = NULL;
1865 		int pidfd;
1866 
1867 		if (len > sizeof(pidfd))
1868 			len = sizeof(pidfd);
1869 
1870 		spin_lock(&sk->sk_peer_lock);
1871 		peer_pid = get_pid(sk->sk_peer_pid);
1872 		spin_unlock(&sk->sk_peer_lock);
1873 
1874 		if (!peer_pid)
1875 			return -ENODATA;
1876 
1877 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1878 		put_pid(peer_pid);
1879 		if (pidfd < 0)
1880 			return pidfd;
1881 
1882 		if (copy_to_sockptr(optval, &pidfd, len) ||
1883 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1884 			put_unused_fd(pidfd);
1885 			fput(pidfd_file);
1886 
1887 			return -EFAULT;
1888 		}
1889 
1890 		fd_install(pidfd, pidfd_file);
1891 		return 0;
1892 	}
1893 
1894 	case SO_PEERGROUPS:
1895 	{
1896 		const struct cred *cred;
1897 		int ret, n;
1898 
1899 		cred = sk_get_peer_cred(sk);
1900 		if (!cred)
1901 			return -ENODATA;
1902 
1903 		n = cred->group_info->ngroups;
1904 		if (len < n * sizeof(gid_t)) {
1905 			len = n * sizeof(gid_t);
1906 			put_cred(cred);
1907 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1908 		}
1909 		len = n * sizeof(gid_t);
1910 
1911 		ret = groups_to_user(optval, cred->group_info);
1912 		put_cred(cred);
1913 		if (ret)
1914 			return ret;
1915 		goto lenout;
1916 	}
1917 
1918 	case SO_PEERNAME:
1919 	{
1920 		struct sockaddr_storage address;
1921 
1922 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1923 		if (lv < 0)
1924 			return -ENOTCONN;
1925 		if (lv < len)
1926 			return -EINVAL;
1927 		if (copy_to_sockptr(optval, &address, len))
1928 			return -EFAULT;
1929 		goto lenout;
1930 	}
1931 
1932 	/* Dubious BSD thing... Probably nobody even uses it, but
1933 	 * the UNIX standard wants it for whatever reason... -DaveM
1934 	 */
1935 	case SO_ACCEPTCONN:
1936 		v.val = sk->sk_state == TCP_LISTEN;
1937 		break;
1938 
1939 	case SO_PASSSEC:
1940 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1941 		break;
1942 
1943 	case SO_PEERSEC:
1944 		return security_socket_getpeersec_stream(sock,
1945 							 optval, optlen, len);
1946 
1947 	case SO_MARK:
1948 		v.val = READ_ONCE(sk->sk_mark);
1949 		break;
1950 
1951 	case SO_RCVMARK:
1952 		v.val = sock_flag(sk, SOCK_RCVMARK);
1953 		break;
1954 
1955 	case SO_RXQ_OVFL:
1956 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1957 		break;
1958 
1959 	case SO_WIFI_STATUS:
1960 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1961 		break;
1962 
1963 	case SO_PEEK_OFF:
1964 		if (!READ_ONCE(sock->ops)->set_peek_off)
1965 			return -EOPNOTSUPP;
1966 
1967 		v.val = READ_ONCE(sk->sk_peek_off);
1968 		break;
1969 	case SO_NOFCS:
1970 		v.val = sock_flag(sk, SOCK_NOFCS);
1971 		break;
1972 
1973 	case SO_BINDTODEVICE:
1974 		return sock_getbindtodevice(sk, optval, optlen, len);
1975 
1976 	case SO_GET_FILTER:
1977 		len = sk_get_filter(sk, optval, len);
1978 		if (len < 0)
1979 			return len;
1980 
1981 		goto lenout;
1982 
1983 	case SO_LOCK_FILTER:
1984 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1985 		break;
1986 
1987 	case SO_BPF_EXTENSIONS:
1988 		v.val = bpf_tell_extensions();
1989 		break;
1990 
1991 	case SO_SELECT_ERR_QUEUE:
1992 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1993 		break;
1994 
1995 #ifdef CONFIG_NET_RX_BUSY_POLL
1996 	case SO_BUSY_POLL:
1997 		v.val = READ_ONCE(sk->sk_ll_usec);
1998 		break;
1999 	case SO_PREFER_BUSY_POLL:
2000 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2001 		break;
2002 #endif
2003 
2004 	case SO_MAX_PACING_RATE:
2005 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
2006 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2007 			lv = sizeof(v.ulval);
2008 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2009 		} else {
2010 			/* 32bit version */
2011 			v.val = min_t(unsigned long, ~0U,
2012 				      READ_ONCE(sk->sk_max_pacing_rate));
2013 		}
2014 		break;
2015 
2016 	case SO_INCOMING_CPU:
2017 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2018 		break;
2019 
2020 	case SO_MEMINFO:
2021 	{
2022 		u32 meminfo[SK_MEMINFO_VARS];
2023 
2024 		sk_get_meminfo(sk, meminfo);
2025 
2026 		len = min_t(unsigned int, len, sizeof(meminfo));
2027 		if (copy_to_sockptr(optval, &meminfo, len))
2028 			return -EFAULT;
2029 
2030 		goto lenout;
2031 	}
2032 
2033 #ifdef CONFIG_NET_RX_BUSY_POLL
2034 	case SO_INCOMING_NAPI_ID:
2035 		v.val = READ_ONCE(sk->sk_napi_id);
2036 
2037 		/* aggregate non-NAPI IDs down to 0 */
2038 		if (v.val < MIN_NAPI_ID)
2039 			v.val = 0;
2040 
2041 		break;
2042 #endif
2043 
2044 	case SO_COOKIE:
2045 		lv = sizeof(u64);
2046 		if (len < lv)
2047 			return -EINVAL;
2048 		v.val64 = sock_gen_cookie(sk);
2049 		break;
2050 
2051 	case SO_ZEROCOPY:
2052 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2053 		break;
2054 
2055 	case SO_TXTIME:
2056 		lv = sizeof(v.txtime);
2057 		v.txtime.clockid = sk->sk_clockid;
2058 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2059 				  SOF_TXTIME_DEADLINE_MODE : 0;
2060 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2061 				  SOF_TXTIME_REPORT_ERRORS : 0;
2062 		break;
2063 
2064 	case SO_BINDTOIFINDEX:
2065 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2066 		break;
2067 
2068 	case SO_NETNS_COOKIE:
2069 		lv = sizeof(u64);
2070 		if (len != lv)
2071 			return -EINVAL;
2072 		v.val64 = sock_net(sk)->net_cookie;
2073 		break;
2074 
2075 	case SO_BUF_LOCK:
2076 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2077 		break;
2078 
2079 	case SO_RESERVE_MEM:
2080 		v.val = READ_ONCE(sk->sk_reserved_mem);
2081 		break;
2082 
2083 	case SO_TXREHASH:
2084 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2085 		v.val = READ_ONCE(sk->sk_txrehash);
2086 		break;
2087 
2088 	default:
2089 		/* We implement the SO_SNDLOWAT etc to not be settable
2090 		 * (1003.1g 7).
2091 		 */
2092 		return -ENOPROTOOPT;
2093 	}
2094 
2095 	if (len > lv)
2096 		len = lv;
2097 	if (copy_to_sockptr(optval, &v, len))
2098 		return -EFAULT;
2099 lenout:
2100 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2101 		return -EFAULT;
2102 	return 0;
2103 }
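
/*
 * Userspace usage sketch (illustrative only, not kernel code): reading the
 * peer credentials served by the SO_PEERCRED branch above on a connected
 * AF_UNIX socket `fd` (the variable names are hypothetical):
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%d gid=%d\n", peer.pid, peer.uid, peer.gid);
 */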
2104 
2105 /*
2106  * Initialize an sk_lock.
2107  *
2108  * (We also register the sk_lock with the lock validator.)
2109  */
2110 static inline void sock_lock_init(struct sock *sk)
2111 {
2112 	sk_owner_clear(sk);
2113 
2114 	if (sk->sk_kern_sock)
2115 		sock_lock_init_class_and_name(
2116 			sk,
2117 			af_family_kern_slock_key_strings[sk->sk_family],
2118 			af_family_kern_slock_keys + sk->sk_family,
2119 			af_family_kern_key_strings[sk->sk_family],
2120 			af_family_kern_keys + sk->sk_family);
2121 	else
2122 		sock_lock_init_class_and_name(
2123 			sk,
2124 			af_family_slock_key_strings[sk->sk_family],
2125 			af_family_slock_keys + sk->sk_family,
2126 			af_family_key_strings[sk->sk_family],
2127 			af_family_keys + sk->sk_family);
2128 }
2129 
2130 /*
2131  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2132  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2133  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2134  */
2135 static void sock_copy(struct sock *nsk, const struct sock *osk)
2136 {
2137 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2138 #ifdef CONFIG_SECURITY_NETWORK
2139 	void *sptr = nsk->sk_security;
2140 #endif
2141 
2142 	/* If we move sk_tx_queue_mapping out of the private section,
2143 	 * we must check if sk_tx_queue_clear() is called after
2144 	 * sock_copy() in sk_clone_lock().
2145 	 */
2146 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2147 		     offsetof(struct sock, sk_dontcopy_begin) ||
2148 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2149 		     offsetof(struct sock, sk_dontcopy_end));
2150 
2151 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2152 
2153 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2154 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2155 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2156 
2157 #ifdef CONFIG_SECURITY_NETWORK
2158 	nsk->sk_security = sptr;
2159 	security_sk_clone(osk, nsk);
2160 #endif
2161 }
2162 
2163 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2164 		int family)
2165 {
2166 	struct sock *sk;
2167 	struct kmem_cache *slab;
2168 
2169 	slab = prot->slab;
2170 	if (slab != NULL) {
2171 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2172 		if (!sk)
2173 			return sk;
2174 		if (want_init_on_alloc(priority))
2175 			sk_prot_clear_nulls(sk, prot->obj_size);
2176 	} else
2177 		sk = kmalloc(prot->obj_size, priority);
2178 
2179 	if (sk != NULL) {
2180 		if (security_sk_alloc(sk, family, priority))
2181 			goto out_free;
2182 
2183 		if (!try_module_get(prot->owner))
2184 			goto out_free_sec;
2185 	}
2186 
2187 	return sk;
2188 
2189 out_free_sec:
2190 	security_sk_free(sk);
2191 out_free:
2192 	if (slab != NULL)
2193 		kmem_cache_free(slab, sk);
2194 	else
2195 		kfree(sk);
2196 	return NULL;
2197 }
2198 
2199 static void sk_prot_free(struct proto *prot, struct sock *sk)
2200 {
2201 	struct kmem_cache *slab;
2202 	struct module *owner;
2203 
2204 	owner = prot->owner;
2205 	slab = prot->slab;
2206 
2207 	cgroup_sk_free(&sk->sk_cgrp_data);
2208 	mem_cgroup_sk_free(sk);
2209 	trace_android_vh_sk_free(sk);
2210 	security_sk_free(sk);
2211 
2212 	sk_owner_put(sk);
2213 
2214 	if (slab != NULL)
2215 		kmem_cache_free(slab, sk);
2216 	else
2217 		kfree(sk);
2218 	module_put(owner);
2219 }
2220 
2221 /**
2222  *	sk_alloc - All socket objects are allocated here
2223  *	@net: the applicable net namespace
2224  *	@family: protocol family
2225  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2226  *	@prot: struct proto associated with this new sock instance
2227  *	@kern: is this to be a kernel socket?
2228  */
2229 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2230 		      struct proto *prot, int kern)
2231 {
2232 	struct sock *sk;
2233 
2234 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2235 	if (sk) {
2236 		sk->sk_family = family;
2237 		/*
2238 		 * See comment in struct sock definition to understand
2239 		 * why we need sk_prot_creator -acme
2240 		 */
2241 		sk->sk_prot = sk->sk_prot_creator = prot;
2242 		sk->sk_kern_sock = kern;
2243 		sock_lock_init(sk);
2244 		sk->sk_net_refcnt = kern ? 0 : 1;
2245 		if (likely(sk->sk_net_refcnt)) {
2246 			get_net_track(net, &sk->ns_tracker, priority);
2247 			sock_inuse_add(net, 1);
2248 		} else {
2249 			net_passive_inc(net);
2250 			__netns_tracker_alloc(net, &sk->ns_tracker,
2251 					      false, priority);
2252 		}
2253 
2254 		sock_net_set(sk, net);
2255 		refcount_set(&sk->sk_wmem_alloc, 1);
2256 
2257 		mem_cgroup_sk_alloc(sk);
2258 		trace_android_vh_sk_alloc(sk);
2259 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2260 		sock_update_classid(&sk->sk_cgrp_data);
2261 		sock_update_netprioidx(&sk->sk_cgrp_data);
2262 		sk_tx_queue_clear(sk);
2263 	}
2264 
2265 	return sk;
2266 }
2267 EXPORT_SYMBOL(sk_alloc);
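
/*
 * Usage sketch (assumptions noted, not lifted from a specific caller): a
 * protocol typically pairs sk_alloc() with sock_init_data() and drops the
 * sock with sk_free() once the last write reference is gone. &some_proto
 * stands in for the caller's struct proto instance:
 *
 *	struct sock *sk = sk_alloc(net, PF_INET, GFP_KERNEL, &some_proto, kern);
 *
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	...
 *	sk_free(sk);
 */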
2268 
2269 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2270  * grace period. This is the case for UDP sockets and TCP listeners.
2271  */
2272 static void __sk_destruct(struct rcu_head *head)
2273 {
2274 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2275 	struct net *net = sock_net(sk);
2276 	struct sk_filter *filter;
2277 
2278 	if (sk->sk_destruct)
2279 		sk->sk_destruct(sk);
2280 
2281 	filter = rcu_dereference_check(sk->sk_filter,
2282 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2283 	if (filter) {
2284 		sk_filter_uncharge(sk, filter);
2285 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2286 	}
2287 
2288 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2289 
2290 #ifdef CONFIG_BPF_SYSCALL
2291 	bpf_sk_storage_free(sk);
2292 #endif
2293 
2294 	if (atomic_read(&sk->sk_omem_alloc))
2295 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2296 			 __func__, atomic_read(&sk->sk_omem_alloc));
2297 
2298 	if (sk->sk_frag.page) {
2299 		put_page(sk->sk_frag.page);
2300 		sk->sk_frag.page = NULL;
2301 	}
2302 
2303 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2304 	put_cred(sk->sk_peer_cred);
2305 	put_pid(sk->sk_peer_pid);
2306 
2307 	if (likely(sk->sk_net_refcnt)) {
2308 		put_net_track(net, &sk->ns_tracker);
2309 	} else {
2310 		__netns_tracker_free(net, &sk->ns_tracker, false);
2311 		net_passive_dec(net);
2312 	}
2313 	sk_prot_free(sk->sk_prot_creator, sk);
2314 }
2315 
2316 void sk_net_refcnt_upgrade(struct sock *sk)
2317 {
2318 	struct net *net = sock_net(sk);
2319 
2320 	WARN_ON_ONCE(sk->sk_net_refcnt);
2321 	__netns_tracker_free(net, &sk->ns_tracker, false);
2322 	net_passive_dec(net);
2323 	sk->sk_net_refcnt = 1;
2324 	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2325 	sock_inuse_add(net, 1);
2326 }
2327 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2328 
2329 void sk_destruct(struct sock *sk)
2330 {
2331 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2332 
2333 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2334 		reuseport_detach_sock(sk);
2335 		use_call_rcu = true;
2336 	}
2337 
2338 	if (use_call_rcu)
2339 		call_rcu(&sk->sk_rcu, __sk_destruct);
2340 	else
2341 		__sk_destruct(&sk->sk_rcu);
2342 }
2343 
2344 static void __sk_free(struct sock *sk)
2345 {
2346 	if (likely(sk->sk_net_refcnt))
2347 		sock_inuse_add(sock_net(sk), -1);
2348 
2349 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2350 		sock_diag_broadcast_destroy(sk);
2351 	else
2352 		sk_destruct(sk);
2353 }
2354 
2355 void sk_free(struct sock *sk)
2356 {
2357 	/*
2358 	 * We subtract one from sk_wmem_alloc and can know if
2359 	 * some packets are still in some tx queue.
2360 	 * If not null, sock_wfree() will call __sk_free(sk) later
2361 	 */
2362 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2363 		__sk_free(sk);
2364 }
2365 EXPORT_SYMBOL(sk_free);
2366 
2367 static void sk_init_common(struct sock *sk)
2368 {
2369 	skb_queue_head_init(&sk->sk_receive_queue);
2370 	skb_queue_head_init(&sk->sk_write_queue);
2371 	skb_queue_head_init(&sk->sk_error_queue);
2372 
2373 	rwlock_init(&sk->sk_callback_lock);
2374 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2375 			af_rlock_keys + sk->sk_family,
2376 			af_family_rlock_key_strings[sk->sk_family]);
2377 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2378 			af_wlock_keys + sk->sk_family,
2379 			af_family_wlock_key_strings[sk->sk_family]);
2380 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2381 			af_elock_keys + sk->sk_family,
2382 			af_family_elock_key_strings[sk->sk_family]);
2383 	if (sk->sk_kern_sock)
2384 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2385 			af_kern_callback_keys + sk->sk_family,
2386 			af_family_kern_clock_key_strings[sk->sk_family]);
2387 	else
2388 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2389 			af_callback_keys + sk->sk_family,
2390 			af_family_clock_key_strings[sk->sk_family]);
2391 }
2392 
2393 /**
2394  *	sk_clone_lock - clone a socket, and lock its clone
2395  *	@sk: the socket to clone
2396  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2397  *
2398  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2399  */
2400 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2401 {
2402 	struct proto *prot = READ_ONCE(sk->sk_prot);
2403 	struct sk_filter *filter;
2404 	bool is_charged = true;
2405 	struct sock *newsk;
2406 
2407 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2408 	if (!newsk)
2409 		goto out;
2410 
2411 	sock_copy(newsk, sk);
2412 	trace_android_vh_sk_clone_lock(newsk);
2413 
2414 	newsk->sk_prot_creator = prot;
2415 
2416 	/* SANITY */
2417 	if (likely(newsk->sk_net_refcnt)) {
2418 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2419 		sock_inuse_add(sock_net(newsk), 1);
2420 	} else {
2421 		/* Kernel sockets are not elevating the struct net refcount.
2422 		 * Instead, use a tracker to more easily detect if a layer
2423 		 * is not properly dismantling its kernel sockets at netns
2424 		 * destroy time.
2425 		 */
2426 		net_passive_inc(sock_net(newsk));
2427 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2428 				      false, priority);
2429 	}
2430 	sk_node_init(&newsk->sk_node);
2431 	sock_lock_init(newsk);
2432 	bh_lock_sock(newsk);
2433 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2434 	newsk->sk_backlog.len = 0;
2435 
2436 	atomic_set(&newsk->sk_rmem_alloc, 0);
2437 
2438 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2439 	refcount_set(&newsk->sk_wmem_alloc, 1);
2440 
2441 	atomic_set(&newsk->sk_omem_alloc, 0);
2442 	sk_init_common(newsk);
2443 
2444 	newsk->sk_dst_cache	= NULL;
2445 	newsk->sk_dst_pending_confirm = 0;
2446 	newsk->sk_wmem_queued	= 0;
2447 	newsk->sk_forward_alloc = 0;
2448 	newsk->sk_reserved_mem  = 0;
2449 	atomic_set(&newsk->sk_drops, 0);
2450 	newsk->sk_send_head	= NULL;
2451 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2452 	atomic_set(&newsk->sk_zckey, 0);
2453 
2454 	sock_reset_flag(newsk, SOCK_DONE);
2455 
2456 	/* sk->sk_memcg will be populated at accept() time */
2457 	newsk->sk_memcg = NULL;
2458 
2459 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2460 
2461 	rcu_read_lock();
2462 	filter = rcu_dereference(sk->sk_filter);
2463 	if (filter != NULL)
2464 		/* though it's an empty new sock, the charging may fail
2465 		 * if sysctl_optmem_max was changed between creation of
2466 		 * original socket and cloning
2467 		 */
2468 		is_charged = sk_filter_charge(newsk, filter);
2469 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2470 	rcu_read_unlock();
2471 
2472 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2473 		/* We need to make sure that we don't uncharge the new
2474 		 * socket if we couldn't charge it in the first place
2475 		 * as otherwise we uncharge the parent's filter.
2476 		 */
2477 		if (!is_charged)
2478 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2479 		sk_free_unlock_clone(newsk);
2480 		newsk = NULL;
2481 		goto out;
2482 	}
2483 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2484 
2485 	if (bpf_sk_storage_clone(sk, newsk)) {
2486 		sk_free_unlock_clone(newsk);
2487 		newsk = NULL;
2488 		goto out;
2489 	}
2490 
2491 	/* Clear sk_user_data if parent had the pointer tagged
2492 	 * as not suitable for copying when cloning.
2493 	 */
2494 	if (sk_user_data_is_nocopy(newsk))
2495 		newsk->sk_user_data = NULL;
2496 
2497 	newsk->sk_err	   = 0;
2498 	newsk->sk_err_soft = 0;
2499 	newsk->sk_priority = 0;
2500 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2501 
2502 	/* Before updating sk_refcnt, we must commit prior changes to memory
2503 	 * (Documentation/RCU/rculist_nulls.rst for details)
2504 	 */
2505 	smp_wmb();
2506 	refcount_set(&newsk->sk_refcnt, 2);
2507 
2508 	sk_set_socket(newsk, NULL);
2509 	sk_tx_queue_clear(newsk);
2510 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2511 
2512 	if (newsk->sk_prot->sockets_allocated)
2513 		sk_sockets_allocated_inc(newsk);
2514 
2515 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2516 		net_enable_timestamp();
2517 out:
2518 	return newsk;
2519 }
2520 EXPORT_SYMBOL_GPL(sk_clone_lock);
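
/*
 * Caller-side sketch (hypothetical, GFP_ATOMIC as in typical softirq use):
 * the clone comes back locked by bh_lock_sock(), so the caller must drop
 * that lock on every path, including its own error paths:
 *
 *	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *
 *	if (!newsk)
 *		return NULL;
 *	... protocol specific setup ...
 *	bh_unlock_sock(newsk);
 */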
2521 
2522 void sk_free_unlock_clone(struct sock *sk)
2523 {
2524 	/* It is still a raw copy of the parent, so invalidate the
2525 	 * destructor and do a plain sk_free() */
2526 	sk->sk_destruct = NULL;
2527 	bh_unlock_sock(sk);
2528 	sk_free(sk);
2529 }
2530 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2531 
2532 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2533 {
2534 	bool is_ipv6 = false;
2535 	u32 max_size;
2536 
2537 #if IS_ENABLED(CONFIG_IPV6)
2538 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2539 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2540 #endif
2541 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2542 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2543 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2544 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2545 		max_size = GSO_LEGACY_MAX_SIZE;
2546 
2547 	return max_size - (MAX_TCP_HEADER + 1);
2548 }
2549 
2550 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2551 {
2552 	u32 max_segs = 1;
2553 
2554 	sk->sk_route_caps = dst->dev->features;
2555 	if (sk_is_tcp(sk))
2556 		sk->sk_route_caps |= NETIF_F_GSO;
2557 	if (sk->sk_route_caps & NETIF_F_GSO)
2558 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2559 	if (unlikely(sk->sk_gso_disabled))
2560 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2561 	if (sk_can_gso(sk)) {
2562 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2563 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2564 		} else {
2565 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2566 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2567 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2568 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2569 		}
2570 	}
2571 	sk->sk_gso_max_segs = max_segs;
2572 	sk_dst_set(sk, dst);
2573 }
2574 EXPORT_SYMBOL_GPL(sk_setup_caps);
2575 
2576 /*
2577  *	Simple resource managers for sockets.
2578  */
2579 
2580 
2581 /*
2582  * Write buffer destructor automatically called from kfree_skb.
2583  */
2584 void sock_wfree(struct sk_buff *skb)
2585 {
2586 	struct sock *sk = skb->sk;
2587 	unsigned int len = skb->truesize;
2588 	bool free;
2589 
2590 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2591 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2592 		    sk->sk_write_space == sock_def_write_space) {
2593 			rcu_read_lock();
2594 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2595 			sock_def_write_space_wfree(sk);
2596 			rcu_read_unlock();
2597 			if (unlikely(free))
2598 				__sk_free(sk);
2599 			return;
2600 		}
2601 
2602 		/*
2603 		 * Keep a reference on sk_wmem_alloc; it will be released
2604 		 * after the sk_write_space() call
2605 		 */
2606 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2607 		sk->sk_write_space(sk);
2608 		len = 1;
2609 	}
2610 	/*
2611 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2612 	 * could not do because of in-flight packets
2613 	 */
2614 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2615 		__sk_free(sk);
2616 }
2617 EXPORT_SYMBOL(sock_wfree);
2618 
2619 /* This variant of sock_wfree() is used by TCP,
2620  * since it sets SOCK_USE_WRITE_QUEUE.
2621  */
2622 void __sock_wfree(struct sk_buff *skb)
2623 {
2624 	struct sock *sk = skb->sk;
2625 
2626 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2627 		__sk_free(sk);
2628 }
2629 
2630 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2631 {
2632 	skb_orphan(skb);
2633 	skb->sk = sk;
2634 #ifdef CONFIG_INET
2635 	if (unlikely(!sk_fullsock(sk))) {
2636 		skb->destructor = sock_edemux;
2637 		sock_hold(sk);
2638 		return;
2639 	}
2640 #endif
2641 	skb->destructor = sock_wfree;
2642 	skb_set_hash_from_sk(skb, sk);
2643 	/*
2644 	 * We used to take a refcount on sk, but the following operation
2645 	 * is enough to guarantee sk_free() won't free this sock until
2646 	 * all in-flight packets are completed
2647 	 */
2648 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2649 }
2650 EXPORT_SYMBOL(skb_set_owner_w);
2651 
2652 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2653 {
2654 	/* Drivers depend on in-order delivery for crypto offload,
2655 	 * partial orphan breaks out-of-order-OK logic.
2656 	 */
2657 	if (skb_is_decrypted(skb))
2658 		return false;
2659 
2660 	return (skb->destructor == sock_wfree ||
2661 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2662 }
2663 
2664 /* This helper is used by netem, as it can hold packets in its
2665  * delay queue. We want to allow the owner socket to send more
2666  * packets, as if they were already TX completed by a typical driver.
2667  * But we also want to keep skb->sk set because some packet schedulers
2668  * rely on it (sch_fq for example).
2669  */
2670 void skb_orphan_partial(struct sk_buff *skb)
2671 {
2672 	if (skb_is_tcp_pure_ack(skb))
2673 		return;
2674 
2675 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2676 		return;
2677 
2678 	skb_orphan(skb);
2679 }
2680 EXPORT_SYMBOL(skb_orphan_partial);
2681 
2682 /*
2683  * Read buffer destructor automatically called from kfree_skb.
2684  */
2685 void sock_rfree(struct sk_buff *skb)
2686 {
2687 	struct sock *sk = skb->sk;
2688 	unsigned int len = skb->truesize;
2689 
2690 	atomic_sub(len, &sk->sk_rmem_alloc);
2691 	sk_mem_uncharge(sk, len);
2692 }
2693 EXPORT_SYMBOL(sock_rfree);
2694 
2695 /*
2696  * Buffer destructor for skbs that are not used directly in read or write
2697  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2698  */
2699 void sock_efree(struct sk_buff *skb)
2700 {
2701 	sock_put(skb->sk);
2702 }
2703 EXPORT_SYMBOL(sock_efree);
2704 
2705 /* Buffer destructor for prefetch/receive path where reference count may
2706  * not be held, e.g. for listen sockets.
2707  */
2708 #ifdef CONFIG_INET
2709 void sock_pfree(struct sk_buff *skb)
2710 {
2711 	struct sock *sk = skb->sk;
2712 
2713 	if (!sk_is_refcounted(sk))
2714 		return;
2715 
2716 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2717 		inet_reqsk(sk)->rsk_listener = NULL;
2718 		reqsk_free(inet_reqsk(sk));
2719 		return;
2720 	}
2721 
2722 	sock_gen_put(sk);
2723 }
2724 EXPORT_SYMBOL(sock_pfree);
2725 #endif /* CONFIG_INET */
2726 
2727 kuid_t sock_i_uid(struct sock *sk)
2728 {
2729 	kuid_t uid;
2730 
2731 	read_lock_bh(&sk->sk_callback_lock);
2732 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2733 	read_unlock_bh(&sk->sk_callback_lock);
2734 	return uid;
2735 }
2736 EXPORT_SYMBOL(sock_i_uid);
2737 
2738 unsigned long __sock_i_ino(struct sock *sk)
2739 {
2740 	unsigned long ino;
2741 
2742 	read_lock(&sk->sk_callback_lock);
2743 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2744 	read_unlock(&sk->sk_callback_lock);
2745 	return ino;
2746 }
2747 EXPORT_SYMBOL(__sock_i_ino);
2748 
2749 unsigned long sock_i_ino(struct sock *sk)
2750 {
2751 	unsigned long ino;
2752 
2753 	local_bh_disable();
2754 	ino = __sock_i_ino(sk);
2755 	local_bh_enable();
2756 	return ino;
2757 }
2758 EXPORT_SYMBOL(sock_i_ino);
2759 
2760 /*
2761  * Allocate a skb from the socket's send buffer.
2762  */
2763 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2764 			     gfp_t priority)
2765 {
2766 	if (force ||
2767 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2768 		struct sk_buff *skb = alloc_skb(size, priority);
2769 
2770 		if (skb) {
2771 			skb_set_owner_w(skb, sk);
2772 			return skb;
2773 		}
2774 	}
2775 	return NULL;
2776 }
2777 EXPORT_SYMBOL(sock_wmalloc);
2778 
2779 static void sock_ofree(struct sk_buff *skb)
2780 {
2781 	struct sock *sk = skb->sk;
2782 
2783 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2784 }
2785 
2786 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2787 			     gfp_t priority)
2788 {
2789 	struct sk_buff *skb;
2790 
2791 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2792 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2793 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2794 		return NULL;
2795 
2796 	skb = alloc_skb(size, priority);
2797 	if (!skb)
2798 		return NULL;
2799 
2800 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2801 	skb->sk = sk;
2802 	skb->destructor = sock_ofree;
2803 	return skb;
2804 }
2805 
2806 /*
2807  * Allocate a memory block from the socket's option memory buffer.
2808  */
2809 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2810 {
2811 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2812 
2813 	if ((unsigned int)size <= optmem_max &&
2814 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2815 		void *mem;
2816 		/* First do the add, to avoid the race if kmalloc
2817 		 * might sleep.
2818 		 */
2819 		atomic_add(size, &sk->sk_omem_alloc);
2820 		mem = kmalloc(size, priority);
2821 		if (mem)
2822 			return mem;
2823 		atomic_sub(size, &sk->sk_omem_alloc);
2824 	}
2825 	return NULL;
2826 }
2827 EXPORT_SYMBOL(sock_kmalloc);
2828 
2829 /* Free an option memory block. Note, we actually want the inline
2830  * here as this allows gcc to detect the nullify and fold away the
2831  * condition entirely.
2832  */
2833 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2834 				  const bool nullify)
2835 {
2836 	if (WARN_ON_ONCE(!mem))
2837 		return;
2838 	if (nullify)
2839 		kfree_sensitive(mem);
2840 	else
2841 		kfree(mem);
2842 	atomic_sub(size, &sk->sk_omem_alloc);
2843 }
2844 
2845 void sock_kfree_s(struct sock *sk, void *mem, int size)
2846 {
2847 	__sock_kfree_s(sk, mem, size, false);
2848 }
2849 EXPORT_SYMBOL(sock_kfree_s);
2850 
2851 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2852 {
2853 	__sock_kfree_s(sk, mem, size, true);
2854 }
2855 EXPORT_SYMBOL(sock_kzfree_s);
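
/*
 * Sketch of the expected pairing (length and flags are illustrative): option
 * memory charged by sock_kmalloc() must be released with sock_kfree_s(), or
 * sock_kzfree_s() for sensitive data, using the same size:
 *
 *	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, len);
 */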
2856 
2857 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2858    I think these locks should be removed for datagram sockets.
2859  */
2860 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2861 {
2862 	DEFINE_WAIT(wait);
2863 
2864 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2865 	for (;;) {
2866 		if (!timeo)
2867 			break;
2868 		if (signal_pending(current))
2869 			break;
2870 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2871 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2872 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2873 			break;
2874 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2875 			break;
2876 		if (READ_ONCE(sk->sk_err))
2877 			break;
2878 		timeo = schedule_timeout(timeo);
2879 	}
2880 	finish_wait(sk_sleep(sk), &wait);
2881 	return timeo;
2882 }
2883 
2884 
2885 /*
2886  *	Generic send/receive buffer handlers
2887  */
2888 
2889 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2890 				     unsigned long data_len, int noblock,
2891 				     int *errcode, int max_page_order)
2892 {
2893 	struct sk_buff *skb;
2894 	long timeo;
2895 	int err;
2896 
2897 	timeo = sock_sndtimeo(sk, noblock);
2898 	for (;;) {
2899 		err = sock_error(sk);
2900 		if (err != 0)
2901 			goto failure;
2902 
2903 		err = -EPIPE;
2904 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2905 			goto failure;
2906 
2907 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2908 			break;
2909 
2910 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2911 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2912 		err = -EAGAIN;
2913 		if (!timeo)
2914 			goto failure;
2915 		if (signal_pending(current))
2916 			goto interrupted;
2917 		timeo = sock_wait_for_wmem(sk, timeo);
2918 	}
2919 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2920 				   errcode, sk->sk_allocation);
2921 	if (skb)
2922 		skb_set_owner_w(skb, sk);
2923 	return skb;
2924 
2925 interrupted:
2926 	err = sock_intr_errno(timeo);
2927 failure:
2928 	*errcode = err;
2929 	return NULL;
2930 }
2931 EXPORT_SYMBOL(sock_alloc_send_pskb);
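
/*
 * Usage sketch (hlen/len and the error handling are illustrative): a datagram
 * sendmsg() implementation can let this helper wait for send-buffer space and
 * allocate a linear skb:
 *
 *	int err;
 *	struct sk_buff *skb;
 *
 *	skb = sock_alloc_send_pskb(sk, hlen + len, 0,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		return err;
 */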
2932 
2933 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2934 		     struct sockcm_cookie *sockc)
2935 {
2936 	u32 tsflags;
2937 
2938 	switch (cmsg->cmsg_type) {
2939 	case SO_MARK:
2940 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2941 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2942 			return -EPERM;
2943 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2944 			return -EINVAL;
2945 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2946 		break;
2947 	case SO_TIMESTAMPING_OLD:
2948 	case SO_TIMESTAMPING_NEW:
2949 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2950 			return -EINVAL;
2951 
2952 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2953 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2954 			return -EINVAL;
2955 
2956 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2957 		sockc->tsflags |= tsflags;
2958 		break;
2959 	case SCM_TXTIME:
2960 		if (!sock_flag(sk, SOCK_TXTIME))
2961 			return -EINVAL;
2962 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2963 			return -EINVAL;
2964 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2965 		break;
2966 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2967 	case SCM_RIGHTS:
2968 	case SCM_CREDENTIALS:
2969 		break;
2970 	default:
2971 		return -EINVAL;
2972 	}
2973 	return 0;
2974 }
2975 EXPORT_SYMBOL(__sock_cmsg_send);
2976 
2977 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2978 		   struct sockcm_cookie *sockc)
2979 {
2980 	struct cmsghdr *cmsg;
2981 	int ret;
2982 
2983 	for_each_cmsghdr(cmsg, msg) {
2984 		if (!CMSG_OK(msg, cmsg))
2985 			return -EINVAL;
2986 		if (cmsg->cmsg_level != SOL_SOCKET)
2987 			continue;
2988 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2989 		if (ret)
2990 			return ret;
2991 	}
2992 	return 0;
2993 }
2994 EXPORT_SYMBOL(sock_cmsg_send);
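
/*
 * Sketch of a typical sendmsg() caller (the cookie initialisation shown is an
 * assumption, not lifted from a specific protocol): SOL_SOCKET control
 * messages are folded into a sockcm_cookie before packets are built:
 *
 *	struct sockcm_cookie sockc = { .tsflags = READ_ONCE(sk->sk_tsflags) };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */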
2995 
2996 static void sk_enter_memory_pressure(struct sock *sk)
2997 {
2998 	if (!sk->sk_prot->enter_memory_pressure)
2999 		return;
3000 
3001 	sk->sk_prot->enter_memory_pressure(sk);
3002 }
3003 
3004 static void sk_leave_memory_pressure(struct sock *sk)
3005 {
3006 	if (sk->sk_prot->leave_memory_pressure) {
3007 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3008 				     tcp_leave_memory_pressure, sk);
3009 	} else {
3010 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3011 
3012 		if (memory_pressure && READ_ONCE(*memory_pressure))
3013 			WRITE_ONCE(*memory_pressure, 0);
3014 	}
3015 }
3016 
3017 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3018 
3019 /**
3020  * skb_page_frag_refill - check that a page_frag contains enough room
3021  * @sz: minimum size of the fragment we want to get
3022  * @pfrag: pointer to page_frag
3023  * @gfp: priority for memory allocation
3024  *
3025  * Note: While this allocator tries to use high order pages, there is
3026  * no guarantee that allocations succeed. Therefore, @sz MUST be
3027  * less than or equal to PAGE_SIZE.
3028  */
3029 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3030 {
3031 	if (pfrag->page) {
3032 		if (page_ref_count(pfrag->page) == 1) {
3033 			pfrag->offset = 0;
3034 			return true;
3035 		}
3036 		if (pfrag->offset + sz <= pfrag->size)
3037 			return true;
3038 		put_page(pfrag->page);
3039 	}
3040 
3041 	pfrag->offset = 0;
3042 	if (SKB_FRAG_PAGE_ORDER &&
3043 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3044 		/* Avoid direct reclaim but allow kswapd to wake */
3045 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3046 					  __GFP_COMP | __GFP_NOWARN |
3047 					  __GFP_NORETRY,
3048 					  SKB_FRAG_PAGE_ORDER);
3049 		if (likely(pfrag->page)) {
3050 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3051 			return true;
3052 		}
3053 	}
3054 	pfrag->page = alloc_page(gfp);
3055 	if (likely(pfrag->page)) {
3056 		pfrag->size = PAGE_SIZE;
3057 		return true;
3058 	}
3059 	return false;
3060 }
3061 EXPORT_SYMBOL(skb_page_frag_refill);
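
/*
 * Usage sketch (copy size, gfp flags and the data pointer are illustrative):
 * once room is guaranteed, a caller can append up to @sz bytes into the
 * fragment and advance the offset:
 *
 *	if (!skb_page_frag_refill(sz, pfrag, GFP_KERNEL))
 *		return -ENOMEM;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, sz);
 *	pfrag->offset += sz;
 */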
3062 
3063 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3064 {
3065 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3066 		return true;
3067 
3068 	sk_enter_memory_pressure(sk);
3069 	sk_stream_moderate_sndbuf(sk);
3070 	return false;
3071 }
3072 EXPORT_SYMBOL(sk_page_frag_refill);
3073 
3074 void __lock_sock(struct sock *sk)
3075 	__releases(&sk->sk_lock.slock)
3076 	__acquires(&sk->sk_lock.slock)
3077 {
3078 	DEFINE_WAIT(wait);
3079 
3080 	for (;;) {
3081 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3082 					TASK_UNINTERRUPTIBLE);
3083 		spin_unlock_bh(&sk->sk_lock.slock);
3084 		schedule();
3085 		spin_lock_bh(&sk->sk_lock.slock);
3086 		if (!sock_owned_by_user(sk))
3087 			break;
3088 	}
3089 	finish_wait(&sk->sk_lock.wq, &wait);
3090 }
3091 
3092 void __release_sock(struct sock *sk)
3093 	__releases(&sk->sk_lock.slock)
3094 	__acquires(&sk->sk_lock.slock)
3095 {
3096 	struct sk_buff *skb, *next;
3097 
3098 	while ((skb = sk->sk_backlog.head) != NULL) {
3099 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3100 
3101 		spin_unlock_bh(&sk->sk_lock.slock);
3102 
3103 		do {
3104 			next = skb->next;
3105 			prefetch(next);
3106 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3107 			skb_mark_not_on_list(skb);
3108 			sk_backlog_rcv(sk, skb);
3109 
3110 			cond_resched();
3111 
3112 			skb = next;
3113 		} while (skb != NULL);
3114 
3115 		spin_lock_bh(&sk->sk_lock.slock);
3116 	}
3117 
3118 	/*
3119 	 * Doing the zeroing here guarantees we cannot loop forever
3120 	 * while a wild producer attempts to flood us.
3121 	 */
3122 	sk->sk_backlog.len = 0;
3123 }
3124 
3125 void __sk_flush_backlog(struct sock *sk)
3126 {
3127 	spin_lock_bh(&sk->sk_lock.slock);
3128 	__release_sock(sk);
3129 
3130 	if (sk->sk_prot->release_cb)
3131 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3132 				     tcp_release_cb, sk);
3133 
3134 	spin_unlock_bh(&sk->sk_lock.slock);
3135 }
3136 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3137 
3138 /**
3139  * sk_wait_data - wait for data to arrive at sk_receive_queue
3140  * @sk:    sock to wait on
3141  * @timeo: for how long
3142  * @skb:   last skb seen on sk_receive_queue
3143  *
3144  * Now socket state including sk->sk_err is changed only under lock,
3145  * hence we may omit checks after joining wait queue.
3146  * We check receive queue before schedule() only as optimization;
3147  * it is very likely that release_sock() added new data.
3148  */
3149 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3150 {
3151 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3152 	int rc;
3153 
3154 	add_wait_queue(sk_sleep(sk), &wait);
3155 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3156 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3157 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3158 	remove_wait_queue(sk_sleep(sk), &wait);
3159 	return rc;
3160 }
3161 EXPORT_SYMBOL(sk_wait_data);
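
/*
 * Usage sketch (error handling trimmed, names illustrative): a blocking
 * recvmsg() loop running under lock_sock() waits until the tail of
 * sk_receive_queue changes:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */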
3162 
3163 /**
3164  *	__sk_mem_raise_allocated - increase memory_allocated
3165  *	@sk: socket
3166  *	@size: memory size to allocate
3167  *	@amt: pages to allocate
3168  *	@kind: allocation type
3169  *
3170  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3171  *
3172  *	Unlike the globally shared limits among the sockets under same protocol,
3173  *	consuming the budget of a memcg won't have direct effect on other ones.
3174  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3175  *	whether or not to raise allocated through sk_under_memory_pressure() or
3176  *	its variants.
3177  */
3178 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3179 {
3180 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3181 	struct proto *prot = sk->sk_prot;
3182 	bool charged = true;
3183 	long allocated;
3184 
3185 	sk_memory_allocated_add(sk, amt);
3186 	allocated = sk_memory_allocated(sk);
3187 
3188 	if (memcg) {
3189 		charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge());
3190 		if (!charged)
3191 			goto suppress_allocation;
3192 	}
3193 
3194 	/* Under limit. */
3195 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3196 		sk_leave_memory_pressure(sk);
3197 		return 1;
3198 	}
3199 
3200 	/* Under pressure. */
3201 	if (allocated > sk_prot_mem_limits(sk, 1))
3202 		sk_enter_memory_pressure(sk);
3203 
3204 	/* Over hard limit. */
3205 	if (allocated > sk_prot_mem_limits(sk, 2))
3206 		goto suppress_allocation;
3207 
3208 	/* Guarantee minimum buffer size under pressure (either global
3209 	 * or memcg) to make sure features described in RFC 7323 (TCP
3210 	 * Extensions for High Performance) work properly.
3211 	 *
3212 	 * This rule does NOT stand when usage exceeds the global or memcg hard
3213 	 * limit, or else a DoS attack could take place by spawning
3214 	 * lots of sockets whose usage is under the minimum buffer size.
3215 	 */
3216 	if (kind == SK_MEM_RECV) {
3217 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3218 			return 1;
3219 
3220 	} else { /* SK_MEM_SEND */
3221 		int wmem0 = sk_get_wmem0(sk, prot);
3222 
3223 		if (sk->sk_type == SOCK_STREAM) {
3224 			if (sk->sk_wmem_queued < wmem0)
3225 				return 1;
3226 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3227 				return 1;
3228 		}
3229 	}
3230 
3231 	if (sk_has_memory_pressure(sk)) {
3232 		u64 alloc;
3233 
3234 		/* The following 'average' heuristic is within the
3235 		 * scope of global accounting, so it only makes
3236 		 * sense for global memory pressure.
3237 		 */
3238 		if (!sk_under_global_memory_pressure(sk))
3239 			return 1;
3240 
3241 		/* Try to be fair among all the sockets under global
3242 		 * pressure by allowing the ones whose usage is below
3243 		 * average to raise their allocation.
3244 		 */
3245 		alloc = sk_sockets_allocated_read_positive(sk);
3246 		if (sk_prot_mem_limits(sk, 2) > alloc *
3247 		    sk_mem_pages(sk->sk_wmem_queued +
3248 				 atomic_read(&sk->sk_rmem_alloc) +
3249 				 sk->sk_forward_alloc))
3250 			return 1;
3251 	}
3252 
3253 suppress_allocation:
3254 
3255 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3256 		sk_stream_moderate_sndbuf(sk);
3257 
3258 		/* Fail only if socket is _under_ its sndbuf.
3259 		 * In this case we cannot block, so we have to fail.
3260 		 */
3261 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3262 			/* Force charge with __GFP_NOFAIL */
3263 			if (memcg && !charged) {
3264 				mem_cgroup_charge_skmem(memcg, amt,
3265 					gfp_memcg_charge() | __GFP_NOFAIL);
3266 			}
3267 			return 1;
3268 		}
3269 	}
3270 
3271 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3272 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3273 
3274 	sk_memory_allocated_sub(sk, amt);
3275 
3276 	if (memcg && charged)
3277 		mem_cgroup_uncharge_skmem(memcg, amt);
3278 
3279 	return 0;
3280 }
3281 
3282 /**
3283  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3284  *	@sk: socket
3285  *	@size: memory size to allocate
3286  *	@kind: allocation type
3287  *
3288  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3289  *	rmem allocation. This function assumes that protocols which have
3290  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3291  */
3292 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3293 {
3294 	int ret, amt = sk_mem_pages(size);
3295 
3296 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3297 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3298 	if (!ret)
3299 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3300 	return ret;
3301 }
3302 EXPORT_SYMBOL(__sk_mem_schedule);
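
/*
 * Accounting sketch (a simplified pairing, sizes illustrative): both helpers
 * work in PAGE_SIZE quanta, so a charge and its eventual reclaim round the
 * same way:
 *
 *	if (!__sk_mem_schedule(sk, size, SK_MEM_RECV))
 *		return -ENOBUFS;
 *	...
 *	__sk_mem_reclaim(sk, size);	(rounded down to whole pages)
 */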
3303 
3304 /**
3305  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3306  *	@sk: socket
3307  *	@amount: number of quanta
3308  *
3309  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3310  */
3311 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3312 {
3313 	sk_memory_allocated_sub(sk, amount);
3314 
3315 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3316 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3317 
3318 	if (sk_under_global_memory_pressure(sk) &&
3319 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3320 		sk_leave_memory_pressure(sk);
3321 }
3322 
3323 /**
3324  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3325  *	@sk: socket
3326  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3327  */
3328 void __sk_mem_reclaim(struct sock *sk, int amount)
3329 {
3330 	amount >>= PAGE_SHIFT;
3331 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3332 	__sk_mem_reduce_allocated(sk, amount);
3333 }
3334 EXPORT_SYMBOL(__sk_mem_reclaim);
3335 
3336 int sk_set_peek_off(struct sock *sk, int val)
3337 {
3338 	WRITE_ONCE(sk->sk_peek_off, val);
3339 	return 0;
3340 }
3341 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3342 
3343 /*
3344  * Set of default routines for initialising struct proto_ops when
3345  * the protocol does not support a particular function. In certain
3346  * cases where it makes no sense for a protocol to have a "do nothing"
3347  * function, some default processing is provided.
3348  */
3349 
3350 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3351 {
3352 	return -EOPNOTSUPP;
3353 }
3354 EXPORT_SYMBOL(sock_no_bind);
3355 
3356 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3357 		    int len, int flags)
3358 {
3359 	return -EOPNOTSUPP;
3360 }
3361 EXPORT_SYMBOL(sock_no_connect);
3362 
3363 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3364 {
3365 	return -EOPNOTSUPP;
3366 }
3367 EXPORT_SYMBOL(sock_no_socketpair);
3368 
3369 int sock_no_accept(struct socket *sock, struct socket *newsock,
3370 		   struct proto_accept_arg *arg)
3371 {
3372 	return -EOPNOTSUPP;
3373 }
3374 EXPORT_SYMBOL(sock_no_accept);
3375 
3376 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3377 		    int peer)
3378 {
3379 	return -EOPNOTSUPP;
3380 }
3381 EXPORT_SYMBOL(sock_no_getname);
3382 
3383 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3384 {
3385 	return -EOPNOTSUPP;
3386 }
3387 EXPORT_SYMBOL(sock_no_ioctl);
3388 
3389 int sock_no_listen(struct socket *sock, int backlog)
3390 {
3391 	return -EOPNOTSUPP;
3392 }
3393 EXPORT_SYMBOL(sock_no_listen);
3394 
3395 int sock_no_shutdown(struct socket *sock, int how)
3396 {
3397 	return -EOPNOTSUPP;
3398 }
3399 EXPORT_SYMBOL(sock_no_shutdown);
3400 
3401 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3402 {
3403 	return -EOPNOTSUPP;
3404 }
3405 EXPORT_SYMBOL(sock_no_sendmsg);
3406 
3407 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3408 {
3409 	return -EOPNOTSUPP;
3410 }
3411 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3412 
3413 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3414 		    int flags)
3415 {
3416 	return -EOPNOTSUPP;
3417 }
3418 EXPORT_SYMBOL(sock_no_recvmsg);
3419 
3420 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3421 {
3422 	/* Mirror missing mmap method error code */
3423 	return -ENODEV;
3424 }
3425 EXPORT_SYMBOL(sock_no_mmap);
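
/*
 * Wiring sketch (hypothetical protocol, field subset only): a proto_ops table
 * plugs these stubs in for operations the protocol does not support.
 * PF_EXAMPLE and example_ops are placeholders, not real identifiers:
 *
 *	static const struct proto_ops example_ops = {
 *		.family	= PF_EXAMPLE,
 *		.listen	= sock_no_listen,
 *		.accept	= sock_no_accept,
 *		.mmap	= sock_no_mmap,
 *	};
 */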
3426 
3427 /*
3428  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3429  * various sock-based usage counts.
3430  */
3431 void __receive_sock(struct file *file)
3432 {
3433 	struct socket *sock;
3434 
3435 	sock = sock_from_file(file);
3436 	if (sock) {
3437 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3438 		sock_update_classid(&sock->sk->sk_cgrp_data);
3439 		trace_android_vh_receive_sock(sock->sk);
3440 	}
3441 }
3442 
3443 /*
3444  *	Default Socket Callbacks
3445  */
3446 
3447 static void sock_def_wakeup(struct sock *sk)
3448 {
3449 	struct socket_wq *wq;
3450 
3451 	rcu_read_lock();
3452 	wq = rcu_dereference(sk->sk_wq);
3453 	if (skwq_has_sleeper(wq))
3454 		wake_up_interruptible_all(&wq->wait);
3455 	rcu_read_unlock();
3456 }
3457 
3458 static void sock_def_error_report(struct sock *sk)
3459 {
3460 	struct socket_wq *wq;
3461 
3462 	rcu_read_lock();
3463 	wq = rcu_dereference(sk->sk_wq);
3464 	if (skwq_has_sleeper(wq))
3465 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3466 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3467 	rcu_read_unlock();
3468 }
3469 
3470 void sock_def_readable(struct sock *sk)
3471 {
3472 	struct socket_wq *wq;
3473 
3474 	trace_sk_data_ready(sk);
3475 
3476 	rcu_read_lock();
3477 	wq = rcu_dereference(sk->sk_wq);
3478 
3479 	if (skwq_has_sleeper(wq)) {
3480 		int done = 0;
3481 
3482 		trace_android_vh_do_wake_up_sync(&wq->wait, &done, sk);
3483 		if (done)
3484 			goto out;
3485 
3486 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3487 						EPOLLRDNORM | EPOLLRDBAND);
3488 	}
3489 
3490 out:
3491 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3492 	rcu_read_unlock();
3493 }
3494 
3495 static void sock_def_write_space(struct sock *sk)
3496 {
3497 	struct socket_wq *wq;
3498 
3499 	rcu_read_lock();
3500 
3501 	/* Do not wake up a writer until he can make "significant"
3502 	 * progress.  --DaveM
3503 	 */
3504 	if (sock_writeable(sk)) {
3505 		wq = rcu_dereference(sk->sk_wq);
3506 		if (skwq_has_sleeper(wq))
3507 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3508 						EPOLLWRNORM | EPOLLWRBAND);
3509 
3510 		/* Should agree with poll, otherwise some programs break */
3511 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3512 	}
3513 
3514 	rcu_read_unlock();
3515 }
3516 
3517 /* An optimised version of sock_def_write_space(), should only be called
3518  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3519  * ->sk_wmem_alloc.
3520  */
3521 static void sock_def_write_space_wfree(struct sock *sk)
3522 {
3523 	/* Do not wake up a writer until he can make "significant"
3524 	 * progress.  --DaveM
3525 	 */
3526 	if (sock_writeable(sk)) {
3527 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3528 
3529 		/* rely on refcount_sub from sock_wfree() */
3530 		smp_mb__after_atomic();
3531 		if (wq && waitqueue_active(&wq->wait))
3532 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3533 						EPOLLWRNORM | EPOLLWRBAND);
3534 
3535 		/* Should agree with poll, otherwise some programs break */
3536 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3537 	}
3538 }
3539 
3540 static void sock_def_destruct(struct sock *sk)
3541 {
3542 }
3543 
3544 void sk_send_sigurg(struct sock *sk)
3545 {
3546 	if (sk->sk_socket && sk->sk_socket->file)
3547 		if (send_sigurg(sk->sk_socket->file))
3548 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3549 }
3550 EXPORT_SYMBOL(sk_send_sigurg);
3551 
3552 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3553 		    unsigned long expires)
3554 {
3555 	if (!mod_timer(timer, expires))
3556 		sock_hold(sk);
3557 }
3558 EXPORT_SYMBOL(sk_reset_timer);
3559 
3560 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3561 {
3562 	if (del_timer(timer))
3563 		__sock_put(sk);
3564 }
3565 EXPORT_SYMBOL(sk_stop_timer);
3566 
3567 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3568 {
3569 	if (del_timer_sync(timer))
3570 		__sock_put(sk);
3571 }
3572 EXPORT_SYMBOL(sk_stop_timer_sync);
3573 
3574 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3575 {
3576 	sk_init_common(sk);
3577 	sk->sk_send_head	=	NULL;
3578 
3579 	timer_setup(&sk->sk_timer, NULL, 0);
3580 
3581 	sk->sk_allocation	=	GFP_KERNEL;
3582 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3583 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3584 	sk->sk_state		=	TCP_CLOSE;
3585 	sk->sk_use_task_frag	=	true;
3586 	sk_set_socket(sk, sock);
3587 
3588 	sock_set_flag(sk, SOCK_ZAPPED);
3589 
3590 	if (sock) {
3591 		sk->sk_type	=	sock->type;
3592 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3593 		sock->sk	=	sk;
3594 	} else {
3595 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3596 	}
3597 	sk->sk_uid	=	uid;
3598 
3599 	sk->sk_state_change	=	sock_def_wakeup;
3600 	sk->sk_data_ready	=	sock_def_readable;
3601 	sk->sk_write_space	=	sock_def_write_space;
3602 	sk->sk_error_report	=	sock_def_error_report;
3603 	sk->sk_destruct		=	sock_def_destruct;
3604 
3605 	sk->sk_frag.page	=	NULL;
3606 	sk->sk_frag.offset	=	0;
3607 	sk->sk_peek_off		=	-1;
3608 
3609 	sk->sk_peer_pid 	=	NULL;
3610 	sk->sk_peer_cred	=	NULL;
3611 	spin_lock_init(&sk->sk_peer_lock);
3612 
3613 	sk->sk_write_pending	=	0;
3614 	sk->sk_rcvlowat		=	1;
3615 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3616 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3617 
3618 	sk->sk_stamp = SK_DEFAULT_STAMP;
3619 #if BITS_PER_LONG==32
3620 	seqlock_init(&sk->sk_stamp_seq);
3621 #endif
3622 	atomic_set(&sk->sk_zckey, 0);
3623 
3624 #ifdef CONFIG_NET_RX_BUSY_POLL
3625 	sk->sk_napi_id		=	0;
3626 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3627 #endif
3628 
3629 	sk->sk_max_pacing_rate = ~0UL;
3630 	sk->sk_pacing_rate = ~0UL;
3631 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3632 	sk->sk_incoming_cpu = -1;
3633 
3634 	sk_rx_queue_clear(sk);
3635 	/*
3636 	 * Before updating sk_refcnt, we must commit prior changes to memory
3637 	 * (Documentation/RCU/rculist_nulls.rst for details)
3638 	 */
3639 	smp_wmb();
3640 	refcount_set(&sk->sk_refcnt, 1);
3641 	atomic_set(&sk->sk_drops, 0);
3642 }
3643 EXPORT_SYMBOL(sock_init_data_uid);
3644 
3645 void sock_init_data(struct socket *sock, struct sock *sk)
3646 {
3647 	kuid_t uid = sock ?
3648 		SOCK_INODE(sock)->i_uid :
3649 		make_kuid(sock_net(sk)->user_ns, 0);
3650 
3651 	sock_init_data_uid(sock, sk, uid);
3652 }
3653 EXPORT_SYMBOL(sock_init_data);
3654 
3655 void lock_sock_nested(struct sock *sk, int subclass)
3656 {
3657 	/* The sk_lock has mutex_lock() semantics here. */
3658 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3659 
3660 	might_sleep();
3661 	spin_lock_bh(&sk->sk_lock.slock);
3662 	if (sock_owned_by_user_nocheck(sk))
3663 		__lock_sock(sk);
3664 	sk->sk_lock.owned = 1;
3665 	spin_unlock_bh(&sk->sk_lock.slock);
3666 }
3667 EXPORT_SYMBOL(lock_sock_nested);
3668 
release_sock(struct sock * sk)3669 void release_sock(struct sock *sk)
3670 {
3671 	spin_lock_bh(&sk->sk_lock.slock);
3672 	if (sk->sk_backlog.tail)
3673 		__release_sock(sk);
3674 
3675 	if (sk->sk_prot->release_cb)
3676 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3677 				     tcp_release_cb, sk);
3678 
3679 	sock_release_ownership(sk);
3680 	if (waitqueue_active(&sk->sk_lock.wq))
3681 		wake_up(&sk->sk_lock.wq);
3682 	spin_unlock_bh(&sk->sk_lock.slock);
3683 }
3684 EXPORT_SYMBOL(release_sock);
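
/*
 * Illustrative sketch: process-context code serialises against the softirq
 * receive path with lock_sock()/release_sock(); as seen above, release_sock()
 * also processes the backlog that accumulated while the lock was owned.  The
 * helper below is hypothetical.
 *
 *	static int example_set_rcvlowat(struct sock *sk, int val)
 *	{
 *		lock_sock(sk);
 *		WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
 *		release_sock(sk);
 *		return 0;
 *	}
 */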

bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sock_owned_by_user_nocheck(sk)) {
		/*
		 * Fast path return with bottom halves disabled and
		 * sock::sk_lock.slock held.
		 *
		 * The 'mutex' is not contended and holding
		 * sock::sk_lock.slock prevents all other lockers from
		 * proceeding, so the corresponding unlock_sock_fast() can
		 * avoid the slow path of release_sock() completely and
		 * just release slock.
		 *
		 * From a semantic POV this is equivalent to 'acquiring'
		 * the 'mutex', hence the corresponding lockdep
		 * mutex_release() has to happen in the fast path of
		 * unlock_sock_fast().
		 */
		return false;
	}

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	__acquire(&sk->sk_lock.slock);
	spin_unlock_bh(&sk->sk_lock.slock);
	return true;
}
EXPORT_SYMBOL(__lock_sock_fast);
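
/*
 * Illustrative sketch: callers use the lock_sock_fast()/unlock_sock_fast()
 * wrappers (see include/net/sock.h) and must pass the returned 'slow' flag
 * back, so the slow path releases the owned lock while the fast path only
 * drops slock.  The surrounding function is hypothetical.
 *
 *	static void example_drain_receive_queue(struct sock *sk)
 *	{
 *		bool slow = lock_sock_fast(sk);
 *
 *		__skb_queue_purge(&sk->sk_receive_queue);
 *		unlock_sock_fast(sk, slow);
 *	}
 */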

int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();
		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);
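
/*
 * Illustrative userspace sketch: sock_gettstamp() is the backend for the
 * SIOCGSTAMP/SIOCGSTAMPNS ioctls, which report the timestamp of the last
 * packet received on the socket; the ioctl fails with ENOENT if nothing has
 * been timestamped yet.  The snippet below is only an example.
 *
 *	struct timeval tv;
 *
 *	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *		printf("last packet: %ld.%06ld\n",
 *		       (long)tv.tv_sec, (long)tv.tv_usec);
 */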

void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * We just set one of the two flags that require net
		 * time stamping, but time stamping might already be
		 * enabled because of the other one.
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
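
/*
 * Illustrative userspace sketch: error-queue payloads produced above are read
 * with recvmsg(MSG_ERRQUEUE), and the sock_extended_err arrives as a control
 * message whose level/type depend on the protocol (e.g. IP_RECVERR).  This is
 * only an example of the calling convention.
 *
 *	char cbuf[CMSG_SPACE(sizeof(struct sock_extended_err))];
 *	struct msghdr msg = {
 *		.msg_control	= cbuf,
 *		.msg_controllen	= sizeof(cbuf),
 *	};
 *
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0) {
 *		struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *		struct sock_extended_err *ee = (void *)CMSG_DATA(cm);
 *		// ee->ee_errno, ee->ee_origin, ... describe the error
 *	}
 */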

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what is the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
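
/*
 * Illustrative userspace sketch for the SO_ERROR convention described in the
 * comment above: a pending asynchronous error (e.g. from a non-blocking
 * connect()) is fetched and cleared with getsockopt(SO_ERROR).
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
 *		fprintf(stderr, "async error: %s\n", strerror(err));
 */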

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network stack still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	if (sk->sk_socket)
		sk->sk_socket->sk = NULL;

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight, because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * We also still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy()
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

int sock_inuse_get(struct net *net)
{
	int cpu, res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;

	return res;
}

EXPORT_SYMBOL_GPL(sock_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
	if (net->core.prot_inuse == NULL)
		return -ENOMEM;
	return 0;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);

static int assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return -ENOSPC;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
	return 0;
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
}

#endif

static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

	if (!twsk_prot)
		return 0;

	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
					      prot->name);
	if (!twsk_prot->twsk_slab_name)
		return -ENOMEM;

	twsk_prot->twsk_slab =
		kmem_cache_create(twsk_prot->twsk_slab_name,
				  twsk_prot->twsk_obj_size, 0,
				  SLAB_ACCOUNT | prot->slab_flags,
				  NULL);
	if (!twsk_prot->twsk_slab) {
		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}

	return 0;
}

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);
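
/*
 * Illustrative sketch (hypothetical example_proto, not part of this file):
 * protocol modules register a struct proto at init time and unregister it on
 * exit.  With alloc_slab != 0, proto_register() also creates the sock kmem
 * cache (and the request/timewait caches when ->rsk_prot/->twsk_prot are
 * set), as implemented above.
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 */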

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);

int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}


static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		return true;

	if (sk_is_udp(sk) &&
	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
		return true;

	return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);

/* Copy 'size' bytes from userspace into karg, run the protocol ioctl and,
 * on success, copy 'size' bytes of the result back to userspace.
 */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
		     void __user *arg, void *karg, size_t size)
{
	int ret;

	if (copy_from_user(karg, arg, size))
		return -EFAULT;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
	if (ret)
		return ret;

	if (copy_to_user(arg, karg, size))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);

/* This is the most common ioctl prep function: no input is copied from
 * userspace, and the result (4 bytes) is copied back to userspace if the
 * ioctl() returns successfully.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int ret, karg = 0;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
	if (ret)
		return ret;

	return put_user(karg, (int __user *)arg);
}

/* A wrapper around sock ioctls, which copies the data from userspace
 * (depending on the protocol/ioctl) and copies the result back to userspace.
 * The main motivation for this function is to pass kernel memory to the
 * protocol ioctl callbacks instead of userspace memory.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int rc = 1;

	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
		rc = ipmr_sk_ioctl(sk, cmd, arg);
	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
		rc = ip6mr_sk_ioctl(sk, cmd, arg);
	else if (sk_is_phonet(sk))
		rc = phonet_sk_ioctl(sk, cmd, arg);

	/* If the ioctl was processed, return its value */
	if (rc <= 0)
		return rc;

	/* Otherwise call the default handler */
	return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
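
/*
 * Illustrative sketch (hypothetical handler): with sk_ioctl() in place, a
 * protocol's ->ioctl() callback operates on kernel memory only; it fills
 * *karg and returns 0, and sock_ioctl_out() above copies the value back to
 * userspace.  This mirrors the calling convention used by the INET
 * protocols.
 *
 *	static int example_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		switch (cmd) {
 *		case SIOCOUTQ:
 *			*karg = sk_wmem_alloc_get(sk);
 *			return 0;
 *		default:
 *			return -ENOIOCTLCMD;
 *		}
 *	}
 */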

static int __init sock_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
	return 0;
}

core_initcall(sock_struct_check);