1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 #include <trace/hooks/sched.h>
139 #include <trace/hooks/net.h>
140 
141 #include <net/tcp.h>
142 #include <net/busy_poll.h>
143 
144 #include <linux/ethtool.h>
145 
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148 
149 static void sock_inuse_add(struct net *net, int val);
150 
151 /**
152  * sk_ns_capable - General socket capability test
153  * @sk: Socket to use a capability on or through
154  * @user_ns: The user namespace of the capability to use
155  * @cap: The capability to use
156  *
157  * Test to see if the opener of the socket had the capability @cap when
158  * the socket was created and if the current process has the capability
159  * @cap in the user namespace @user_ns.
160  */
161 bool sk_ns_capable(const struct sock *sk,
162 		   struct user_namespace *user_ns, int cap)
163 {
164 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 		ns_capable(user_ns, cap);
166 }
167 EXPORT_SYMBOL(sk_ns_capable);
168 
169 /**
170  * sk_capable - Socket global capability test
171  * @sk: Socket to use a capability on or through
172  * @cap: The global capability to use
173  *
174  * Test to see if the opener of the socket had the capability @cap when
175  * the socket was created and if the current process has the capability
176  * @cap in all user namespaces.
177  */
178 bool sk_capable(const struct sock *sk, int cap)
179 {
180 	return sk_ns_capable(sk, &init_user_ns, cap);
181 }
182 EXPORT_SYMBOL(sk_capable);
183 
184 /**
185  * sk_net_capable - Network namespace socket capability test
186  * @sk: Socket to use a capability on or through
187  * @cap: The capability to use
188  *
189  * Test to see if the opener of the socket had the capability @cap when the
190  * socket was created and if the current process has the capability @cap over
191  * the network namespace the socket is a member of.
192  */
193 bool sk_net_capable(const struct sock *sk, int cap)
194 {
195 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 }
197 EXPORT_SYMBOL(sk_net_capable);
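
/*
 * Illustrative sketch (not part of the original file): how a protocol
 * handler might use the capability helpers above to gate a privileged
 * operation. The function name is hypothetical.
 */
static int example_require_net_admin(struct sock *sk)
{
	/* Checks both that the socket opener had CAP_NET_ADMIN when the
	 * socket was created and that the current task has it in the
	 * socket's network namespace, as sk_net_capable() documents.
	 */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;
	return 0;
}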
198 
199 /*
200  * Each address family might have different locking rules, so we have
201  * one slock key per address family and separate keys for internal and
202  * userspace sockets.
203  */
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
208 
209 /*
210  * Make lock validator output more readable. (we pre-construct these
211  * strings build-time, so that runtime initialization of socket
212  * locks is fast):
213  */
214 
215 #define _sock_locks(x)						  \
216   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
217   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
218   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
219   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
220   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
221   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
222   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
223   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
224   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
225   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
226   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
227   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
228   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
229   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
230   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
231   x "AF_MCTP"  , \
232   x "AF_MAX"
233 
234 static const char *const af_family_key_strings[AF_MAX+1] = {
235 	_sock_locks("sk_lock-")
236 };
237 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
238 	_sock_locks("slock-")
239 };
240 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
241 	_sock_locks("clock-")
242 };
243 
244 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
245 	_sock_locks("k-sk_lock-")
246 };
247 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
248 	_sock_locks("k-slock-")
249 };
250 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
251 	_sock_locks("k-clock-")
252 };
253 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
254 	_sock_locks("rlock-")
255 };
256 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
257 	_sock_locks("wlock-")
258 };
259 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
260 	_sock_locks("elock-")
261 };
262 
263 /*
264  * sk_callback_lock and sk queues locking rules are per-address-family,
265  * so split the lock classes by using a per-AF key:
266  */
267 static struct lock_class_key af_callback_keys[AF_MAX];
268 static struct lock_class_key af_rlock_keys[AF_MAX];
269 static struct lock_class_key af_wlock_keys[AF_MAX];
270 static struct lock_class_key af_elock_keys[AF_MAX];
271 static struct lock_class_key af_kern_callback_keys[AF_MAX];
272 
273 /* Run time adjustable parameters. */
274 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
275 EXPORT_SYMBOL(sysctl_wmem_max);
276 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
277 EXPORT_SYMBOL(sysctl_rmem_max);
278 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
279 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
280 
281 /* Maximal space eaten by iovec or ancillary data plus some space */
282 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
283 EXPORT_SYMBOL(sysctl_optmem_max);
284 
285 int sysctl_tstamp_allow_data __read_mostly = 1;
286 
287 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
288 EXPORT_SYMBOL_GPL(memalloc_socks_key);
289 
290 /**
291  * sk_set_memalloc - sets %SOCK_MEMALLOC
292  * @sk: socket to set it on
293  *
294  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
295  * It's the responsibility of the admin to adjust min_free_kbytes
296  * to meet the requirements.
297  */
298 void sk_set_memalloc(struct sock *sk)
299 {
300 	sock_set_flag(sk, SOCK_MEMALLOC);
301 	sk->sk_allocation |= __GFP_MEMALLOC;
302 	static_branch_inc(&memalloc_socks_key);
303 }
304 EXPORT_SYMBOL_GPL(sk_set_memalloc);
305 
306 void sk_clear_memalloc(struct sock *sk)
307 {
308 	sock_reset_flag(sk, SOCK_MEMALLOC);
309 	sk->sk_allocation &= ~__GFP_MEMALLOC;
310 	static_branch_dec(&memalloc_socks_key);
311 
312 	/*
313 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
314 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
315 	 * it has rmem allocations due to the last swapfile being deactivated
316 	 * but there is a risk that the socket is unusable due to exceeding
317 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
318 	 */
319 	sk_mem_reclaim(sk);
320 }
321 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
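
/*
 * Illustrative sketch (not part of the original file): a swap-over-network
 * style user marking its transport socket as allowed to dip into emergency
 * reserves for the lifetime of the swap device. Names are hypothetical.
 */
static void example_swap_channel_start(struct sock *sk)
{
	sk_set_memalloc(sk);		/* sets SOCK_MEMALLOC + __GFP_MEMALLOC */
}

static void example_swap_channel_stop(struct sock *sk)
{
	sk_clear_memalloc(sk);		/* also reclaims rmem via sk_mem_reclaim() */
}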
322 
323 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
324 {
325 	int ret;
326 	unsigned int noreclaim_flag;
327 
328 	/* these should have been dropped before queueing */
329 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
330 
331 	noreclaim_flag = memalloc_noreclaim_save();
332 	ret = sk->sk_backlog_rcv(sk, skb);
333 	memalloc_noreclaim_restore(noreclaim_flag);
334 
335 	return ret;
336 }
337 EXPORT_SYMBOL(__sk_backlog_rcv);
338 
339 void sk_error_report(struct sock *sk)
340 {
341 	sk->sk_error_report(sk);
342 
343 	switch (sk->sk_family) {
344 	case AF_INET:
345 		fallthrough;
346 	case AF_INET6:
347 		trace_inet_sk_error_report(sk);
348 		break;
349 	default:
350 		break;
351 	}
352 }
353 EXPORT_SYMBOL(sk_error_report);
354 
355 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
356 {
357 	struct __kernel_sock_timeval tv;
358 
359 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
360 		tv.tv_sec = 0;
361 		tv.tv_usec = 0;
362 	} else {
363 		tv.tv_sec = timeo / HZ;
364 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
365 	}
366 
367 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
368 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
369 		*(struct old_timeval32 *)optval = tv32;
370 		return sizeof(tv32);
371 	}
372 
373 	if (old_timeval) {
374 		struct __kernel_old_timeval old_tv;
375 		old_tv.tv_sec = tv.tv_sec;
376 		old_tv.tv_usec = tv.tv_usec;
377 		*(struct __kernel_old_timeval *)optval = old_tv;
378 		return sizeof(old_tv);
379 	}
380 
381 	*(struct __kernel_sock_timeval *)optval = tv;
382 	return sizeof(tv);
383 }
384 
385 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
386 			    bool old_timeval)
387 {
388 	struct __kernel_sock_timeval tv;
389 
390 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
391 		struct old_timeval32 tv32;
392 
393 		if (optlen < sizeof(tv32))
394 			return -EINVAL;
395 
396 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
397 			return -EFAULT;
398 		tv.tv_sec = tv32.tv_sec;
399 		tv.tv_usec = tv32.tv_usec;
400 	} else if (old_timeval) {
401 		struct __kernel_old_timeval old_tv;
402 
403 		if (optlen < sizeof(old_tv))
404 			return -EINVAL;
405 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
406 			return -EFAULT;
407 		tv.tv_sec = old_tv.tv_sec;
408 		tv.tv_usec = old_tv.tv_usec;
409 	} else {
410 		if (optlen < sizeof(tv))
411 			return -EINVAL;
412 		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
413 			return -EFAULT;
414 	}
415 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
416 		return -EDOM;
417 
418 	if (tv.tv_sec < 0) {
419 		static int warned __read_mostly;
420 
421 		*timeo_p = 0;
422 		if (warned < 10 && net_ratelimit()) {
423 			warned++;
424 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
425 				__func__, current->comm, task_pid_nr(current));
426 		}
427 		return 0;
428 	}
429 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
430 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
431 		return 0;
432 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
433 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
434 	return 0;
435 }
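
/*
 * Worked example (added for illustration, not part of the original file):
 * with HZ == 1000, a userspace timeval of { .tv_sec = 2, .tv_usec = 500000 }
 * passed via SO_RCVTIMEO is converted by sock_set_timeout() to
 *
 *	*timeo_p = 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ)
 *	         = 2000 + 500 = 2500 jiffies,
 *
 * while { 0, 0 } leaves MAX_SCHEDULE_TIMEOUT in place, i.e. block forever.
 */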
436 
437 static bool sock_needs_netstamp(const struct sock *sk)
438 {
439 	switch (sk->sk_family) {
440 	case AF_UNSPEC:
441 	case AF_UNIX:
442 		return false;
443 	default:
444 		return true;
445 	}
446 }
447 
448 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
449 {
450 	if (sk->sk_flags & flags) {
451 		sk->sk_flags &= ~flags;
452 		if (sock_needs_netstamp(sk) &&
453 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
454 			net_disable_timestamp();
455 	}
456 }
457 
458 
459 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
460 {
461 	unsigned long flags;
462 	struct sk_buff_head *list = &sk->sk_receive_queue;
463 
464 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
465 		atomic_inc(&sk->sk_drops);
466 		trace_sock_rcvqueue_full(sk, skb);
467 		return -ENOMEM;
468 	}
469 
470 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
471 		atomic_inc(&sk->sk_drops);
472 		return -ENOBUFS;
473 	}
474 
475 	skb->dev = NULL;
476 	skb_set_owner_r(skb, sk);
477 
478 	/* We escape from the RCU-protected region, so make sure we don't
479 	 * leak a non-refcounted dst.
480 	 */
481 	skb_dst_force(skb);
482 
483 	spin_lock_irqsave(&list->lock, flags);
484 	sock_skb_set_dropcount(sk, skb);
485 	__skb_queue_tail(list, skb);
486 	spin_unlock_irqrestore(&list->lock, flags);
487 
488 	if (!sock_flag(sk, SOCK_DEAD))
489 		sk->sk_data_ready(sk);
490 	return 0;
491 }
492 EXPORT_SYMBOL(__sock_queue_rcv_skb);
493 
494 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
495 {
496 	int err;
497 
498 	err = sk_filter(sk, skb);
499 	if (err)
500 		return err;
501 
502 	return __sock_queue_rcv_skb(sk, skb);
503 }
504 EXPORT_SYMBOL(sock_queue_rcv_skb);
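
/*
 * Illustrative sketch (not part of the original file): the usual calling
 * pattern in a protocol's receive path. sock_queue_rcv_skb() does not free
 * the skb on failure, so the caller is expected to. Name is hypothetical.
 */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sock_queue_rcv_skb(sk, skb) < 0) {
		/* the socket filter rejected it or rcvbuf/rmem limits were hit */
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}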
505 
506 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
507 		     const int nested, unsigned int trim_cap, bool refcounted)
508 {
509 	int rc = NET_RX_SUCCESS;
510 
511 	if (sk_filter_trim_cap(sk, skb, trim_cap))
512 		goto discard_and_relse;
513 
514 	skb->dev = NULL;
515 
516 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
517 		atomic_inc(&sk->sk_drops);
518 		goto discard_and_relse;
519 	}
520 	if (nested)
521 		bh_lock_sock_nested(sk);
522 	else
523 		bh_lock_sock(sk);
524 	if (!sock_owned_by_user(sk)) {
525 		/*
526 		 * trylock + unlock semantics:
527 		 */
528 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
529 
530 		rc = sk_backlog_rcv(sk, skb);
531 
532 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
533 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
534 		bh_unlock_sock(sk);
535 		atomic_inc(&sk->sk_drops);
536 		goto discard_and_relse;
537 	}
538 
539 	bh_unlock_sock(sk);
540 out:
541 	if (refcounted)
542 		sock_put(sk);
543 	return rc;
544 discard_and_relse:
545 	kfree_skb(skb);
546 	goto out;
547 }
548 EXPORT_SYMBOL(__sk_receive_skb);
549 
550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
551 							  u32));
552 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
553 							   u32));
554 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
555 {
556 	struct dst_entry *dst = __sk_dst_get(sk);
557 
558 	if (dst && dst->obsolete &&
559 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
560 			       dst, cookie) == NULL) {
561 		sk_tx_queue_clear(sk);
562 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
563 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
564 		dst_release(dst);
565 		return NULL;
566 	}
567 
568 	return dst;
569 }
570 EXPORT_SYMBOL(__sk_dst_check);
571 
572 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
573 {
574 	struct dst_entry *dst = sk_dst_get(sk);
575 
576 	if (dst && dst->obsolete &&
577 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
578 			       dst, cookie) == NULL) {
579 		sk_dst_reset(sk);
580 		dst_release(dst);
581 		return NULL;
582 	}
583 
584 	return dst;
585 }
586 EXPORT_SYMBOL(sk_dst_check);
587 
588 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
589 {
590 	int ret = -ENOPROTOOPT;
591 #ifdef CONFIG_NETDEVICES
592 	struct net *net = sock_net(sk);
593 
594 	/* Sorry... */
595 	ret = -EPERM;
596 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
597 		goto out;
598 
599 	ret = -EINVAL;
600 	if (ifindex < 0)
601 		goto out;
602 
603 	sk->sk_bound_dev_if = ifindex;
604 	if (sk->sk_prot->rehash)
605 		sk->sk_prot->rehash(sk);
606 	sk_dst_reset(sk);
607 
608 	ret = 0;
609 
610 out:
611 #endif
612 
613 	return ret;
614 }
615 
616 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
617 {
618 	int ret;
619 
620 	if (lock_sk)
621 		lock_sock(sk);
622 	ret = sock_bindtoindex_locked(sk, ifindex);
623 	if (lock_sk)
624 		release_sock(sk);
625 
626 	return ret;
627 }
628 EXPORT_SYMBOL(sock_bindtoindex);
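
/*
 * Illustrative sketch (not part of the original file): a kernel caller
 * pinning a socket it created to a known interface index, letting
 * sock_bindtoindex() take and release the socket lock itself.
 */
static int example_pin_to_ifindex(struct sock *sk, int ifindex)
{
	return sock_bindtoindex(sk, ifindex, true);
}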
629 
630 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
631 {
632 	int ret = -ENOPROTOOPT;
633 #ifdef CONFIG_NETDEVICES
634 	struct net *net = sock_net(sk);
635 	char devname[IFNAMSIZ];
636 	int index;
637 
638 	ret = -EINVAL;
639 	if (optlen < 0)
640 		goto out;
641 
642 	/* Bind this socket to a particular device like "eth0",
643 	 * as specified in the passed interface name. If the
644 	 * name is "" or the option length is zero the socket
645 	 * is not bound.
646 	 */
647 	if (optlen > IFNAMSIZ - 1)
648 		optlen = IFNAMSIZ - 1;
649 	memset(devname, 0, sizeof(devname));
650 
651 	ret = -EFAULT;
652 	if (copy_from_sockptr(devname, optval, optlen))
653 		goto out;
654 
655 	index = 0;
656 	if (devname[0] != '\0') {
657 		struct net_device *dev;
658 
659 		rcu_read_lock();
660 		dev = dev_get_by_name_rcu(net, devname);
661 		if (dev)
662 			index = dev->ifindex;
663 		rcu_read_unlock();
664 		ret = -ENODEV;
665 		if (!dev)
666 			goto out;
667 	}
668 
669 	return sock_bindtoindex(sk, index, true);
670 out:
671 #endif
672 
673 	return ret;
674 }
675 
676 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
677 				int __user *optlen, int len)
678 {
679 	int ret = -ENOPROTOOPT;
680 #ifdef CONFIG_NETDEVICES
681 	struct net *net = sock_net(sk);
682 	char devname[IFNAMSIZ];
683 
684 	if (sk->sk_bound_dev_if == 0) {
685 		len = 0;
686 		goto zero;
687 	}
688 
689 	ret = -EINVAL;
690 	if (len < IFNAMSIZ)
691 		goto out;
692 
693 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
694 	if (ret)
695 		goto out;
696 
697 	len = strlen(devname) + 1;
698 
699 	ret = -EFAULT;
700 	if (copy_to_user(optval, devname, len))
701 		goto out;
702 
703 zero:
704 	ret = -EFAULT;
705 	if (put_user(len, optlen))
706 		goto out;
707 
708 	ret = 0;
709 
710 out:
711 #endif
712 
713 	return ret;
714 }
715 
716 bool sk_mc_loop(struct sock *sk)
717 {
718 	if (dev_recursion_level())
719 		return false;
720 	if (!sk)
721 		return true;
722 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
723 	switch (READ_ONCE(sk->sk_family)) {
724 	case AF_INET:
725 		return inet_sk(sk)->mc_loop;
726 #if IS_ENABLED(CONFIG_IPV6)
727 	case AF_INET6:
728 		return inet6_sk(sk)->mc_loop;
729 #endif
730 	}
731 	WARN_ON_ONCE(1);
732 	return true;
733 }
734 EXPORT_SYMBOL(sk_mc_loop);
735 
736 void sock_set_reuseaddr(struct sock *sk)
737 {
738 	lock_sock(sk);
739 	sk->sk_reuse = SK_CAN_REUSE;
740 	release_sock(sk);
741 }
742 EXPORT_SYMBOL(sock_set_reuseaddr);
743 
744 void sock_set_reuseport(struct sock *sk)
745 {
746 	lock_sock(sk);
747 	sk->sk_reuseport = true;
748 	release_sock(sk);
749 }
750 EXPORT_SYMBOL(sock_set_reuseport);
751 
752 void sock_no_linger(struct sock *sk)
753 {
754 	lock_sock(sk);
755 	sk->sk_lingertime = 0;
756 	sock_set_flag(sk, SOCK_LINGER);
757 	release_sock(sk);
758 }
759 EXPORT_SYMBOL(sock_no_linger);
760 
761 void sock_set_priority(struct sock *sk, u32 priority)
762 {
763 	lock_sock(sk);
764 	sk->sk_priority = priority;
765 	release_sock(sk);
766 }
767 EXPORT_SYMBOL(sock_set_priority);
768 
769 void sock_set_sndtimeo(struct sock *sk, s64 secs)
770 {
771 	lock_sock(sk);
772 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
773 		sk->sk_sndtimeo = secs * HZ;
774 	else
775 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
776 	release_sock(sk);
777 }
778 EXPORT_SYMBOL(sock_set_sndtimeo);
779 
780 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
781 {
782 	if (val)  {
783 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
784 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
785 		sock_set_flag(sk, SOCK_RCVTSTAMP);
786 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
787 	} else {
788 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
789 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
790 	}
791 }
792 
793 void sock_enable_timestamps(struct sock *sk)
794 {
795 	lock_sock(sk);
796 	__sock_set_timestamps(sk, true, false, true);
797 	release_sock(sk);
798 }
799 EXPORT_SYMBOL(sock_enable_timestamps);
800 
801 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
802 {
803 	switch (optname) {
804 	case SO_TIMESTAMP_OLD:
805 		__sock_set_timestamps(sk, valbool, false, false);
806 		break;
807 	case SO_TIMESTAMP_NEW:
808 		__sock_set_timestamps(sk, valbool, true, false);
809 		break;
810 	case SO_TIMESTAMPNS_OLD:
811 		__sock_set_timestamps(sk, valbool, false, true);
812 		break;
813 	case SO_TIMESTAMPNS_NEW:
814 		__sock_set_timestamps(sk, valbool, true, true);
815 		break;
816 	}
817 }
818 
819 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
820 {
821 	struct net *net = sock_net(sk);
822 	struct net_device *dev = NULL;
823 	bool match = false;
824 	int *vclock_index;
825 	int i, num;
826 
827 	if (sk->sk_bound_dev_if)
828 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
829 
830 	if (!dev) {
831 		pr_err("%s: sock not bind to device\n", __func__);
832 		return -EOPNOTSUPP;
833 	}
834 
835 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
836 	dev_put(dev);
837 
838 	for (i = 0; i < num; i++) {
839 		if (*(vclock_index + i) == phc_index) {
840 			match = true;
841 			break;
842 		}
843 	}
844 
845 	if (num > 0)
846 		kfree(vclock_index);
847 
848 	if (!match)
849 		return -EINVAL;
850 
851 	sk->sk_bind_phc = phc_index;
852 
853 	return 0;
854 }
855 
856 int sock_set_timestamping(struct sock *sk, int optname,
857 			  struct so_timestamping timestamping)
858 {
859 	int val = timestamping.flags;
860 	int ret;
861 
862 	if (val & ~SOF_TIMESTAMPING_MASK)
863 		return -EINVAL;
864 
865 	if (val & SOF_TIMESTAMPING_OPT_ID &&
866 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
867 		if (sk->sk_protocol == IPPROTO_TCP &&
868 		    sk->sk_type == SOCK_STREAM) {
869 			if ((1 << sk->sk_state) &
870 			    (TCPF_CLOSE | TCPF_LISTEN))
871 				return -EINVAL;
872 			atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
873 		} else {
874 			atomic_set(&sk->sk_tskey, 0);
875 		}
876 	}
877 
878 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
879 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
880 		return -EINVAL;
881 
882 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
883 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
884 		if (ret)
885 			return ret;
886 	}
887 
888 	sk->sk_tsflags = val;
889 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
890 
891 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
892 		sock_enable_timestamp(sk,
893 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
894 	else
895 		sock_disable_timestamp(sk,
896 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
897 	return 0;
898 }
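
/*
 * Illustrative sketch (not part of the original file): the userspace side
 * of the SO_TIMESTAMPING handling above, using the extended argument that
 * carries a PHC vclock index (values here are examples only):
 *
 *	struct so_timestamping ts = {
 *		.flags    = SOF_TIMESTAMPING_RAW_HARDWARE |
 *			    SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 1,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * As sock_timestamping_bind_phc() enforces, SOF_TIMESTAMPING_BIND_PHC
 * requires the socket to already be bound to a device (e.g. via
 * SO_BINDTODEVICE) and the index to match one of that device's PHC vclocks.
 */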
899 
900 void sock_set_keepalive(struct sock *sk)
901 {
902 	lock_sock(sk);
903 	if (sk->sk_prot->keepalive)
904 		sk->sk_prot->keepalive(sk, true);
905 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
906 	release_sock(sk);
907 }
908 EXPORT_SYMBOL(sock_set_keepalive);
909 
910 static void __sock_set_rcvbuf(struct sock *sk, int val)
911 {
912 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
913 	 * as a negative value.
914 	 */
915 	val = min_t(int, val, INT_MAX / 2);
916 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
917 
918 	/* We double it on the way in to account for "struct sk_buff" etc.
919 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
920 	 * will allow that much actual data to be received on that socket.
921 	 *
922 	 * Applications are unaware that "struct sk_buff" and other overheads
923 	 * allocate from the receive buffer during socket buffer allocation.
924 	 *
925 	 * And after considering the possible alternatives, returning the value
926 	 * we actually used in getsockopt is the most desirable behavior.
927 	 */
928 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
929 }
930 
931 void sock_set_rcvbuf(struct sock *sk, int val)
932 {
933 	lock_sock(sk);
934 	__sock_set_rcvbuf(sk, val);
935 	release_sock(sk);
936 }
937 EXPORT_SYMBOL(sock_set_rcvbuf);
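
/*
 * Worked example (added for illustration, not part of the original file):
 * sock_set_rcvbuf(sk, 65536) locks the buffer size against auto-tuning and
 * stores max_t(int, 65536 * 2, SOCK_MIN_RCVBUF) == 131072 in sk->sk_rcvbuf;
 * the doubled value is what getsockopt(SO_RCVBUF) later reports.
 */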
938 
939 static void __sock_set_mark(struct sock *sk, u32 val)
940 {
941 	if (val != sk->sk_mark) {
942 		sk->sk_mark = val;
943 		sk_dst_reset(sk);
944 	}
945 }
946 
947 void sock_set_mark(struct sock *sk, u32 val)
948 {
949 	lock_sock(sk);
950 	__sock_set_mark(sk, val);
951 	release_sock(sk);
952 }
953 EXPORT_SYMBOL(sock_set_mark);
954 
955 /*
956  *	This is meant for all protocols to use and covers goings on
957  *	at the socket level. Everything here is generic.
958  */
959 
960 int sock_setsockopt(struct socket *sock, int level, int optname,
961 		    sockptr_t optval, unsigned int optlen)
962 {
963 	struct so_timestamping timestamping;
964 	struct sock_txtime sk_txtime;
965 	struct sock *sk = sock->sk;
966 	int val;
967 	int valbool;
968 	struct linger ling;
969 	int ret = 0;
970 
971 	/*
972 	 *	Options without arguments
973 	 */
974 
975 	if (optname == SO_BINDTODEVICE)
976 		return sock_setbindtodevice(sk, optval, optlen);
977 
978 	if (optlen < sizeof(int))
979 		return -EINVAL;
980 
981 	if (copy_from_sockptr(&val, optval, sizeof(val)))
982 		return -EFAULT;
983 
984 	valbool = val ? 1 : 0;
985 
986 	lock_sock(sk);
987 
988 	switch (optname) {
989 	case SO_DEBUG:
990 		if (val && !capable(CAP_NET_ADMIN))
991 			ret = -EACCES;
992 		else
993 			sock_valbool_flag(sk, SOCK_DBG, valbool);
994 		break;
995 	case SO_REUSEADDR:
996 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
997 		break;
998 	case SO_REUSEPORT:
999 		sk->sk_reuseport = valbool;
1000 		break;
1001 	case SO_TYPE:
1002 	case SO_PROTOCOL:
1003 	case SO_DOMAIN:
1004 	case SO_ERROR:
1005 		ret = -ENOPROTOOPT;
1006 		break;
1007 	case SO_DONTROUTE:
1008 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1009 		sk_dst_reset(sk);
1010 		break;
1011 	case SO_BROADCAST:
1012 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1013 		break;
1014 	case SO_SNDBUF:
1015 		/* Don't error on this; BSD doesn't, and if you think
1016 		 * about it, this is right. Otherwise apps have to
1017 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1018 		 * are treated in BSD as hints.
1019 		 */
1020 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1021 set_sndbuf:
1022 		/* Ensure val * 2 fits into an int, to prevent max_t()
1023 		 * from treating it as a negative value.
1024 		 */
1025 		val = min_t(int, val, INT_MAX / 2);
1026 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1027 		WRITE_ONCE(sk->sk_sndbuf,
1028 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1029 		/* Wake up sending tasks if we upped the value. */
1030 		sk->sk_write_space(sk);
1031 		break;
1032 
1033 	case SO_SNDBUFFORCE:
1034 		if (!capable(CAP_NET_ADMIN)) {
1035 			ret = -EPERM;
1036 			break;
1037 		}
1038 
1039 		/* No negative values (to prevent underflow, as val will be
1040 		 * multiplied by 2).
1041 		 */
1042 		if (val < 0)
1043 			val = 0;
1044 		goto set_sndbuf;
1045 
1046 	case SO_RCVBUF:
1047 		/* Don't error on this; BSD doesn't, and if you think
1048 		 * about it, this is right. Otherwise apps have to
1049 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1050 		 * are treated in BSD as hints.
1051 		 */
1052 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1053 		break;
1054 
1055 	case SO_RCVBUFFORCE:
1056 		if (!capable(CAP_NET_ADMIN)) {
1057 			ret = -EPERM;
1058 			break;
1059 		}
1060 
1061 		/* No negative values (to prevent underflow, as val will be
1062 		 * multiplied by 2).
1063 		 */
1064 		__sock_set_rcvbuf(sk, max(val, 0));
1065 		break;
1066 
1067 	case SO_KEEPALIVE:
1068 		if (sk->sk_prot->keepalive)
1069 			sk->sk_prot->keepalive(sk, valbool);
1070 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1071 		break;
1072 
1073 	case SO_OOBINLINE:
1074 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1075 		break;
1076 
1077 	case SO_NO_CHECK:
1078 		sk->sk_no_check_tx = valbool;
1079 		break;
1080 
1081 	case SO_PRIORITY:
1082 		if ((val >= 0 && val <= 6) ||
1083 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1084 			sk->sk_priority = val;
1085 		else
1086 			ret = -EPERM;
1087 		break;
1088 
1089 	case SO_LINGER:
1090 		if (optlen < sizeof(ling)) {
1091 			ret = -EINVAL;	/* 1003.1g */
1092 			break;
1093 		}
1094 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1095 			ret = -EFAULT;
1096 			break;
1097 		}
1098 		if (!ling.l_onoff)
1099 			sock_reset_flag(sk, SOCK_LINGER);
1100 		else {
1101 #if (BITS_PER_LONG == 32)
1102 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1103 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1104 			else
1105 #endif
1106 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1107 			sock_set_flag(sk, SOCK_LINGER);
1108 		}
1109 		break;
1110 
1111 	case SO_BSDCOMPAT:
1112 		break;
1113 
1114 	case SO_PASSCRED:
1115 		if (valbool)
1116 			set_bit(SOCK_PASSCRED, &sock->flags);
1117 		else
1118 			clear_bit(SOCK_PASSCRED, &sock->flags);
1119 		break;
1120 
1121 	case SO_TIMESTAMP_OLD:
1122 	case SO_TIMESTAMP_NEW:
1123 	case SO_TIMESTAMPNS_OLD:
1124 	case SO_TIMESTAMPNS_NEW:
1125 		sock_set_timestamp(sk, optname, valbool);
1126 		break;
1127 
1128 	case SO_TIMESTAMPING_NEW:
1129 	case SO_TIMESTAMPING_OLD:
1130 		if (optlen == sizeof(timestamping)) {
1131 			if (copy_from_sockptr(&timestamping, optval,
1132 					      sizeof(timestamping))) {
1133 				ret = -EFAULT;
1134 				break;
1135 			}
1136 		} else {
1137 			memset(&timestamping, 0, sizeof(timestamping));
1138 			timestamping.flags = val;
1139 		}
1140 		ret = sock_set_timestamping(sk, optname, timestamping);
1141 		break;
1142 
1143 	case SO_RCVLOWAT:
1144 		if (val < 0)
1145 			val = INT_MAX;
1146 		if (sock->ops->set_rcvlowat)
1147 			ret = sock->ops->set_rcvlowat(sk, val);
1148 		else
1149 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1150 		break;
1151 
1152 	case SO_RCVTIMEO_OLD:
1153 	case SO_RCVTIMEO_NEW:
1154 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1155 				       optlen, optname == SO_RCVTIMEO_OLD);
1156 		break;
1157 
1158 	case SO_SNDTIMEO_OLD:
1159 	case SO_SNDTIMEO_NEW:
1160 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1161 				       optlen, optname == SO_SNDTIMEO_OLD);
1162 		break;
1163 
1164 	case SO_ATTACH_FILTER: {
1165 		struct sock_fprog fprog;
1166 
1167 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1168 		if (!ret)
1169 			ret = sk_attach_filter(&fprog, sk);
1170 		break;
1171 	}
1172 	case SO_ATTACH_BPF:
1173 		ret = -EINVAL;
1174 		if (optlen == sizeof(u32)) {
1175 			u32 ufd;
1176 
1177 			ret = -EFAULT;
1178 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1179 				break;
1180 
1181 			ret = sk_attach_bpf(ufd, sk);
1182 		}
1183 		break;
1184 
1185 	case SO_ATTACH_REUSEPORT_CBPF: {
1186 		struct sock_fprog fprog;
1187 
1188 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1189 		if (!ret)
1190 			ret = sk_reuseport_attach_filter(&fprog, sk);
1191 		break;
1192 	}
1193 	case SO_ATTACH_REUSEPORT_EBPF:
1194 		ret = -EINVAL;
1195 		if (optlen == sizeof(u32)) {
1196 			u32 ufd;
1197 
1198 			ret = -EFAULT;
1199 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1200 				break;
1201 
1202 			ret = sk_reuseport_attach_bpf(ufd, sk);
1203 		}
1204 		break;
1205 
1206 	case SO_DETACH_REUSEPORT_BPF:
1207 		ret = reuseport_detach_prog(sk);
1208 		break;
1209 
1210 	case SO_DETACH_FILTER:
1211 		ret = sk_detach_filter(sk);
1212 		break;
1213 
1214 	case SO_LOCK_FILTER:
1215 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1216 			ret = -EPERM;
1217 		else
1218 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1219 		break;
1220 
1221 	case SO_PASSSEC:
1222 		if (valbool)
1223 			set_bit(SOCK_PASSSEC, &sock->flags);
1224 		else
1225 			clear_bit(SOCK_PASSSEC, &sock->flags);
1226 		break;
1227 	case SO_MARK:
1228 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1229 			ret = -EPERM;
1230 			break;
1231 		}
1232 
1233 		__sock_set_mark(sk, val);
1234 		break;
1235 	case SO_RCVMARK:
1236 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1237 		break;
1238 
1239 	case SO_RXQ_OVFL:
1240 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1241 		break;
1242 
1243 	case SO_WIFI_STATUS:
1244 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1245 		break;
1246 
1247 	case SO_PEEK_OFF:
1248 		if (sock->ops->set_peek_off)
1249 			ret = sock->ops->set_peek_off(sk, val);
1250 		else
1251 			ret = -EOPNOTSUPP;
1252 		break;
1253 
1254 	case SO_NOFCS:
1255 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1256 		break;
1257 
1258 	case SO_SELECT_ERR_QUEUE:
1259 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1260 		break;
1261 
1262 #ifdef CONFIG_NET_RX_BUSY_POLL
1263 	case SO_BUSY_POLL:
1264 		/* allow unprivileged users to decrease the value */
1265 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1266 			ret = -EPERM;
1267 		else {
1268 			if (val < 0)
1269 				ret = -EINVAL;
1270 			else
1271 				WRITE_ONCE(sk->sk_ll_usec, val);
1272 		}
1273 		break;
1274 	case SO_PREFER_BUSY_POLL:
1275 		if (valbool && !capable(CAP_NET_ADMIN))
1276 			ret = -EPERM;
1277 		else
1278 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1279 		break;
1280 	case SO_BUSY_POLL_BUDGET:
1281 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1282 			ret = -EPERM;
1283 		} else {
1284 			if (val < 0 || val > U16_MAX)
1285 				ret = -EINVAL;
1286 			else
1287 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1288 		}
1289 		break;
1290 #endif
1291 
1292 	case SO_MAX_PACING_RATE:
1293 		{
1294 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1295 
1296 		if (sizeof(ulval) != sizeof(val) &&
1297 		    optlen >= sizeof(ulval) &&
1298 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1299 			ret = -EFAULT;
1300 			break;
1301 		}
1302 		if (ulval != ~0UL)
1303 			cmpxchg(&sk->sk_pacing_status,
1304 				SK_PACING_NONE,
1305 				SK_PACING_NEEDED);
1306 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1307 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1308 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1309 		break;
1310 		}
1311 	case SO_INCOMING_CPU:
1312 		reuseport_update_incoming_cpu(sk, val);
1313 		break;
1314 
1315 	case SO_CNX_ADVICE:
1316 		if (val == 1)
1317 			dst_negative_advice(sk);
1318 		break;
1319 
1320 	case SO_ZEROCOPY:
1321 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1322 			if (!((sk->sk_type == SOCK_STREAM &&
1323 			       sk->sk_protocol == IPPROTO_TCP) ||
1324 			      (sk->sk_type == SOCK_DGRAM &&
1325 			       sk->sk_protocol == IPPROTO_UDP)))
1326 				ret = -ENOTSUPP;
1327 		} else if (sk->sk_family != PF_RDS) {
1328 			ret = -ENOTSUPP;
1329 		}
1330 		if (!ret) {
1331 			if (val < 0 || val > 1)
1332 				ret = -EINVAL;
1333 			else
1334 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1335 		}
1336 		break;
1337 
1338 	case SO_TXTIME:
1339 		if (optlen != sizeof(struct sock_txtime)) {
1340 			ret = -EINVAL;
1341 			break;
1342 		} else if (copy_from_sockptr(&sk_txtime, optval,
1343 			   sizeof(struct sock_txtime))) {
1344 			ret = -EFAULT;
1345 			break;
1346 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1347 			ret = -EINVAL;
1348 			break;
1349 		}
1350 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1351 		 * scheduler has enough safeguards.
1352 		 */
1353 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1354 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1355 			ret = -EPERM;
1356 			break;
1357 		}
1358 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1359 		sk->sk_clockid = sk_txtime.clockid;
1360 		sk->sk_txtime_deadline_mode =
1361 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1362 		sk->sk_txtime_report_errors =
1363 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1364 		break;
1365 
1366 	case SO_BINDTOIFINDEX:
1367 		ret = sock_bindtoindex_locked(sk, val);
1368 		break;
1369 
1370 	case SO_BUF_LOCK:
1371 		if (val & ~SOCK_BUF_LOCK_MASK) {
1372 			ret = -EINVAL;
1373 			break;
1374 		}
1375 		sk->sk_userlocks = val | (sk->sk_userlocks &
1376 					  ~SOCK_BUF_LOCK_MASK);
1377 		break;
1378 
1379 	default:
1380 		ret = -ENOPROTOOPT;
1381 		break;
1382 	}
1383 	release_sock(sk);
1384 	return ret;
1385 }
1386 EXPORT_SYMBOL(sock_setsockopt);
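
/*
 * Illustrative sketch (not part of the original file): what the option
 * handling above looks like from userspace for SO_SNDBUF. The request is
 * clamped to sysctl_wmem_max, doubled for skb overhead, and the doubled
 * value is returned by a subsequent getsockopt():
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &len);
 *
 * val is now 131072, assuming 65536 does not exceed wmem_max.
 */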
1387 
1388 static const struct cred *sk_get_peer_cred(struct sock *sk)
1389 {
1390 	const struct cred *cred;
1391 
1392 	spin_lock(&sk->sk_peer_lock);
1393 	cred = get_cred(sk->sk_peer_cred);
1394 	spin_unlock(&sk->sk_peer_lock);
1395 
1396 	return cred;
1397 }
1398 
1399 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1400 			  struct ucred *ucred)
1401 {
1402 	ucred->pid = pid_vnr(pid);
1403 	ucred->uid = ucred->gid = -1;
1404 	if (cred) {
1405 		struct user_namespace *current_ns = current_user_ns();
1406 
1407 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1408 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1409 	}
1410 }
1411 
1412 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1413 {
1414 	struct user_namespace *user_ns = current_user_ns();
1415 	int i;
1416 
1417 	for (i = 0; i < src->ngroups; i++)
1418 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1419 			return -EFAULT;
1420 
1421 	return 0;
1422 }
1423 
1424 int sock_getsockopt(struct socket *sock, int level, int optname,
1425 		    char __user *optval, int __user *optlen)
1426 {
1427 	struct sock *sk = sock->sk;
1428 
1429 	union {
1430 		int val;
1431 		u64 val64;
1432 		unsigned long ulval;
1433 		struct linger ling;
1434 		struct old_timeval32 tm32;
1435 		struct __kernel_old_timeval tm;
1436 		struct  __kernel_sock_timeval stm;
1437 		struct sock_txtime txtime;
1438 		struct so_timestamping timestamping;
1439 	} v;
1440 
1441 	int lv = sizeof(int);
1442 	int len;
1443 
1444 	if (get_user(len, optlen))
1445 		return -EFAULT;
1446 	if (len < 0)
1447 		return -EINVAL;
1448 
1449 	memset(&v, 0, sizeof(v));
1450 
1451 	switch (optname) {
1452 	case SO_DEBUG:
1453 		v.val = sock_flag(sk, SOCK_DBG);
1454 		break;
1455 
1456 	case SO_DONTROUTE:
1457 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1458 		break;
1459 
1460 	case SO_BROADCAST:
1461 		v.val = sock_flag(sk, SOCK_BROADCAST);
1462 		break;
1463 
1464 	case SO_SNDBUF:
1465 		v.val = READ_ONCE(sk->sk_sndbuf);
1466 		break;
1467 
1468 	case SO_RCVBUF:
1469 		v.val = READ_ONCE(sk->sk_rcvbuf);
1470 		break;
1471 
1472 	case SO_REUSEADDR:
1473 		v.val = sk->sk_reuse;
1474 		break;
1475 
1476 	case SO_REUSEPORT:
1477 		v.val = sk->sk_reuseport;
1478 		break;
1479 
1480 	case SO_KEEPALIVE:
1481 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1482 		break;
1483 
1484 	case SO_TYPE:
1485 		v.val = sk->sk_type;
1486 		break;
1487 
1488 	case SO_PROTOCOL:
1489 		v.val = sk->sk_protocol;
1490 		break;
1491 
1492 	case SO_DOMAIN:
1493 		v.val = sk->sk_family;
1494 		break;
1495 
1496 	case SO_ERROR:
1497 		v.val = -sock_error(sk);
1498 		if (v.val == 0)
1499 			v.val = xchg(&sk->sk_err_soft, 0);
1500 		break;
1501 
1502 	case SO_OOBINLINE:
1503 		v.val = sock_flag(sk, SOCK_URGINLINE);
1504 		break;
1505 
1506 	case SO_NO_CHECK:
1507 		v.val = sk->sk_no_check_tx;
1508 		break;
1509 
1510 	case SO_PRIORITY:
1511 		v.val = sk->sk_priority;
1512 		break;
1513 
1514 	case SO_LINGER:
1515 		lv		= sizeof(v.ling);
1516 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1517 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1518 		break;
1519 
1520 	case SO_BSDCOMPAT:
1521 		break;
1522 
1523 	case SO_TIMESTAMP_OLD:
1524 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1525 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1526 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1527 		break;
1528 
1529 	case SO_TIMESTAMPNS_OLD:
1530 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1531 		break;
1532 
1533 	case SO_TIMESTAMP_NEW:
1534 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1535 		break;
1536 
1537 	case SO_TIMESTAMPNS_NEW:
1538 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1539 		break;
1540 
1541 	case SO_TIMESTAMPING_OLD:
1542 	case SO_TIMESTAMPING_NEW:
1543 		lv = sizeof(v.timestamping);
1544 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1545 		 * returning the flags when they were set through the same option.
1546 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1547 		 */
1548 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1549 			v.timestamping.flags = sk->sk_tsflags;
1550 			v.timestamping.bind_phc = sk->sk_bind_phc;
1551 		}
1552 		break;
1553 
1554 	case SO_RCVTIMEO_OLD:
1555 	case SO_RCVTIMEO_NEW:
1556 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1557 		break;
1558 
1559 	case SO_SNDTIMEO_OLD:
1560 	case SO_SNDTIMEO_NEW:
1561 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1562 		break;
1563 
1564 	case SO_RCVLOWAT:
1565 		v.val = READ_ONCE(sk->sk_rcvlowat);
1566 		break;
1567 
1568 	case SO_SNDLOWAT:
1569 		v.val = 1;
1570 		break;
1571 
1572 	case SO_PASSCRED:
1573 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1574 		break;
1575 
1576 	case SO_PEERCRED:
1577 	{
1578 		struct ucred peercred;
1579 		if (len > sizeof(peercred))
1580 			len = sizeof(peercred);
1581 
1582 		spin_lock(&sk->sk_peer_lock);
1583 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1584 		spin_unlock(&sk->sk_peer_lock);
1585 
1586 		if (copy_to_user(optval, &peercred, len))
1587 			return -EFAULT;
1588 		goto lenout;
1589 	}
1590 
1591 	case SO_PEERGROUPS:
1592 	{
1593 		const struct cred *cred;
1594 		int ret, n;
1595 
1596 		cred = sk_get_peer_cred(sk);
1597 		if (!cred)
1598 			return -ENODATA;
1599 
1600 		n = cred->group_info->ngroups;
1601 		if (len < n * sizeof(gid_t)) {
1602 			len = n * sizeof(gid_t);
1603 			put_cred(cred);
1604 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1605 		}
1606 		len = n * sizeof(gid_t);
1607 
1608 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1609 		put_cred(cred);
1610 		if (ret)
1611 			return ret;
1612 		goto lenout;
1613 	}
1614 
1615 	case SO_PEERNAME:
1616 	{
1617 		char address[128];
1618 
1619 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1620 		if (lv < 0)
1621 			return -ENOTCONN;
1622 		if (lv < len)
1623 			return -EINVAL;
1624 		if (copy_to_user(optval, address, len))
1625 			return -EFAULT;
1626 		goto lenout;
1627 	}
1628 
1629 	/* Dubious BSD thing... Probably nobody even uses it, but
1630 	 * the UNIX standard wants it for whatever reason... -DaveM
1631 	 */
1632 	case SO_ACCEPTCONN:
1633 		v.val = sk->sk_state == TCP_LISTEN;
1634 		break;
1635 
1636 	case SO_PASSSEC:
1637 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1638 		break;
1639 
1640 	case SO_PEERSEC:
1641 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1642 
1643 	case SO_MARK:
1644 		v.val = sk->sk_mark;
1645 		break;
1646 
1647 	case SO_RCVMARK:
1648 		v.val = sock_flag(sk, SOCK_RCVMARK);
1649 		break;
1650 
1651 	case SO_RXQ_OVFL:
1652 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1653 		break;
1654 
1655 	case SO_WIFI_STATUS:
1656 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1657 		break;
1658 
1659 	case SO_PEEK_OFF:
1660 		if (!sock->ops->set_peek_off)
1661 			return -EOPNOTSUPP;
1662 
1663 		v.val = READ_ONCE(sk->sk_peek_off);
1664 		break;
1665 	case SO_NOFCS:
1666 		v.val = sock_flag(sk, SOCK_NOFCS);
1667 		break;
1668 
1669 	case SO_BINDTODEVICE:
1670 		return sock_getbindtodevice(sk, optval, optlen, len);
1671 
1672 	case SO_GET_FILTER:
1673 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1674 		if (len < 0)
1675 			return len;
1676 
1677 		goto lenout;
1678 
1679 	case SO_LOCK_FILTER:
1680 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1681 		break;
1682 
1683 	case SO_BPF_EXTENSIONS:
1684 		v.val = bpf_tell_extensions();
1685 		break;
1686 
1687 	case SO_SELECT_ERR_QUEUE:
1688 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1689 		break;
1690 
1691 #ifdef CONFIG_NET_RX_BUSY_POLL
1692 	case SO_BUSY_POLL:
1693 		v.val = READ_ONCE(sk->sk_ll_usec);
1694 		break;
1695 	case SO_PREFER_BUSY_POLL:
1696 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1697 		break;
1698 #endif
1699 
1700 	case SO_MAX_PACING_RATE:
1701 		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1702 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1703 			lv = sizeof(v.ulval);
1704 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1705 		} else {
1706 			/* 32bit version */
1707 			v.val = min_t(unsigned long, ~0U,
1708 				      READ_ONCE(sk->sk_max_pacing_rate));
1709 		}
1710 		break;
1711 
1712 	case SO_INCOMING_CPU:
1713 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1714 		break;
1715 
1716 	case SO_MEMINFO:
1717 	{
1718 		u32 meminfo[SK_MEMINFO_VARS];
1719 
1720 		sk_get_meminfo(sk, meminfo);
1721 
1722 		len = min_t(unsigned int, len, sizeof(meminfo));
1723 		if (copy_to_user(optval, &meminfo, len))
1724 			return -EFAULT;
1725 
1726 		goto lenout;
1727 	}
1728 
1729 #ifdef CONFIG_NET_RX_BUSY_POLL
1730 	case SO_INCOMING_NAPI_ID:
1731 		v.val = READ_ONCE(sk->sk_napi_id);
1732 
1733 		/* aggregate non-NAPI IDs down to 0 */
1734 		if (v.val < MIN_NAPI_ID)
1735 			v.val = 0;
1736 
1737 		break;
1738 #endif
1739 
1740 	case SO_COOKIE:
1741 		lv = sizeof(u64);
1742 		if (len < lv)
1743 			return -EINVAL;
1744 		v.val64 = sock_gen_cookie(sk);
1745 		break;
1746 
1747 	case SO_ZEROCOPY:
1748 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1749 		break;
1750 
1751 	case SO_TXTIME:
1752 		lv = sizeof(v.txtime);
1753 		v.txtime.clockid = sk->sk_clockid;
1754 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1755 				  SOF_TXTIME_DEADLINE_MODE : 0;
1756 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1757 				  SOF_TXTIME_REPORT_ERRORS : 0;
1758 		break;
1759 
1760 	case SO_BINDTOIFINDEX:
1761 		v.val = sk->sk_bound_dev_if;
1762 		break;
1763 
1764 	case SO_NETNS_COOKIE:
1765 		lv = sizeof(u64);
1766 		if (len != lv)
1767 			return -EINVAL;
1768 		v.val64 = sock_net(sk)->net_cookie;
1769 		break;
1770 
1771 	case SO_BUF_LOCK:
1772 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1773 		break;
1774 
1775 	default:
1776 		/* We implement the SO_SNDLOWAT etc to not be settable
1777 		 * (1003.1g 7).
1778 		 */
1779 		return -ENOPROTOOPT;
1780 	}
1781 
1782 	if (len > lv)
1783 		len = lv;
1784 	if (copy_to_user(optval, &v, len))
1785 		return -EFAULT;
1786 lenout:
1787 	if (put_user(len, optlen))
1788 		return -EFAULT;
1789 	return 0;
1790 }
1791 
1792 /*
1793  * Initialize an sk_lock.
1794  *
1795  * (We also register the sk_lock with the lock validator.)
1796  */
1797 static inline void sock_lock_init(struct sock *sk)
1798 {
1799 	if (sk->sk_kern_sock)
1800 		sock_lock_init_class_and_name(
1801 			sk,
1802 			af_family_kern_slock_key_strings[sk->sk_family],
1803 			af_family_kern_slock_keys + sk->sk_family,
1804 			af_family_kern_key_strings[sk->sk_family],
1805 			af_family_kern_keys + sk->sk_family);
1806 	else
1807 		sock_lock_init_class_and_name(
1808 			sk,
1809 			af_family_slock_key_strings[sk->sk_family],
1810 			af_family_slock_keys + sk->sk_family,
1811 			af_family_key_strings[sk->sk_family],
1812 			af_family_keys + sk->sk_family);
1813 }
1814 
1815 /*
1816  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1817  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1818  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1819  */
1820 static void sock_copy(struct sock *nsk, const struct sock *osk)
1821 {
1822 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1823 #ifdef CONFIG_SECURITY_NETWORK
1824 	void *sptr = nsk->sk_security;
1825 #endif
1826 
1827 	/* If we move sk_tx_queue_mapping out of the private section,
1828 	 * we must check if sk_tx_queue_clear() is called after
1829 	 * sock_copy() in sk_clone_lock().
1830 	 */
1831 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1832 		     offsetof(struct sock, sk_dontcopy_begin) ||
1833 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1834 		     offsetof(struct sock, sk_dontcopy_end));
1835 
1836 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1837 
1838 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1839 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1840 
1841 #ifdef CONFIG_SECURITY_NETWORK
1842 	nsk->sk_security = sptr;
1843 	security_sk_clone(osk, nsk);
1844 #endif
1845 }
1846 
1847 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1848 		int family)
1849 {
1850 	struct sock *sk;
1851 	struct kmem_cache *slab;
1852 
1853 	slab = prot->slab;
1854 	if (slab != NULL) {
1855 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1856 		if (!sk)
1857 			return sk;
1858 		if (want_init_on_alloc(priority))
1859 			sk_prot_clear_nulls(sk, prot->obj_size);
1860 	} else
1861 		sk = kmalloc(prot->obj_size, priority);
1862 
1863 	if (sk != NULL) {
1864 		if (security_sk_alloc(sk, family, priority))
1865 			goto out_free;
1866 
1867 		trace_android_rvh_sk_alloc(sk);
1868 
1869 		if (!try_module_get(prot->owner))
1870 			goto out_free_sec;
1871 	}
1872 
1873 	return sk;
1874 
1875 out_free_sec:
1876 	security_sk_free(sk);
1877 	trace_android_rvh_sk_free(sk);
1878 out_free:
1879 	if (slab != NULL)
1880 		kmem_cache_free(slab, sk);
1881 	else
1882 		kfree(sk);
1883 	return NULL;
1884 }
1885 
1886 static void sk_prot_free(struct proto *prot, struct sock *sk)
1887 {
1888 	struct kmem_cache *slab;
1889 	struct module *owner;
1890 
1891 	owner = prot->owner;
1892 	slab = prot->slab;
1893 
1894 	cgroup_sk_free(&sk->sk_cgrp_data);
1895 	mem_cgroup_sk_free(sk);
1896 	security_sk_free(sk);
1897 	trace_android_rvh_sk_free(sk);
1898 	if (slab != NULL)
1899 		kmem_cache_free(slab, sk);
1900 	else
1901 		kfree(sk);
1902 	module_put(owner);
1903 }
1904 
1905 /**
1906  *	sk_alloc - All socket objects are allocated here
1907  *	@net: the applicable net namespace
1908  *	@family: protocol family
1909  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1910  *	@prot: struct proto associated with this new sock instance
1911  *	@kern: is this to be a kernel socket?
1912  */
1913 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1914 		      struct proto *prot, int kern)
1915 {
1916 	struct sock *sk;
1917 
1918 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1919 	if (sk) {
1920 		sk->sk_family = family;
1921 		/*
1922 		 * See comment in struct sock definition to understand
1923 		 * why we need sk_prot_creator -acme
1924 		 */
1925 		sk->sk_prot = sk->sk_prot_creator = prot;
1926 		sk->sk_kern_sock = kern;
1927 		sock_lock_init(sk);
1928 		sk->sk_net_refcnt = kern ? 0 : 1;
1929 		if (likely(sk->sk_net_refcnt)) {
1930 			get_net(net);
1931 			sock_inuse_add(net, 1);
1932 		}
1933 
1934 		sock_net_set(sk, net);
1935 		refcount_set(&sk->sk_wmem_alloc, 1);
1936 
1937 		mem_cgroup_sk_alloc(sk);
1938 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1939 		sock_update_classid(&sk->sk_cgrp_data);
1940 		sock_update_netprioidx(&sk->sk_cgrp_data);
1941 		sk_tx_queue_clear(sk);
1942 	}
1943 
1944 	return sk;
1945 }
1946 EXPORT_SYMBOL(sk_alloc);
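
/* Illustrative sketch (not part of the original file): a protocol family's
 * ->create() handler typically pairs sk_alloc() with sock_init_data() and
 * protocol-private setup. "PF_MYFAMILY", "my_proto" and "my_sock_destruct"
 * are hypothetical names used only for illustration; error handling is
 * abbreviated.
 *
 *	static int my_create(struct net *net, struct socket *sock,
 *			     int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_MYFAMILY, GFP_KERNEL, &my_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock_init_data(sock, sk);	// attach sk to sock, set defaults
 *		sk->sk_destruct = my_sock_destruct;
 *		return 0;
 *	}
 */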
1947 
1948 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1949  * grace period. This is the case for UDP sockets and TCP listeners.
1950  */
1951 static void __sk_destruct(struct rcu_head *head)
1952 {
1953 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1954 	struct sk_filter *filter;
1955 
1956 	if (sk->sk_destruct)
1957 		sk->sk_destruct(sk);
1958 
1959 	filter = rcu_dereference_check(sk->sk_filter,
1960 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1961 	if (filter) {
1962 		sk_filter_uncharge(sk, filter);
1963 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1964 	}
1965 
1966 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1967 
1968 #ifdef CONFIG_BPF_SYSCALL
1969 	bpf_sk_storage_free(sk);
1970 #endif
1971 
1972 	if (atomic_read(&sk->sk_omem_alloc))
1973 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1974 			 __func__, atomic_read(&sk->sk_omem_alloc));
1975 
1976 	if (sk->sk_frag.page) {
1977 		put_page(sk->sk_frag.page);
1978 		sk->sk_frag.page = NULL;
1979 	}
1980 
1981 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1982 	put_cred(sk->sk_peer_cred);
1983 	put_pid(sk->sk_peer_pid);
1984 
1985 	if (likely(sk->sk_net_refcnt))
1986 		put_net(sock_net(sk));
1987 	sk_prot_free(sk->sk_prot_creator, sk);
1988 }
1989 
1990 void sk_destruct(struct sock *sk)
1991 {
1992 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1993 
1994 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1995 		reuseport_detach_sock(sk);
1996 		use_call_rcu = true;
1997 	}
1998 
1999 	if (use_call_rcu)
2000 		call_rcu(&sk->sk_rcu, __sk_destruct);
2001 	else
2002 		__sk_destruct(&sk->sk_rcu);
2003 }
2004 
2005 static void __sk_free(struct sock *sk)
2006 {
2007 	if (likely(sk->sk_net_refcnt))
2008 		sock_inuse_add(sock_net(sk), -1);
2009 
2010 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2011 		sock_diag_broadcast_destroy(sk);
2012 	else
2013 		sk_destruct(sk);
2014 }
2015 
2016 void sk_free(struct sock *sk)
2017 {
2018 	/*
2019 	 * We subtract one from sk_wmem_alloc and can tell whether
2020 	 * some packets are still in some tx queue.
2021 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later
2022 	 */
2023 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2024 		__sk_free(sk);
2025 }
2026 EXPORT_SYMBOL(sk_free);
2027 
2028 static void sk_init_common(struct sock *sk)
2029 {
2030 	skb_queue_head_init(&sk->sk_receive_queue);
2031 	skb_queue_head_init(&sk->sk_write_queue);
2032 	skb_queue_head_init(&sk->sk_error_queue);
2033 
2034 	rwlock_init(&sk->sk_callback_lock);
2035 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2036 			af_rlock_keys + sk->sk_family,
2037 			af_family_rlock_key_strings[sk->sk_family]);
2038 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2039 			af_wlock_keys + sk->sk_family,
2040 			af_family_wlock_key_strings[sk->sk_family]);
2041 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2042 			af_elock_keys + sk->sk_family,
2043 			af_family_elock_key_strings[sk->sk_family]);
2044 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2045 			af_callback_keys + sk->sk_family,
2046 			af_family_clock_key_strings[sk->sk_family]);
2047 }
2048 
2049 /**
2050  *	sk_clone_lock - clone a socket, and lock its clone
2051  *	@sk: the socket to clone
2052  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2053  *
2054  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2055  */
2056 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2057 {
2058 	struct proto *prot = READ_ONCE(sk->sk_prot);
2059 	struct sk_filter *filter;
2060 	bool is_charged = true;
2061 	struct sock *newsk;
2062 
2063 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2064 	if (!newsk)
2065 		goto out;
2066 
2067 	sock_copy(newsk, sk);
2068 
2069 	newsk->sk_prot_creator = prot;
2070 
2071 	/* SANITY */
2072 	if (likely(newsk->sk_net_refcnt)) {
2073 		get_net(sock_net(newsk));
2074 		sock_inuse_add(sock_net(newsk), 1);
2075 	}
2076 	sk_node_init(&newsk->sk_node);
2077 	sock_lock_init(newsk);
2078 	bh_lock_sock(newsk);
2079 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2080 	newsk->sk_backlog.len = 0;
2081 
2082 	atomic_set(&newsk->sk_rmem_alloc, 0);
2083 
2084 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2085 	refcount_set(&newsk->sk_wmem_alloc, 1);
2086 
2087 	atomic_set(&newsk->sk_omem_alloc, 0);
2088 	sk_init_common(newsk);
2089 
2090 	newsk->sk_dst_cache	= NULL;
2091 	newsk->sk_dst_pending_confirm = 0;
2092 	newsk->sk_wmem_queued	= 0;
2093 	newsk->sk_forward_alloc = 0;
2094 	atomic_set(&newsk->sk_drops, 0);
2095 	newsk->sk_send_head	= NULL;
2096 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2097 	atomic_set(&newsk->sk_zckey, 0);
2098 
2099 	sock_reset_flag(newsk, SOCK_DONE);
2100 
2101 	/* sk->sk_memcg will be populated at accept() time */
2102 	newsk->sk_memcg = NULL;
2103 
2104 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2105 
2106 	rcu_read_lock();
2107 	filter = rcu_dereference(sk->sk_filter);
2108 	if (filter != NULL)
2109 		/* though it's an empty new sock, the charging may fail
2110 		 * if sysctl_optmem_max was changed between creation of
2111 		 * original socket and cloning
2112 		 */
2113 		is_charged = sk_filter_charge(newsk, filter);
2114 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2115 	rcu_read_unlock();
2116 
2117 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2118 		/* We need to make sure that we don't uncharge the new
2119 		 * socket if we couldn't charge it in the first place
2120 		 * as otherwise we uncharge the parent's filter.
2121 		 */
2122 		if (!is_charged)
2123 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2124 		sk_free_unlock_clone(newsk);
2125 		newsk = NULL;
2126 		goto out;
2127 	}
2128 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2129 
2130 	if (bpf_sk_storage_clone(sk, newsk)) {
2131 		sk_free_unlock_clone(newsk);
2132 		newsk = NULL;
2133 		goto out;
2134 	}
2135 
2136 	/* Clear sk_user_data if parent had the pointer tagged
2137 	 * as not suitable for copying when cloning.
2138 	 */
2139 	if (sk_user_data_is_nocopy(newsk))
2140 		newsk->sk_user_data = NULL;
2141 
2142 	newsk->sk_err	   = 0;
2143 	newsk->sk_err_soft = 0;
2144 	newsk->sk_priority = 0;
2145 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2146 
2147 	/* Before updating sk_refcnt, we must commit prior changes to memory
2148 	 * (Documentation/RCU/rculist_nulls.rst for details)
2149 	 */
2150 	smp_wmb();
2151 	refcount_set(&newsk->sk_refcnt, 2);
2152 
2153 	/* Increment the counter in the same struct proto as the master
2154 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2155 	 * is the same as sk->sk_prot->socks, as this field was copied
2156 	 * with memcpy).
2157 	 *
2158 	 * This _changes_ the previous behaviour, where
2159 	 * tcp_create_openreq_child was always incrementing the
2160 	 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
2161 	 * to be taken into account in all callers. -acme
2162 	 */
2163 	sk_refcnt_debug_inc(newsk);
2164 	sk_set_socket(newsk, NULL);
2165 	sk_tx_queue_clear(newsk);
2166 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2167 
2168 	if (newsk->sk_prot->sockets_allocated)
2169 		sk_sockets_allocated_inc(newsk);
2170 
2171 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2172 		net_enable_timestamp();
2173 out:
2174 	return newsk;
2175 }
2176 EXPORT_SYMBOL_GPL(sk_clone_lock);
2177 
2178 void sk_free_unlock_clone(struct sock *sk)
2179 {
2180 	/* It is still a raw copy of the parent, so invalidate
2181 	 * the destructor and do a plain sk_free() */
2182 	sk->sk_destruct = NULL;
2183 	bh_unlock_sock(sk);
2184 	sk_free(sk);
2185 }
2186 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2187 
2188 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2189 {
2190 	u32 max_segs = 1;
2191 
2192 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2193 	if (sk->sk_route_caps & NETIF_F_GSO)
2194 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2195 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2196 	if (sk_can_gso(sk)) {
2197 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2198 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2199 		} else {
2200 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2201 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2202 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2203 		}
2204 	}
2205 	sk->sk_gso_max_segs = max_segs;
2206 	sk_dst_set(sk, dst);
2207 }
2208 EXPORT_SYMBOL_GPL(sk_setup_caps);
2209 
2210 /*
2211  *	Simple resource managers for sockets.
2212  */
2213 
2214 
2215 /*
2216  * Write buffer destructor automatically called from kfree_skb.
2217  */
2218 void sock_wfree(struct sk_buff *skb)
2219 {
2220 	struct sock *sk = skb->sk;
2221 	unsigned int len = skb->truesize;
2222 
2223 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2224 		/*
2225 		 * Keep a reference on sk_wmem_alloc; it will be released
2226 		 * after the sk_write_space() call
2227 		 */
2228 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2229 		sk->sk_write_space(sk);
2230 		len = 1;
2231 	}
2232 	/*
2233 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2234 	 * could not do because of in-flight packets
2235 	 */
2236 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2237 		__sk_free(sk);
2238 }
2239 EXPORT_SYMBOL(sock_wfree);
2240 
2241 /* This variant of sock_wfree() is used by TCP,
2242  * since it sets SOCK_USE_WRITE_QUEUE.
2243  */
2244 void __sock_wfree(struct sk_buff *skb)
2245 {
2246 	struct sock *sk = skb->sk;
2247 
2248 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2249 		__sk_free(sk);
2250 }
2251 
2252 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2253 {
2254 	skb_orphan(skb);
2255 	skb->sk = sk;
2256 #ifdef CONFIG_INET
2257 	if (unlikely(!sk_fullsock(sk))) {
2258 		skb->destructor = sock_edemux;
2259 		sock_hold(sk);
2260 		return;
2261 	}
2262 #endif
2263 	skb->destructor = sock_wfree;
2264 	skb_set_hash_from_sk(skb, sk);
2265 	/*
2266 	 * We used to take a refcount on sk, but the following operation
2267 	 * is enough to guarantee sk_free() won't free this sock until
2268 	 * all in-flight packets are completed
2269 	 */
2270 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2271 }
2272 EXPORT_SYMBOL(skb_set_owner_w);
2273 
2274 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2275 {
2276 #ifdef CONFIG_TLS_DEVICE
2277 	/* Drivers depend on in-order delivery for crypto offload;
2278 	 * a partial orphan breaks the out-of-order-OK logic.
2279 	 */
2280 	if (skb->decrypted)
2281 		return false;
2282 #endif
2283 	return (skb->destructor == sock_wfree ||
2284 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2285 }
2286 
2287 /* This helper is used by netem, as it can hold packets in its
2288  * delay queue. We want to allow the owner socket to send more
2289  * packets, as if they were already TX completed by a typical driver.
2290  * But we also want to keep skb->sk set because some packet schedulers
2291  * rely on it (sch_fq for example).
2292  */
2293 void skb_orphan_partial(struct sk_buff *skb)
2294 {
2295 	if (skb_is_tcp_pure_ack(skb))
2296 		return;
2297 
2298 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2299 		return;
2300 
2301 	skb_orphan(skb);
2302 }
2303 EXPORT_SYMBOL(skb_orphan_partial);
2304 
2305 /*
2306  * Read buffer destructor automatically called from kfree_skb.
2307  */
2308 void sock_rfree(struct sk_buff *skb)
2309 {
2310 	struct sock *sk = skb->sk;
2311 	unsigned int len = skb->truesize;
2312 
2313 	atomic_sub(len, &sk->sk_rmem_alloc);
2314 	sk_mem_uncharge(sk, len);
2315 }
2316 EXPORT_SYMBOL(sock_rfree);
2317 
2318 /*
2319  * Buffer destructor for skbs that are not used directly in read or write
2320  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2321  */
2322 void sock_efree(struct sk_buff *skb)
2323 {
2324 	sock_put(skb->sk);
2325 }
2326 EXPORT_SYMBOL(sock_efree);
2327 
2328 /* Buffer destructor for prefetch/receive path where reference count may
2329  * not be held, e.g. for listen sockets.
2330  */
2331 #ifdef CONFIG_INET
2332 void sock_pfree(struct sk_buff *skb)
2333 {
2334 	if (sk_is_refcounted(skb->sk))
2335 		sock_gen_put(skb->sk);
2336 }
2337 EXPORT_SYMBOL(sock_pfree);
2338 #endif /* CONFIG_INET */
2339 
2340 kuid_t sock_i_uid(struct sock *sk)
2341 {
2342 	kuid_t uid;
2343 
2344 	read_lock_bh(&sk->sk_callback_lock);
2345 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2346 	read_unlock_bh(&sk->sk_callback_lock);
2347 	return uid;
2348 }
2349 EXPORT_SYMBOL(sock_i_uid);
2350 
2351 unsigned long __sock_i_ino(struct sock *sk)
2352 {
2353 	unsigned long ino;
2354 
2355 	read_lock(&sk->sk_callback_lock);
2356 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2357 	read_unlock(&sk->sk_callback_lock);
2358 	return ino;
2359 }
2360 EXPORT_SYMBOL(__sock_i_ino);
2361 
2362 unsigned long sock_i_ino(struct sock *sk)
2363 {
2364 	unsigned long ino;
2365 
2366 	local_bh_disable();
2367 	ino = __sock_i_ino(sk);
2368 	local_bh_enable();
2369 	return ino;
2370 }
2371 EXPORT_SYMBOL(sock_i_ino);
2372 
2373 /*
2374  * Allocate a skb from the socket's send buffer.
2375  */
2376 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2377 			     gfp_t priority)
2378 {
2379 	if (force ||
2380 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2381 		struct sk_buff *skb = alloc_skb(size, priority);
2382 
2383 		if (skb) {
2384 			skb_set_owner_w(skb, sk);
2385 			return skb;
2386 		}
2387 	}
2388 	return NULL;
2389 }
2390 EXPORT_SYMBOL(sock_wmalloc);
2391 
2392 static void sock_ofree(struct sk_buff *skb)
2393 {
2394 	struct sock *sk = skb->sk;
2395 
2396 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2397 }
2398 
2399 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2400 			     gfp_t priority)
2401 {
2402 	struct sk_buff *skb;
2403 
2404 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2405 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2406 	    READ_ONCE(sysctl_optmem_max))
2407 		return NULL;
2408 
2409 	skb = alloc_skb(size, priority);
2410 	if (!skb)
2411 		return NULL;
2412 
2413 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2414 	skb->sk = sk;
2415 	skb->destructor = sock_ofree;
2416 	return skb;
2417 }
2418 
2419 /*
2420  * Allocate a memory block from the socket's option memory buffer.
2421  */
2422 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2423 {
2424 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2425 
2426 	if ((unsigned int)size <= optmem_max &&
2427 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2428 		void *mem;
2429 		/* First do the add, to avoid the race if kmalloc
2430 		 * might sleep.
2431 		 */
2432 		atomic_add(size, &sk->sk_omem_alloc);
2433 		mem = kmalloc(size, priority);
2434 		if (mem)
2435 			return mem;
2436 		atomic_sub(size, &sk->sk_omem_alloc);
2437 	}
2438 	return NULL;
2439 }
2440 EXPORT_SYMBOL(sock_kmalloc);
2441 
2442 /* Free an option memory block. Note, we actually want the inline
2443  * here as this allows gcc to detect the nullify and fold away the
2444  * condition entirely.
2445  */
2446 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2447 				  const bool nullify)
2448 {
2449 	if (WARN_ON_ONCE(!mem))
2450 		return;
2451 	if (nullify)
2452 		kfree_sensitive(mem);
2453 	else
2454 		kfree(mem);
2455 	atomic_sub(size, &sk->sk_omem_alloc);
2456 }
2457 
2458 void sock_kfree_s(struct sock *sk, void *mem, int size)
2459 {
2460 	__sock_kfree_s(sk, mem, size, false);
2461 }
2462 EXPORT_SYMBOL(sock_kfree_s);
2463 
2464 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2465 {
2466 	__sock_kfree_s(sk, mem, size, true);
2467 }
2468 EXPORT_SYMBOL(sock_kzfree_s);
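
/* Illustrative sketch (not part of the original file): option memory is
 * charged against sk_omem_alloc and bounded by sysctl_optmem_max, so a
 * sock_kmalloc() must be paired with sock_kfree_s() (or sock_kzfree_s() for
 * sensitive data) using the same size. "struct my_opt" is a hypothetical
 * structure used only for illustration.
 *
 *	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	// ... use opt ...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */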
2469 
2470 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2471    I think these locks should be removed for datagram sockets.
2472  */
2473 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2474 {
2475 	DEFINE_WAIT(wait);
2476 
2477 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2478 	for (;;) {
2479 		if (!timeo)
2480 			break;
2481 		if (signal_pending(current))
2482 			break;
2483 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2484 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2485 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2486 			break;
2487 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2488 			break;
2489 		if (READ_ONCE(sk->sk_err))
2490 			break;
2491 		timeo = schedule_timeout(timeo);
2492 	}
2493 	finish_wait(sk_sleep(sk), &wait);
2494 	return timeo;
2495 }
2496 
2497 
2498 /*
2499  *	Generic send/receive buffer handlers
2500  */
2501 
2502 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2503 				     unsigned long data_len, int noblock,
2504 				     int *errcode, int max_page_order)
2505 {
2506 	struct sk_buff *skb;
2507 	long timeo;
2508 	int err;
2509 
2510 	timeo = sock_sndtimeo(sk, noblock);
2511 	for (;;) {
2512 		err = sock_error(sk);
2513 		if (err != 0)
2514 			goto failure;
2515 
2516 		err = -EPIPE;
2517 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2518 			goto failure;
2519 
2520 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2521 			break;
2522 
2523 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2524 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2525 		err = -EAGAIN;
2526 		if (!timeo)
2527 			goto failure;
2528 		if (signal_pending(current))
2529 			goto interrupted;
2530 		timeo = sock_wait_for_wmem(sk, timeo);
2531 	}
2532 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2533 				   errcode, sk->sk_allocation);
2534 	if (skb)
2535 		skb_set_owner_w(skb, sk);
2536 	return skb;
2537 
2538 interrupted:
2539 	err = sock_intr_errno(timeo);
2540 failure:
2541 	*errcode = err;
2542 	return NULL;
2543 }
2544 EXPORT_SYMBOL(sock_alloc_send_pskb);
2545 
2546 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2547 				    int noblock, int *errcode)
2548 {
2549 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2550 }
2551 EXPORT_SYMBOL(sock_alloc_send_skb);
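
/* Illustrative sketch (not part of the original file): a datagram sendmsg()
 * path would typically use sock_alloc_send_skb() so that sndbuf limits,
 * SOCK_NOSPACE signalling and the send timeout are all honoured. Header
 * sizing and error handling are abbreviated; "hlen" is a hypothetical
 * headroom value.
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 */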
2552 
2553 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2554 		     struct sockcm_cookie *sockc)
2555 {
2556 	u32 tsflags;
2557 
2558 	switch (cmsg->cmsg_type) {
2559 	case SO_MARK:
2560 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2561 			return -EPERM;
2562 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2563 			return -EINVAL;
2564 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2565 		break;
2566 	case SO_TIMESTAMPING_OLD:
2567 	case SO_TIMESTAMPING_NEW:
2568 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2569 			return -EINVAL;
2570 
2571 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2572 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2573 			return -EINVAL;
2574 
2575 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2576 		sockc->tsflags |= tsflags;
2577 		break;
2578 	case SCM_TXTIME:
2579 		if (!sock_flag(sk, SOCK_TXTIME))
2580 			return -EINVAL;
2581 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2582 			return -EINVAL;
2583 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2584 		break;
2585 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2586 	case SCM_RIGHTS:
2587 	case SCM_CREDENTIALS:
2588 		break;
2589 	default:
2590 		return -EINVAL;
2591 	}
2592 	return 0;
2593 }
2594 EXPORT_SYMBOL(__sock_cmsg_send);
2595 
2596 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2597 		   struct sockcm_cookie *sockc)
2598 {
2599 	struct cmsghdr *cmsg;
2600 	int ret;
2601 
2602 	for_each_cmsghdr(cmsg, msg) {
2603 		if (!CMSG_OK(msg, cmsg))
2604 			return -EINVAL;
2605 		if (cmsg->cmsg_level != SOL_SOCKET)
2606 			continue;
2607 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2608 		if (ret)
2609 			return ret;
2610 	}
2611 	return 0;
2612 }
2613 EXPORT_SYMBOL(sock_cmsg_send);
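
/* Illustrative sketch (not part of the original file): a protocol sendmsg()
 * implementation would typically seed a sockcm_cookie from the socket
 * defaults and then let sock_cmsg_send() override it from SOL_SOCKET control
 * messages (SO_MARK, SO_TIMESTAMPING, SCM_TXTIME).
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */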
2614 
2615 static void sk_enter_memory_pressure(struct sock *sk)
2616 {
2617 	if (!sk->sk_prot->enter_memory_pressure)
2618 		return;
2619 
2620 	sk->sk_prot->enter_memory_pressure(sk);
2621 }
2622 
2623 static void sk_leave_memory_pressure(struct sock *sk)
2624 {
2625 	if (sk->sk_prot->leave_memory_pressure) {
2626 		sk->sk_prot->leave_memory_pressure(sk);
2627 	} else {
2628 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2629 
2630 		if (memory_pressure && READ_ONCE(*memory_pressure))
2631 			WRITE_ONCE(*memory_pressure, 0);
2632 	}
2633 }
2634 
2635 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2636 
2637 /**
2638  * skb_page_frag_refill - check that a page_frag contains enough room
2639  * @sz: minimum size of the fragment we want to get
2640  * @pfrag: pointer to page_frag
2641  * @gfp: priority for memory allocation
2642  *
2643  * Note: While this allocator tries to use high order pages, there is
2644  * no guarantee that allocations succeed. Therefore, @sz MUST be
2645  * less than or equal to PAGE_SIZE.
2646  */
2647 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2648 {
2649 	if (pfrag->page) {
2650 		if (page_ref_count(pfrag->page) == 1) {
2651 			pfrag->offset = 0;
2652 			return true;
2653 		}
2654 		if (pfrag->offset + sz <= pfrag->size)
2655 			return true;
2656 		put_page(pfrag->page);
2657 	}
2658 
2659 	pfrag->offset = 0;
2660 	if (SKB_FRAG_PAGE_ORDER &&
2661 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2662 		/* Avoid direct reclaim but allow kswapd to wake */
2663 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2664 					  __GFP_COMP | __GFP_NOWARN |
2665 					  __GFP_NORETRY,
2666 					  SKB_FRAG_PAGE_ORDER);
2667 		if (likely(pfrag->page)) {
2668 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2669 			return true;
2670 		}
2671 	}
2672 	pfrag->page = alloc_page(gfp);
2673 	if (likely(pfrag->page)) {
2674 		pfrag->size = PAGE_SIZE;
2675 		return true;
2676 	}
2677 	return false;
2678 }
2679 EXPORT_SYMBOL(skb_page_frag_refill);
2680 
2681 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2682 {
2683 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2684 		return true;
2685 
2686 	sk_enter_memory_pressure(sk);
2687 	sk_stream_moderate_sndbuf(sk);
2688 	return false;
2689 }
2690 EXPORT_SYMBOL(sk_page_frag_refill);
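
/* Illustrative sketch (not part of the original file): stream senders that
 * build paged skbs refill the per-socket page frag and copy user data into
 * it; skb frag setup and the wait/fault paths are omitted here.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy,
 *				&msg->msg_iter) != copy)
 *		goto fault;
 *	pfrag->offset += copy;
 */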
2691 
2692 void __lock_sock(struct sock *sk)
2693 	__releases(&sk->sk_lock.slock)
2694 	__acquires(&sk->sk_lock.slock)
2695 {
2696 	DEFINE_WAIT(wait);
2697 
2698 	for (;;) {
2699 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2700 					TASK_UNINTERRUPTIBLE);
2701 		spin_unlock_bh(&sk->sk_lock.slock);
2702 		schedule();
2703 		spin_lock_bh(&sk->sk_lock.slock);
2704 		if (!sock_owned_by_user(sk))
2705 			break;
2706 	}
2707 	finish_wait(&sk->sk_lock.wq, &wait);
2708 }
2709 
2710 void __release_sock(struct sock *sk)
2711 	__releases(&sk->sk_lock.slock)
2712 	__acquires(&sk->sk_lock.slock)
2713 {
2714 	struct sk_buff *skb, *next;
2715 
2716 	while ((skb = sk->sk_backlog.head) != NULL) {
2717 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2718 
2719 		spin_unlock_bh(&sk->sk_lock.slock);
2720 
2721 		do {
2722 			next = skb->next;
2723 			prefetch(next);
2724 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2725 			skb_mark_not_on_list(skb);
2726 			sk_backlog_rcv(sk, skb);
2727 
2728 			cond_resched();
2729 
2730 			skb = next;
2731 		} while (skb != NULL);
2732 
2733 		spin_lock_bh(&sk->sk_lock.slock);
2734 	}
2735 
2736 	/*
2737 	 * Doing the zeroing here guarantees we cannot loop forever
2738 	 * while a wild producer attempts to flood us.
2739 	 */
2740 	sk->sk_backlog.len = 0;
2741 }
2742 
2743 void __sk_flush_backlog(struct sock *sk)
2744 {
2745 	spin_lock_bh(&sk->sk_lock.slock);
2746 	__release_sock(sk);
2747 	spin_unlock_bh(&sk->sk_lock.slock);
2748 }
2749 
2750 /**
2751  * sk_wait_data - wait for data to arrive at sk_receive_queue
2752  * @sk:    sock to wait on
2753  * @timeo: for how long
2754  * @skb:   last skb seen on sk_receive_queue
2755  *
2756  * Now socket state including sk->sk_err is changed only under lock,
2757  * hence we may omit checks after joining the wait queue.
2758  * We check the receive queue before schedule() only as an optimization;
2759  * it is very likely that release_sock() added new data.
2760  */
2761 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2762 {
2763 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2764 	int rc;
2765 
2766 	add_wait_queue(sk_sleep(sk), &wait);
2767 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2768 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2769 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2770 	remove_wait_queue(sk_sleep(sk), &wait);
2771 	return rc;
2772 }
2773 EXPORT_SYMBOL(sk_wait_data);
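
/* Illustrative sketch (not part of the original file): a blocking recvmsg()
 * typically loops under lock_sock(), sleeping in sk_wait_data() until the
 * receive queue changes or the timeout/signal fires. Error handling is
 * abbreviated.
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 *	release_sock(sk);
 */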
2774 
2775 /**
2776  *	__sk_mem_raise_allocated - increase memory_allocated
2777  *	@sk: socket
2778  *	@size: memory size to allocate
2779  *	@amt: pages to allocate
2780  *	@kind: allocation type
2781  *
2782  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2783  */
2784 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2785 {
2786 	struct proto *prot = sk->sk_prot;
2787 	long allocated = sk_memory_allocated_add(sk, amt);
2788 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2789 	bool charged = true;
2790 
2791 	if (memcg_charge &&
2792 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2793 						gfp_memcg_charge())))
2794 		goto suppress_allocation;
2795 
2796 	/* Under limit. */
2797 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2798 		sk_leave_memory_pressure(sk);
2799 		return 1;
2800 	}
2801 
2802 	/* Under pressure. */
2803 	if (allocated > sk_prot_mem_limits(sk, 1))
2804 		sk_enter_memory_pressure(sk);
2805 
2806 	/* Over hard limit. */
2807 	if (allocated > sk_prot_mem_limits(sk, 2))
2808 		goto suppress_allocation;
2809 
2810 	/* guarantee minimum buffer size under pressure */
2811 	if (kind == SK_MEM_RECV) {
2812 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2813 			return 1;
2814 
2815 	} else { /* SK_MEM_SEND */
2816 		int wmem0 = sk_get_wmem0(sk, prot);
2817 
2818 		if (sk->sk_type == SOCK_STREAM) {
2819 			if (sk->sk_wmem_queued < wmem0)
2820 				return 1;
2821 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2822 			return 1;
2823 		}
2824 	}
2825 
2826 	if (sk_has_memory_pressure(sk)) {
2827 		u64 alloc;
2828 
2829 		if (!sk_under_memory_pressure(sk))
2830 			return 1;
2831 		alloc = sk_sockets_allocated_read_positive(sk);
2832 		if (sk_prot_mem_limits(sk, 2) > alloc *
2833 		    sk_mem_pages(sk->sk_wmem_queued +
2834 				 atomic_read(&sk->sk_rmem_alloc) +
2835 				 sk->sk_forward_alloc))
2836 			return 1;
2837 	}
2838 
2839 suppress_allocation:
2840 
2841 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2842 		sk_stream_moderate_sndbuf(sk);
2843 
2844 		/* Fail only if socket is _under_ its sndbuf.
2845 		 * In this case we cannot block, so we have to fail.
2846 		 */
2847 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2848 			/* Force charge with __GFP_NOFAIL */
2849 			if (memcg_charge && !charged) {
2850 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2851 					gfp_memcg_charge() | __GFP_NOFAIL);
2852 			}
2853 			return 1;
2854 		}
2855 	}
2856 
2857 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2858 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2859 
2860 	sk_memory_allocated_sub(sk, amt);
2861 
2862 	if (memcg_charge && charged)
2863 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2864 
2865 	return 0;
2866 }
2867 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2868 
2869 /**
2870  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2871  *	@sk: socket
2872  *	@size: memory size to allocate
2873  *	@kind: allocation type
2874  *
2875  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2876  *	rmem allocation. This function assumes that protocols which have
2877  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2878  */
2879 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2880 {
2881 	int ret, amt = sk_mem_pages(size);
2882 
2883 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2884 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2885 	if (!ret)
2886 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2887 	return ret;
2888 }
2889 EXPORT_SYMBOL(__sk_mem_schedule);
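
/* Illustrative sketch (not part of the original file): receive paths usually
 * reach __sk_mem_schedule() through the sk_rmem_schedule() wrapper before
 * charging an skb to the socket, e.g.:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;		// global/memcg limits hit
 *	skb_set_owner_r(skb, sk);		// charge sk_rmem_alloc/forward_alloc
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */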
2890 
2891 /**
2892  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2893  *	@sk: socket
2894  *	@amount: number of quanta
2895  *
2896  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2897  */
2898 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2899 {
2900 	sk_memory_allocated_sub(sk, amount);
2901 
2902 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2903 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2904 
2905 	if (sk_under_global_memory_pressure(sk) &&
2906 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2907 		sk_leave_memory_pressure(sk);
2908 }
2909 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2910 
2911 /**
2912  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2913  *	@sk: socket
2914  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2915  */
2916 void __sk_mem_reclaim(struct sock *sk, int amount)
2917 {
2918 	amount >>= SK_MEM_QUANTUM_SHIFT;
2919 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2920 	__sk_mem_reduce_allocated(sk, amount);
2921 }
2922 EXPORT_SYMBOL(__sk_mem_reclaim);
2923 
2924 int sk_set_peek_off(struct sock *sk, int val)
2925 {
2926 	WRITE_ONCE(sk->sk_peek_off, val);
2927 	return 0;
2928 }
2929 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2930 
2931 /*
2932  * Set of default routines for initialising struct proto_ops when
2933  * the protocol does not support a particular function. In certain
2934  * cases where it makes no sense for a protocol to have a "do nothing"
2935  * function, some default processing is provided.
2936  */
2937 
2938 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2939 {
2940 	return -EOPNOTSUPP;
2941 }
2942 EXPORT_SYMBOL(sock_no_bind);
2943 
2944 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2945 		    int len, int flags)
2946 {
2947 	return -EOPNOTSUPP;
2948 }
2949 EXPORT_SYMBOL(sock_no_connect);
2950 
2951 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2952 {
2953 	return -EOPNOTSUPP;
2954 }
2955 EXPORT_SYMBOL(sock_no_socketpair);
2956 
2957 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2958 		   bool kern)
2959 {
2960 	return -EOPNOTSUPP;
2961 }
2962 EXPORT_SYMBOL(sock_no_accept);
2963 
2964 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2965 		    int peer)
2966 {
2967 	return -EOPNOTSUPP;
2968 }
2969 EXPORT_SYMBOL(sock_no_getname);
2970 
2971 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2972 {
2973 	return -EOPNOTSUPP;
2974 }
2975 EXPORT_SYMBOL(sock_no_ioctl);
2976 
2977 int sock_no_listen(struct socket *sock, int backlog)
2978 {
2979 	return -EOPNOTSUPP;
2980 }
2981 EXPORT_SYMBOL(sock_no_listen);
2982 
2983 int sock_no_shutdown(struct socket *sock, int how)
2984 {
2985 	return -EOPNOTSUPP;
2986 }
2987 EXPORT_SYMBOL(sock_no_shutdown);
2988 
2989 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2990 {
2991 	return -EOPNOTSUPP;
2992 }
2993 EXPORT_SYMBOL(sock_no_sendmsg);
2994 
2995 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2996 {
2997 	return -EOPNOTSUPP;
2998 }
2999 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3000 
3001 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3002 		    int flags)
3003 {
3004 	return -EOPNOTSUPP;
3005 }
3006 EXPORT_SYMBOL(sock_no_recvmsg);
3007 
3008 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3009 {
3010 	/* Mirror missing mmap method error code */
3011 	return -ENODEV;
3012 }
3013 EXPORT_SYMBOL(sock_no_mmap);
3014 
3015 /*
3016  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3017  * various sock-based usage counts.
3018  */
3019 void __receive_sock(struct file *file)
3020 {
3021 	struct socket *sock;
3022 
3023 	sock = sock_from_file(file);
3024 	if (sock) {
3025 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3026 		sock_update_classid(&sock->sk->sk_cgrp_data);
3027 	}
3028 }
3029 
3030 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3031 {
3032 	ssize_t res;
3033 	struct msghdr msg = {.msg_flags = flags};
3034 	struct kvec iov;
3035 	char *kaddr = kmap(page);
3036 	iov.iov_base = kaddr + offset;
3037 	iov.iov_len = size;
3038 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3039 	kunmap(page);
3040 	return res;
3041 }
3042 EXPORT_SYMBOL(sock_no_sendpage);
3043 
3044 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3045 				int offset, size_t size, int flags)
3046 {
3047 	ssize_t res;
3048 	struct msghdr msg = {.msg_flags = flags};
3049 	struct kvec iov;
3050 	char *kaddr = kmap(page);
3051 
3052 	iov.iov_base = kaddr + offset;
3053 	iov.iov_len = size;
3054 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3055 	kunmap(page);
3056 	return res;
3057 }
3058 EXPORT_SYMBOL(sock_no_sendpage_locked);
3059 
3060 /*
3061  *	Default Socket Callbacks
3062  */
3063 
3064 static void sock_def_wakeup(struct sock *sk)
3065 {
3066 	struct socket_wq *wq;
3067 
3068 	rcu_read_lock();
3069 	wq = rcu_dereference(sk->sk_wq);
3070 	if (skwq_has_sleeper(wq))
3071 		wake_up_interruptible_all(&wq->wait);
3072 	rcu_read_unlock();
3073 }
3074 
3075 static void sock_def_error_report(struct sock *sk)
3076 {
3077 	struct socket_wq *wq;
3078 
3079 	rcu_read_lock();
3080 	wq = rcu_dereference(sk->sk_wq);
3081 	if (skwq_has_sleeper(wq))
3082 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3083 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3084 	rcu_read_unlock();
3085 }
3086 
3087 void sock_def_readable(struct sock *sk)
3088 {
3089 	struct socket_wq *wq;
3090 
3091 	rcu_read_lock();
3092 	wq = rcu_dereference(sk->sk_wq);
3093 
3094 	if (skwq_has_sleeper(wq)) {
3095 		int done = 0;
3096 
3097 		trace_android_vh_do_wake_up_sync(&wq->wait, &done);
3098 		if (done)
3099 			goto out;
3100 
3101 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3102 						EPOLLRDNORM | EPOLLRDBAND);
3103 	}
3104 
3105 out:
3106 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3107 	rcu_read_unlock();
3108 }
3109 
3110 static void sock_def_write_space(struct sock *sk)
3111 {
3112 	struct socket_wq *wq;
3113 
3114 	rcu_read_lock();
3115 
3116 	/* Do not wake up a writer until he can make "significant"
3117 	 * progress.  --DaveM
3118 	 */
3119 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3120 		wq = rcu_dereference(sk->sk_wq);
3121 		if (skwq_has_sleeper(wq))
3122 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3123 						EPOLLWRNORM | EPOLLWRBAND);
3124 
3125 		/* Should agree with poll, otherwise some programs break */
3126 		if (sock_writeable(sk))
3127 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3128 	}
3129 
3130 	rcu_read_unlock();
3131 }
3132 
3133 static void sock_def_destruct(struct sock *sk)
3134 {
3135 }
3136 
3137 void sk_send_sigurg(struct sock *sk)
3138 {
3139 	if (sk->sk_socket && sk->sk_socket->file)
3140 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3141 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3142 }
3143 EXPORT_SYMBOL(sk_send_sigurg);
3144 
3145 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3146 		    unsigned long expires)
3147 {
3148 	if (!mod_timer(timer, expires))
3149 		sock_hold(sk);
3150 }
3151 EXPORT_SYMBOL(sk_reset_timer);
3152 
3153 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3154 {
3155 	if (del_timer(timer))
3156 		__sock_put(sk);
3157 }
3158 EXPORT_SYMBOL(sk_stop_timer);
3159 
3160 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3161 {
3162 	if (del_timer_sync(timer))
3163 		__sock_put(sk);
3164 }
3165 EXPORT_SYMBOL(sk_stop_timer_sync);
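
/* Illustrative sketch (not part of the original file): these helpers keep the
 * sock refcount in sync with a pending timer. sk_reset_timer() takes a
 * reference only when the timer was not already pending, and sk_stop_timer()
 * drops one only if the timer was actually deleted. A typical
 * retransmit-style user, assuming the protocol installed its own callback on
 * sk->sk_timer:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);	// arm, hold sk
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);			// disarm, put sk
 */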
3166 
3167 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3168 {
3169 	sk_init_common(sk);
3170 	sk->sk_send_head	=	NULL;
3171 
3172 	timer_setup(&sk->sk_timer, NULL, 0);
3173 
3174 	sk->sk_allocation	=	GFP_KERNEL;
3175 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3176 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3177 	sk->sk_state		=	TCP_CLOSE;
3178 	sk_set_socket(sk, sock);
3179 
3180 	sock_set_flag(sk, SOCK_ZAPPED);
3181 
3182 	if (sock) {
3183 		sk->sk_type	=	sock->type;
3184 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3185 		sock->sk	=	sk;
3186 	} else {
3187 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3188 	}
3189 	sk->sk_uid	=	uid;
3190 
3191 	rwlock_init(&sk->sk_callback_lock);
3192 	if (sk->sk_kern_sock)
3193 		lockdep_set_class_and_name(
3194 			&sk->sk_callback_lock,
3195 			af_kern_callback_keys + sk->sk_family,
3196 			af_family_kern_clock_key_strings[sk->sk_family]);
3197 	else
3198 		lockdep_set_class_and_name(
3199 			&sk->sk_callback_lock,
3200 			af_callback_keys + sk->sk_family,
3201 			af_family_clock_key_strings[sk->sk_family]);
3202 
3203 	sk->sk_state_change	=	sock_def_wakeup;
3204 	sk->sk_data_ready	=	sock_def_readable;
3205 	sk->sk_write_space	=	sock_def_write_space;
3206 	sk->sk_error_report	=	sock_def_error_report;
3207 	sk->sk_destruct		=	sock_def_destruct;
3208 
3209 	sk->sk_frag.page	=	NULL;
3210 	sk->sk_frag.offset	=	0;
3211 	sk->sk_peek_off		=	-1;
3212 
3213 	sk->sk_peer_pid 	=	NULL;
3214 	sk->sk_peer_cred	=	NULL;
3215 	spin_lock_init(&sk->sk_peer_lock);
3216 
3217 	sk->sk_write_pending	=	0;
3218 	sk->sk_rcvlowat		=	1;
3219 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3220 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3221 
3222 	sk->sk_stamp = SK_DEFAULT_STAMP;
3223 #if BITS_PER_LONG==32
3224 	seqlock_init(&sk->sk_stamp_seq);
3225 #endif
3226 	atomic_set(&sk->sk_zckey, 0);
3227 
3228 #ifdef CONFIG_NET_RX_BUSY_POLL
3229 	sk->sk_napi_id		=	0;
3230 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3231 #endif
3232 
3233 	sk->sk_max_pacing_rate = ~0UL;
3234 	sk->sk_pacing_rate = ~0UL;
3235 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3236 	sk->sk_incoming_cpu = -1;
3237 
3238 	sk_rx_queue_clear(sk);
3239 	/*
3240 	 * Before updating sk_refcnt, we must commit prior changes to memory
3241 	 * (Documentation/RCU/rculist_nulls.rst for details)
3242 	 */
3243 	smp_wmb();
3244 	refcount_set(&sk->sk_refcnt, 1);
3245 	atomic_set(&sk->sk_drops, 0);
3246 }
3247 EXPORT_SYMBOL(sock_init_data_uid);
3248 
3249 void sock_init_data(struct socket *sock, struct sock *sk)
3250 {
3251 	kuid_t uid = sock ?
3252 		SOCK_INODE(sock)->i_uid :
3253 		make_kuid(sock_net(sk)->user_ns, 0);
3254 
3255 	sock_init_data_uid(sock, sk, uid);
3256 }
3257 EXPORT_SYMBOL(sock_init_data);
3258 
3259 void lock_sock_nested(struct sock *sk, int subclass)
3260 {
3261 	/* The sk_lock has mutex_lock() semantics here. */
3262 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3263 
3264 	might_sleep();
3265 	spin_lock_bh(&sk->sk_lock.slock);
3266 	if (sk->sk_lock.owned)
3267 		__lock_sock(sk);
3268 	sk->sk_lock.owned = 1;
3269 	spin_unlock_bh(&sk->sk_lock.slock);
3270 }
3271 EXPORT_SYMBOL(lock_sock_nested);
3272 
3273 void release_sock(struct sock *sk)
3274 {
3275 	spin_lock_bh(&sk->sk_lock.slock);
3276 	if (sk->sk_backlog.tail)
3277 		__release_sock(sk);
3278 
3279 	/* Warning: release_cb() might need to release sk ownership,
3280 	 * i.e. call sock_release_ownership(sk) before us.
3281 	 */
3282 	if (sk->sk_prot->release_cb)
3283 		sk->sk_prot->release_cb(sk);
3284 
3285 	sock_release_ownership(sk);
3286 	if (waitqueue_active(&sk->sk_lock.wq))
3287 		wake_up(&sk->sk_lock.wq);
3288 	spin_unlock_bh(&sk->sk_lock.slock);
3289 }
3290 EXPORT_SYMBOL(release_sock);
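
/* Illustrative sketch (not part of the original file): process context takes
 * ownership of the socket with lock_sock()/release_sock(); anything a softirq
 * queued to the backlog while the lock was owned is replayed by
 * __release_sock() on release.
 *
 *	lock_sock(sk);			// may sleep; marks sk owned by user
 *	// ... modify socket state ...
 *	release_sock(sk);		// runs backlog, wakes other lockers
 */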
3291 
3292 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3293 {
3294 	might_sleep();
3295 	spin_lock_bh(&sk->sk_lock.slock);
3296 
3297 	if (!sk->sk_lock.owned) {
3298 		/*
3299 		 * Fast path return with bottom halves disabled and
3300 		 * sock::sk_lock.slock held.
3301 		 *
3302 		 * The 'mutex' is not contended and holding
3303 		 * sock::sk_lock.slock prevents all other lockers from
3304 		 * proceeding, so the corresponding unlock_sock_fast() can
3305 		 * avoid the slow path of release_sock() completely and
3306 		 * just release slock.
3307 		 *
3308 		 * From a semantic POV this is equivalent to 'acquiring'
3309 		 * the 'mutex', hence the corresponding lockdep
3310 		 * mutex_release() has to happen in the fast path of
3311 		 * unlock_sock_fast().
3312 		 */
3313 		return false;
3314 	}
3315 
3316 	__lock_sock(sk);
3317 	sk->sk_lock.owned = 1;
3318 	__acquire(&sk->sk_lock.slock);
3319 	spin_unlock_bh(&sk->sk_lock.slock);
3320 	return true;
3321 }
3322 EXPORT_SYMBOL(__lock_sock_fast);
3323 
3324 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3325 		   bool timeval, bool time32)
3326 {
3327 	struct sock *sk = sock->sk;
3328 	struct timespec64 ts;
3329 
3330 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3331 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3332 	if (ts.tv_sec == -1)
3333 		return -ENOENT;
3334 	if (ts.tv_sec == 0) {
3335 		ktime_t kt = ktime_get_real();
3336 		sock_write_timestamp(sk, kt);
3337 		ts = ktime_to_timespec64(kt);
3338 	}
3339 
3340 	if (timeval)
3341 		ts.tv_nsec /= 1000;
3342 
3343 #ifdef CONFIG_COMPAT_32BIT_TIME
3344 	if (time32)
3345 		return put_old_timespec32(&ts, userstamp);
3346 #endif
3347 #ifdef CONFIG_SPARC64
3348 	/* beware of padding in sparc64 timeval */
3349 	if (timeval && !in_compat_syscall()) {
3350 		struct __kernel_old_timeval __user tv = {
3351 			.tv_sec = ts.tv_sec,
3352 			.tv_usec = ts.tv_nsec,
3353 		};
3354 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3355 			return -EFAULT;
3356 		return 0;
3357 	}
3358 #endif
3359 	return put_timespec64(&ts, userstamp);
3360 }
3361 EXPORT_SYMBOL(sock_gettstamp);
3362 
3363 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3364 {
3365 	if (!sock_flag(sk, flag)) {
3366 		unsigned long previous_flags = sk->sk_flags;
3367 
3368 		sock_set_flag(sk, flag);
3369 		/*
3370 		 * we just set one of the two flags which require net
3371 		 * time stamping, but time stamping might have been on
3372 		 * already because of the other one
3373 		 */
3374 		if (sock_needs_netstamp(sk) &&
3375 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3376 			net_enable_timestamp();
3377 	}
3378 }
3379 
3380 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3381 		       int level, int type)
3382 {
3383 	struct sock_exterr_skb *serr;
3384 	struct sk_buff *skb;
3385 	int copied, err;
3386 
3387 	err = -EAGAIN;
3388 	skb = sock_dequeue_err_skb(sk);
3389 	if (skb == NULL)
3390 		goto out;
3391 
3392 	copied = skb->len;
3393 	if (copied > len) {
3394 		msg->msg_flags |= MSG_TRUNC;
3395 		copied = len;
3396 	}
3397 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3398 	if (err)
3399 		goto out_free_skb;
3400 
3401 	sock_recv_timestamp(msg, sk, skb);
3402 
3403 	serr = SKB_EXT_ERR(skb);
3404 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3405 
3406 	msg->msg_flags |= MSG_ERRQUEUE;
3407 	err = copied;
3408 
3409 out_free_skb:
3410 	kfree_skb(skb);
3411 out:
3412 	return err;
3413 }
3414 EXPORT_SYMBOL(sock_recv_errqueue);
3415 
3416 /*
3417  *	Get a socket option on a socket.
3418  *
3419  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3420  *	asynchronous errors should be reported by getsockopt. We assume
3421  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3422  */
3423 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3424 			   char __user *optval, int __user *optlen)
3425 {
3426 	struct sock *sk = sock->sk;
3427 
3428 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3429 }
3430 EXPORT_SYMBOL(sock_common_getsockopt);
3431 
3432 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3433 			int flags)
3434 {
3435 	struct sock *sk = sock->sk;
3436 	int addr_len = 0;
3437 	int err;
3438 
3439 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3440 				   flags & ~MSG_DONTWAIT, &addr_len);
3441 	if (err >= 0)
3442 		msg->msg_namelen = addr_len;
3443 	return err;
3444 }
3445 EXPORT_SYMBOL(sock_common_recvmsg);
3446 
3447 /*
3448  *	Set socket options on an inet socket.
3449  */
3450 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3451 			   sockptr_t optval, unsigned int optlen)
3452 {
3453 	struct sock *sk = sock->sk;
3454 
3455 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3456 }
3457 EXPORT_SYMBOL(sock_common_setsockopt);
3458 
3459 void sk_common_release(struct sock *sk)
3460 {
3461 	if (sk->sk_prot->destroy)
3462 		sk->sk_prot->destroy(sk);
3463 
3464 	/*
3465 	 * Observation: when sk_common_release is called, processes have
3466 	 * no access to the socket. But the network stack still does.
3467 	 * Step one, detach it from networking:
3468 	 *
3469 	 * A. Remove from hash tables.
3470 	 */
3471 
3472 	sk->sk_prot->unhash(sk);
3473 
3474 	/*
3475 	 * At this point the socket cannot receive new packets, but it is possible
3476 	 * that some packets are in flight because some CPU runs the receiver and
3477 	 * did a hash table lookup before we unhashed the socket. They will reach
3478 	 * the receive queue and be purged by the socket destructor.
3479 	 *
3480 	 * Also we still have packets pending on the receive queue and probably
3481 	 * our own packets waiting in device queues. sock_destroy will drain the
3482 	 * receive queue, but transmitted packets will delay socket destruction
3483 	 * until the last reference is released.
3484 	 */
3485 
3486 	sock_orphan(sk);
3487 
3488 	xfrm_sk_free_policy(sk);
3489 
3490 	sk_refcnt_debug_release(sk);
3491 
3492 	sock_put(sk);
3493 }
3494 EXPORT_SYMBOL(sk_common_release);
3495 
3496 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3497 {
3498 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3499 
3500 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3501 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3502 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3503 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3504 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3505 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3506 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3507 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3508 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3509 }
3510 
3511 #ifdef CONFIG_PROC_FS
3512 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3513 struct prot_inuse {
3514 	int val[PROTO_INUSE_NR];
3515 };
3516 
3517 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3518 
3519 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3520 {
3521 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3522 }
3523 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3524 
3525 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3526 {
3527 	int cpu, idx = prot->inuse_idx;
3528 	int res = 0;
3529 
3530 	for_each_possible_cpu(cpu)
3531 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3532 
3533 	return res >= 0 ? res : 0;
3534 }
3535 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3536 
3537 static void sock_inuse_add(struct net *net, int val)
3538 {
3539 	this_cpu_add(*net->core.sock_inuse, val);
3540 }
3541 
3542 int sock_inuse_get(struct net *net)
3543 {
3544 	int cpu, res = 0;
3545 
3546 	for_each_possible_cpu(cpu)
3547 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3548 
3549 	return res;
3550 }
3551 
3552 EXPORT_SYMBOL_GPL(sock_inuse_get);
3553 
3554 static int __net_init sock_inuse_init_net(struct net *net)
3555 {
3556 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3557 	if (net->core.prot_inuse == NULL)
3558 		return -ENOMEM;
3559 
3560 	net->core.sock_inuse = alloc_percpu(int);
3561 	if (net->core.sock_inuse == NULL)
3562 		goto out;
3563 
3564 	return 0;
3565 
3566 out:
3567 	free_percpu(net->core.prot_inuse);
3568 	return -ENOMEM;
3569 }
3570 
3571 static void __net_exit sock_inuse_exit_net(struct net *net)
3572 {
3573 	free_percpu(net->core.prot_inuse);
3574 	free_percpu(net->core.sock_inuse);
3575 }
3576 
3577 static struct pernet_operations net_inuse_ops = {
3578 	.init = sock_inuse_init_net,
3579 	.exit = sock_inuse_exit_net,
3580 };
3581 
3582 static __init int net_inuse_init(void)
3583 {
3584 	if (register_pernet_subsys(&net_inuse_ops))
3585 		panic("Cannot initialize net inuse counters");
3586 
3587 	return 0;
3588 }
3589 
3590 core_initcall(net_inuse_init);
3591 
3592 static int assign_proto_idx(struct proto *prot)
3593 {
3594 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3595 
3596 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3597 		pr_err("PROTO_INUSE_NR exhausted\n");
3598 		return -ENOSPC;
3599 	}
3600 
3601 	set_bit(prot->inuse_idx, proto_inuse_idx);
3602 	return 0;
3603 }
3604 
3605 static void release_proto_idx(struct proto *prot)
3606 {
3607 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3608 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3609 }
3610 #else
3611 static inline int assign_proto_idx(struct proto *prot)
3612 {
3613 	return 0;
3614 }
3615 
3616 static inline void release_proto_idx(struct proto *prot)
3617 {
3618 }
3619 
3620 static void sock_inuse_add(struct net *net, int val)
3621 {
3622 }
3623 #endif
3624 
3625 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3626 {
3627 	if (!twsk_prot)
3628 		return;
3629 	kfree(twsk_prot->twsk_slab_name);
3630 	twsk_prot->twsk_slab_name = NULL;
3631 	kmem_cache_destroy(twsk_prot->twsk_slab);
3632 	twsk_prot->twsk_slab = NULL;
3633 }
3634 
3635 static int tw_prot_init(const struct proto *prot)
3636 {
3637 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3638 
3639 	if (!twsk_prot)
3640 		return 0;
3641 
3642 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3643 					      prot->name);
3644 	if (!twsk_prot->twsk_slab_name)
3645 		return -ENOMEM;
3646 
3647 	twsk_prot->twsk_slab =
3648 		kmem_cache_create(twsk_prot->twsk_slab_name,
3649 				  twsk_prot->twsk_obj_size, 0,
3650 				  SLAB_ACCOUNT | prot->slab_flags,
3651 				  NULL);
3652 	if (!twsk_prot->twsk_slab) {
3653 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3654 			prot->name);
3655 		return -ENOMEM;
3656 	}
3657 
3658 	return 0;
3659 }
3660 
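/* req_prot_cleanup()/req_prot_init() mirror the timewait helpers above
 * for the per-protocol "request_sock_<name>" slab cache.
 */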
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

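/* Register a protocol with the socket core: optionally create its sock,
 * request_sock and timewait slab caches, reserve an in-use counter slot
 * and add it to proto_list (which backs /proc/net/protocols).
 */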
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);

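/* Undo proto_register(): drop the protocol from proto_list, release its
 * in-use counter slot and destroy the slab caches created for it.
 */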
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);

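/* Ask for the sock_diag handler of @family (and optionally @protocol)
 * to be loaded via its NETLINK_SOCK_DIAG module alias.  Returns -ENOENT
 * early when the family/protocol is clearly not available.
 */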
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
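/* seq_file iterator over proto_list for /proc/net/protocols; the list
 * is held stable by proto_list_mutex between ->start and ->stop.
 */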
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

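/* Emit one /proc/net/protocols row: name, object size, sockets in use,
 * memory accounting state, and a 'y'/'n' flag per optional method.
 */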
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

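/* Create and remove the per-netns /proc/net/protocols entry. */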
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}


static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
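/* Busy-poll termination check: stop looping once the socket's receive
 * queue is non-empty or the busy-poll timeout has expired.
 */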
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
	       sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

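/* Bind an additional address to @sk through the protocol's optional
 * bind_add() hook; returns -EOPNOTSUPP when the protocol does not
 * provide one.
 */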
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);