1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 #include <trace/hooks/sched.h>
139 #include <trace/hooks/net.h>
140 
141 #include <net/tcp.h>
142 #include <net/busy_poll.h>
143 
144 #include <linux/ethtool.h>
145 
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148 
149 static void sock_inuse_add(struct net *net, int val);
150 
151 /**
152  * sk_ns_capable - General socket capability test
153  * @sk: Socket to use a capability on or through
154  * @user_ns: The user namespace of the capability to use
155  * @cap: The capability to use
156  *
157  * Test to see if the opener of the socket had the capability @cap when
158  * the socket was created and if the current process has @cap in the user
159  * namespace @user_ns.
160  */
161 bool sk_ns_capable(const struct sock *sk,
162 		   struct user_namespace *user_ns, int cap)
163 {
164 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 		ns_capable(user_ns, cap);
166 }
167 EXPORT_SYMBOL(sk_ns_capable);
168 
169 /**
170  * sk_capable - Socket global capability test
171  * @sk: Socket to use a capability on or through
172  * @cap: The global capability to use
173  *
174  * Test to see if the opener of the socket had the capability @cap when
175  * the socket was created and if the current process has @cap in all user
176  * namespaces.
177  */
178 bool sk_capable(const struct sock *sk, int cap)
179 {
180 	return sk_ns_capable(sk, &init_user_ns, cap);
181 }
182 EXPORT_SYMBOL(sk_capable);
183 
184 /**
185  * sk_net_capable - Network namespace socket capability test
186  * @sk: Socket to use a capability on or through
187  * @cap: The capability to use
188  *
189  * Test to see if the opener of the socket had the capability @cap when the
190  * socket was created and if the current process has @cap over the network
191  * namespace the socket is a member of.
192  */
193 bool sk_net_capable(const struct sock *sk, int cap)
194 {
195 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 }
197 EXPORT_SYMBOL(sk_net_capable);
198 
199 /*
200  * Each address family might have different locking rules, so we have
201  * one slock key per address family and separate keys for internal and
202  * userspace sockets.
203  */
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
208 
209 /*
210  * Make lock validator output more readable. (we pre-construct these
211  * strings build-time, so that runtime initialization of socket
212  * locks is fast):
213  */
214 
215 #define _sock_locks(x)						  \
216   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
217   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
218   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
219   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
220   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
221   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
222   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
223   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
224   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
225   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
226   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
227   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
228   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
229   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
230   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
231   x "AF_MCTP"  , \
232   x "AF_MAX"
233 
234 static const char *const af_family_key_strings[AF_MAX+1] = {
235 	_sock_locks("sk_lock-")
236 };
237 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
238 	_sock_locks("slock-")
239 };
240 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
241 	_sock_locks("clock-")
242 };
243 
244 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
245 	_sock_locks("k-sk_lock-")
246 };
247 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
248 	_sock_locks("k-slock-")
249 };
250 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
251 	_sock_locks("k-clock-")
252 };
253 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
254 	_sock_locks("rlock-")
255 };
256 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
257 	_sock_locks("wlock-")
258 };
259 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
260 	_sock_locks("elock-")
261 };
262 
263 /*
264  * sk_callback_lock and sk queues locking rules are per-address-family,
265  * so split the lock classes by using a per-AF key:
266  */
267 static struct lock_class_key af_callback_keys[AF_MAX];
268 static struct lock_class_key af_rlock_keys[AF_MAX];
269 static struct lock_class_key af_wlock_keys[AF_MAX];
270 static struct lock_class_key af_elock_keys[AF_MAX];
271 static struct lock_class_key af_kern_callback_keys[AF_MAX];
272 
273 /* Run time adjustable parameters. */
274 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
275 EXPORT_SYMBOL(sysctl_wmem_max);
276 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
277 EXPORT_SYMBOL(sysctl_rmem_max);
278 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
279 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
280 
281 /* Maximal space eaten by iovec or ancillary data plus some space */
282 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
283 EXPORT_SYMBOL(sysctl_optmem_max);
284 
285 int sysctl_tstamp_allow_data __read_mostly = 1;
286 
287 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
288 EXPORT_SYMBOL_GPL(memalloc_socks_key);
289 
290 /**
291  * sk_set_memalloc - sets %SOCK_MEMALLOC
292  * @sk: socket to set it on
293  *
294  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
295  * It's the responsibility of the admin to adjust min_free_kbytes
296  * to meet the requirements
297  */
298 void sk_set_memalloc(struct sock *sk)
299 {
300 	sock_set_flag(sk, SOCK_MEMALLOC);
301 	sk->sk_allocation |= __GFP_MEMALLOC;
302 	static_branch_inc(&memalloc_socks_key);
303 }
304 EXPORT_SYMBOL_GPL(sk_set_memalloc);
305 
306 void sk_clear_memalloc(struct sock *sk)
307 {
308 	sock_reset_flag(sk, SOCK_MEMALLOC);
309 	sk->sk_allocation &= ~__GFP_MEMALLOC;
310 	static_branch_dec(&memalloc_socks_key);
311 
312 	/*
313 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
314 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
315 	 * it has rmem allocations due to the last swapfile being deactivated
316 	 * but there is a risk that the socket is unusable due to exceeding
317 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
318 	 */
319 	sk_mem_reclaim(sk);
320 }
321 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
322 
323 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
324 {
325 	int ret;
326 	unsigned int noreclaim_flag;
327 
328 	/* these should have been dropped before queueing */
329 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
330 
331 	noreclaim_flag = memalloc_noreclaim_save();
332 	ret = sk->sk_backlog_rcv(sk, skb);
333 	memalloc_noreclaim_restore(noreclaim_flag);
334 
335 	return ret;
336 }
337 EXPORT_SYMBOL(__sk_backlog_rcv);
338 
339 void sk_error_report(struct sock *sk)
340 {
341 	sk->sk_error_report(sk);
342 
343 	switch (sk->sk_family) {
344 	case AF_INET:
345 		fallthrough;
346 	case AF_INET6:
347 		trace_inet_sk_error_report(sk);
348 		break;
349 	default:
350 		break;
351 	}
352 }
353 EXPORT_SYMBOL(sk_error_report);
354 
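/* Convert a timeout held in jiffies into the timeval layout expected by
 * getsockopt(): old_timeval32 for 32-bit compat callers, __kernel_old_timeval
 * for the *_OLD option names, or __kernel_sock_timeval otherwise.
 * MAX_SCHEDULE_TIMEOUT is reported as zero. Returns the number of bytes
 * written to optval.
 */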
355 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
356 {
357 	struct __kernel_sock_timeval tv;
358 
359 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
360 		tv.tv_sec = 0;
361 		tv.tv_usec = 0;
362 	} else {
363 		tv.tv_sec = timeo / HZ;
364 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
365 	}
366 
367 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
368 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
369 		*(struct old_timeval32 *)optval = tv32;
370 		return sizeof(tv32);
371 	}
372 
373 	if (old_timeval) {
374 		struct __kernel_old_timeval old_tv;
375 		old_tv.tv_sec = tv.tv_sec;
376 		old_tv.tv_usec = tv.tv_usec;
377 		*(struct __kernel_old_timeval *)optval = old_tv;
378 		return sizeof(old_tv);
379 	}
380 
381 	*(struct __kernel_sock_timeval *)optval = tv;
382 	return sizeof(tv);
383 }
384 
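/* Parse a user-supplied SO_RCVTIMEO/SO_SNDTIMEO value (in any of the three
 * timeval layouts handled by sock_get_timeout() above) and store it in
 * *timeo_p as jiffies. A zero timeout maps to MAX_SCHEDULE_TIMEOUT, i.e.
 * wait forever, and a negative tv_sec is clamped to zero with a
 * rate-limited warning.
 */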
385 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
386 			    bool old_timeval)
387 {
388 	struct __kernel_sock_timeval tv;
389 
390 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
391 		struct old_timeval32 tv32;
392 
393 		if (optlen < sizeof(tv32))
394 			return -EINVAL;
395 
396 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
397 			return -EFAULT;
398 		tv.tv_sec = tv32.tv_sec;
399 		tv.tv_usec = tv32.tv_usec;
400 	} else if (old_timeval) {
401 		struct __kernel_old_timeval old_tv;
402 
403 		if (optlen < sizeof(old_tv))
404 			return -EINVAL;
405 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
406 			return -EFAULT;
407 		tv.tv_sec = old_tv.tv_sec;
408 		tv.tv_usec = old_tv.tv_usec;
409 	} else {
410 		if (optlen < sizeof(tv))
411 			return -EINVAL;
412 		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
413 			return -EFAULT;
414 	}
415 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
416 		return -EDOM;
417 
418 	if (tv.tv_sec < 0) {
419 		static int warned __read_mostly;
420 
421 		*timeo_p = 0;
422 		if (warned < 10 && net_ratelimit()) {
423 			warned++;
424 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
425 				__func__, current->comm, task_pid_nr(current));
426 		}
427 		return 0;
428 	}
429 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
430 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
431 		return 0;
432 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
433 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
434 	return 0;
435 }
436 
437 static bool sock_needs_netstamp(const struct sock *sk)
438 {
439 	switch (sk->sk_family) {
440 	case AF_UNSPEC:
441 	case AF_UNIX:
442 		return false;
443 	default:
444 		return true;
445 	}
446 }
447 
448 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
449 {
450 	if (sk->sk_flags & flags) {
451 		sk->sk_flags &= ~flags;
452 		if (sock_needs_netstamp(sk) &&
453 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
454 			net_disable_timestamp();
455 	}
456 }
457 
458 
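/* Charge @skb to @sk and append it to the receive queue, dropping the packet
 * (and bumping sk_drops) if the receive buffer or the protocol's memory
 * accounting limits would be exceeded. Wakes the socket via sk_data_ready()
 * unless it is already dead.
 */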
459 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
460 {
461 	unsigned long flags;
462 	struct sk_buff_head *list = &sk->sk_receive_queue;
463 
464 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
465 		atomic_inc(&sk->sk_drops);
466 		trace_sock_rcvqueue_full(sk, skb);
467 		return -ENOMEM;
468 	}
469 
470 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
471 		atomic_inc(&sk->sk_drops);
472 		return -ENOBUFS;
473 	}
474 
475 	skb->dev = NULL;
476 	skb_set_owner_r(skb, sk);
477 
478 	/* We escape from the RCU-protected region, so make sure we don't
479 	 * leak a non-refcounted dst.
480 	 */
481 	skb_dst_force(skb);
482 
483 	spin_lock_irqsave(&list->lock, flags);
484 	sock_skb_set_dropcount(sk, skb);
485 	__skb_queue_tail(list, skb);
486 	spin_unlock_irqrestore(&list->lock, flags);
487 
488 	if (!sock_flag(sk, SOCK_DEAD))
489 		sk->sk_data_ready(sk);
490 	return 0;
491 }
492 EXPORT_SYMBOL(__sock_queue_rcv_skb);
493 
494 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
495 {
496 	int err;
497 
498 	err = sk_filter(sk, skb);
499 	if (err)
500 		return err;
501 
502 	return __sock_queue_rcv_skb(sk, skb);
503 }
504 EXPORT_SYMBOL(sock_queue_rcv_skb);
505 
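/* Deliver @skb to @sk from softirq context: run the socket filter (trimming
 * to @trim_cap), then either process the packet directly under the
 * bottom-half lock when the socket is not owned by a user context, or park
 * it on the backlog to be run at release_sock() time. Drops and counts the
 * packet if the backlog or receive queues are full; releases a socket
 * reference before returning when @refcounted is true.
 */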
506 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
507 		     const int nested, unsigned int trim_cap, bool refcounted)
508 {
509 	int rc = NET_RX_SUCCESS;
510 
511 	if (sk_filter_trim_cap(sk, skb, trim_cap))
512 		goto discard_and_relse;
513 
514 	skb->dev = NULL;
515 
516 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
517 		atomic_inc(&sk->sk_drops);
518 		goto discard_and_relse;
519 	}
520 	if (nested)
521 		bh_lock_sock_nested(sk);
522 	else
523 		bh_lock_sock(sk);
524 	if (!sock_owned_by_user(sk)) {
525 		/*
526 		 * trylock + unlock semantics:
527 		 */
528 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
529 
530 		rc = sk_backlog_rcv(sk, skb);
531 
532 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
533 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
534 		bh_unlock_sock(sk);
535 		atomic_inc(&sk->sk_drops);
536 		goto discard_and_relse;
537 	}
538 
539 	bh_unlock_sock(sk);
540 out:
541 	if (refcounted)
542 		sock_put(sk);
543 	return rc;
544 discard_and_relse:
545 	kfree_skb(skb);
546 	goto out;
547 }
548 EXPORT_SYMBOL(__sk_receive_skb);
549 
550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
551 							  u32));
552 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
553 							   u32));
554 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
555 {
556 	struct dst_entry *dst = __sk_dst_get(sk);
557 
558 	if (dst && dst->obsolete &&
559 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
560 			       dst, cookie) == NULL) {
561 		sk_tx_queue_clear(sk);
562 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
563 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
564 		dst_release(dst);
565 		return NULL;
566 	}
567 
568 	return dst;
569 }
570 EXPORT_SYMBOL(__sk_dst_check);
571 
572 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
573 {
574 	struct dst_entry *dst = sk_dst_get(sk);
575 
576 	if (dst && dst->obsolete &&
577 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
578 			       dst, cookie) == NULL) {
579 		sk_dst_reset(sk);
580 		dst_release(dst);
581 		return NULL;
582 	}
583 
584 	return dst;
585 }
586 EXPORT_SYMBOL(sk_dst_check);
587 
588 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
589 {
590 	int ret = -ENOPROTOOPT;
591 #ifdef CONFIG_NETDEVICES
592 	struct net *net = sock_net(sk);
593 
594 	/* Sorry... */
595 	ret = -EPERM;
596 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
597 		goto out;
598 
599 	ret = -EINVAL;
600 	if (ifindex < 0)
601 		goto out;
602 
603 	sk->sk_bound_dev_if = ifindex;
604 	if (sk->sk_prot->rehash)
605 		sk->sk_prot->rehash(sk);
606 	sk_dst_reset(sk);
607 
608 	ret = 0;
609 
610 out:
611 #endif
612 
613 	return ret;
614 }
615 
616 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
617 {
618 	int ret;
619 
620 	if (lock_sk)
621 		lock_sock(sk);
622 	ret = sock_bindtoindex_locked(sk, ifindex);
623 	if (lock_sk)
624 		release_sock(sk);
625 
626 	return ret;
627 }
628 EXPORT_SYMBOL(sock_bindtoindex);
629 
630 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
631 {
632 	int ret = -ENOPROTOOPT;
633 #ifdef CONFIG_NETDEVICES
634 	struct net *net = sock_net(sk);
635 	char devname[IFNAMSIZ];
636 	int index;
637 
638 	ret = -EINVAL;
639 	if (optlen < 0)
640 		goto out;
641 
642 	/* Bind this socket to a particular device like "eth0",
643 	 * as specified in the passed interface name. If the
644 	 * name is "" or the option length is zero the socket
645 	 * is not bound.
646 	 */
647 	if (optlen > IFNAMSIZ - 1)
648 		optlen = IFNAMSIZ - 1;
649 	memset(devname, 0, sizeof(devname));
650 
651 	ret = -EFAULT;
652 	if (copy_from_sockptr(devname, optval, optlen))
653 		goto out;
654 
655 	index = 0;
656 	if (devname[0] != '\0') {
657 		struct net_device *dev;
658 
659 		rcu_read_lock();
660 		dev = dev_get_by_name_rcu(net, devname);
661 		if (dev)
662 			index = dev->ifindex;
663 		rcu_read_unlock();
664 		ret = -ENODEV;
665 		if (!dev)
666 			goto out;
667 	}
668 
669 	return sock_bindtoindex(sk, index, true);
670 out:
671 #endif
672 
673 	return ret;
674 }
675 
676 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
677 				int __user *optlen, int len)
678 {
679 	int ret = -ENOPROTOOPT;
680 #ifdef CONFIG_NETDEVICES
681 	struct net *net = sock_net(sk);
682 	char devname[IFNAMSIZ];
683 
684 	if (sk->sk_bound_dev_if == 0) {
685 		len = 0;
686 		goto zero;
687 	}
688 
689 	ret = -EINVAL;
690 	if (len < IFNAMSIZ)
691 		goto out;
692 
693 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
694 	if (ret)
695 		goto out;
696 
697 	len = strlen(devname) + 1;
698 
699 	ret = -EFAULT;
700 	if (copy_to_user(optval, devname, len))
701 		goto out;
702 
703 zero:
704 	ret = -EFAULT;
705 	if (put_user(len, optlen))
706 		goto out;
707 
708 	ret = 0;
709 
710 out:
711 #endif
712 
713 	return ret;
714 }
715 
716 bool sk_mc_loop(struct sock *sk)
717 {
718 	if (dev_recursion_level())
719 		return false;
720 	if (!sk)
721 		return true;
722 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
723 	switch (READ_ONCE(sk->sk_family)) {
724 	case AF_INET:
725 		return inet_sk(sk)->mc_loop;
726 #if IS_ENABLED(CONFIG_IPV6)
727 	case AF_INET6:
728 		return inet6_sk(sk)->mc_loop;
729 #endif
730 	}
731 	WARN_ON_ONCE(1);
732 	return true;
733 }
734 EXPORT_SYMBOL(sk_mc_loop);
735 
736 void sock_set_reuseaddr(struct sock *sk)
737 {
738 	lock_sock(sk);
739 	sk->sk_reuse = SK_CAN_REUSE;
740 	release_sock(sk);
741 }
742 EXPORT_SYMBOL(sock_set_reuseaddr);
743 
744 void sock_set_reuseport(struct sock *sk)
745 {
746 	lock_sock(sk);
747 	sk->sk_reuseport = true;
748 	release_sock(sk);
749 }
750 EXPORT_SYMBOL(sock_set_reuseport);
751 
752 void sock_no_linger(struct sock *sk)
753 {
754 	lock_sock(sk);
755 	sk->sk_lingertime = 0;
756 	sock_set_flag(sk, SOCK_LINGER);
757 	release_sock(sk);
758 }
759 EXPORT_SYMBOL(sock_no_linger);
760 
761 void sock_set_priority(struct sock *sk, u32 priority)
762 {
763 	lock_sock(sk);
764 	sk->sk_priority = priority;
765 	release_sock(sk);
766 }
767 EXPORT_SYMBOL(sock_set_priority);
768 
769 void sock_set_sndtimeo(struct sock *sk, s64 secs)
770 {
771 	lock_sock(sk);
772 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
773 		sk->sk_sndtimeo = secs * HZ;
774 	else
775 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
776 	release_sock(sk);
777 }
778 EXPORT_SYMBOL(sock_set_sndtimeo);
779 
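/* Common helper for the SO_TIMESTAMP{,NS}_{OLD,NEW} options: toggles the
 * SOCK_RCVTSTAMP/SOCK_RCVTSTAMPNS (and SOCK_TSTAMP_NEW) flags and, when
 * enabling, makes sure timestamp generation is switched on for the socket.
 */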
780 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
781 {
782 	if (val)  {
783 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
784 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
785 		sock_set_flag(sk, SOCK_RCVTSTAMP);
786 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
787 	} else {
788 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
789 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
790 	}
791 }
792 
793 void sock_enable_timestamps(struct sock *sk)
794 {
795 	lock_sock(sk);
796 	__sock_set_timestamps(sk, true, false, true);
797 	release_sock(sk);
798 }
799 EXPORT_SYMBOL(sock_enable_timestamps);
800 
801 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
802 {
803 	switch (optname) {
804 	case SO_TIMESTAMP_OLD:
805 		__sock_set_timestamps(sk, valbool, false, false);
806 		break;
807 	case SO_TIMESTAMP_NEW:
808 		__sock_set_timestamps(sk, valbool, true, false);
809 		break;
810 	case SO_TIMESTAMPNS_OLD:
811 		__sock_set_timestamps(sk, valbool, false, true);
812 		break;
813 	case SO_TIMESTAMPNS_NEW:
814 		__sock_set_timestamps(sk, valbool, true, true);
815 		break;
816 	}
817 }
818 
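/* Validate a SOF_TIMESTAMPING_BIND_PHC request: the socket must be bound to
 * a device, and @phc_index must be one of that device's PHC virtual clocks
 * as reported by ethtool_get_phc_vclocks(). On success the index is recorded
 * in sk->sk_bind_phc.
 */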
819 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
820 {
821 	struct net *net = sock_net(sk);
822 	struct net_device *dev = NULL;
823 	bool match = false;
824 	int *vclock_index;
825 	int i, num;
826 
827 	if (sk->sk_bound_dev_if)
828 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
829 
830 	if (!dev) {
831 		pr_err("%s: socket not bound to a device\n", __func__);
832 		return -EOPNOTSUPP;
833 	}
834 
835 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
836 	dev_put(dev);
837 
838 	for (i = 0; i < num; i++) {
839 		if (*(vclock_index + i) == phc_index) {
840 			match = true;
841 			break;
842 		}
843 	}
844 
845 	if (num > 0)
846 		kfree(vclock_index);
847 
848 	if (!match)
849 		return -EINVAL;
850 
851 	sk->sk_bind_phc = phc_index;
852 
853 	return 0;
854 }
855 
856 int sock_set_timestamping(struct sock *sk, int optname,
857 			  struct so_timestamping timestamping)
858 {
859 	int val = timestamping.flags;
860 	int ret;
861 
862 	if (val & ~SOF_TIMESTAMPING_MASK)
863 		return -EINVAL;
864 
865 	if (val & SOF_TIMESTAMPING_OPT_ID &&
866 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
867 		if (sk->sk_protocol == IPPROTO_TCP &&
868 		    sk->sk_type == SOCK_STREAM) {
869 			if ((1 << sk->sk_state) &
870 			    (TCPF_CLOSE | TCPF_LISTEN))
871 				return -EINVAL;
872 			atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
873 		} else {
874 			atomic_set(&sk->sk_tskey, 0);
875 		}
876 	}
877 
878 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
879 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
880 		return -EINVAL;
881 
882 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
883 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
884 		if (ret)
885 			return ret;
886 	}
887 
888 	sk->sk_tsflags = val;
889 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
890 
891 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
892 		sock_enable_timestamp(sk,
893 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
894 	else
895 		sock_disable_timestamp(sk,
896 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
897 	return 0;
898 }
899 
900 void sock_set_keepalive(struct sock *sk)
901 {
902 	lock_sock(sk);
903 	if (sk->sk_prot->keepalive)
904 		sk->sk_prot->keepalive(sk, true);
905 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
906 	release_sock(sk);
907 }
908 EXPORT_SYMBOL(sock_set_keepalive);
909 
910 static void __sock_set_rcvbuf(struct sock *sk, int val)
911 {
912 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
913 	 * as a negative value.
914 	 */
915 	val = min_t(int, val, INT_MAX / 2);
916 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
917 
918 	/* We double it on the way in to account for "struct sk_buff" etc.
919 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
920 	 * will allow that much actual data to be received on that socket.
921 	 *
922 	 * Applications are unaware that "struct sk_buff" and other overheads
923 	 * allocate from the receive buffer during socket buffer allocation.
924 	 *
925 	 * And after considering the possible alternatives, returning the value
926 	 * we actually used in getsockopt is the most desirable behavior.
927 	 */
928 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
929 }
930 
931 void sock_set_rcvbuf(struct sock *sk, int val)
932 {
933 	lock_sock(sk);
934 	__sock_set_rcvbuf(sk, val);
935 	release_sock(sk);
936 }
937 EXPORT_SYMBOL(sock_set_rcvbuf);
938 
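/* Update sk->sk_mark and invalidate the cached route, since the mark can
 * influence routing decisions.
 */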
939 static void __sock_set_mark(struct sock *sk, u32 val)
940 {
941 	if (val != sk->sk_mark) {
942 		sk->sk_mark = val;
943 		sk_dst_reset(sk);
944 	}
945 }
946 
947 void sock_set_mark(struct sock *sk, u32 val)
948 {
949 	lock_sock(sk);
950 	__sock_set_mark(sk, val);
951 	release_sock(sk);
952 }
953 EXPORT_SYMBOL(sock_set_mark);
954 
955 /*
956  *	This is meant for all protocols to use and covers goings on
957  *	at the socket level. Everything here is generic.
958  */
959 
960 int sock_setsockopt(struct socket *sock, int level, int optname,
961 		    sockptr_t optval, unsigned int optlen)
962 {
963 	struct so_timestamping timestamping;
964 	struct sock_txtime sk_txtime;
965 	struct sock *sk = sock->sk;
966 	int val;
967 	int valbool;
968 	struct linger ling;
969 	int ret = 0;
970 
971 	/*
972 	 *	Options without arguments
973 	 */
974 
975 	if (optname == SO_BINDTODEVICE)
976 		return sock_setbindtodevice(sk, optval, optlen);
977 
978 	if (optlen < sizeof(int))
979 		return -EINVAL;
980 
981 	if (copy_from_sockptr(&val, optval, sizeof(val)))
982 		return -EFAULT;
983 
984 	valbool = val ? 1 : 0;
985 
986 	lock_sock(sk);
987 
988 	switch (optname) {
989 	case SO_DEBUG:
990 		if (val && !capable(CAP_NET_ADMIN))
991 			ret = -EACCES;
992 		else
993 			sock_valbool_flag(sk, SOCK_DBG, valbool);
994 		break;
995 	case SO_REUSEADDR:
996 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
997 		break;
998 	case SO_REUSEPORT:
999 		sk->sk_reuseport = valbool;
1000 		break;
1001 	case SO_TYPE:
1002 	case SO_PROTOCOL:
1003 	case SO_DOMAIN:
1004 	case SO_ERROR:
1005 		ret = -ENOPROTOOPT;
1006 		break;
1007 	case SO_DONTROUTE:
1008 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1009 		sk_dst_reset(sk);
1010 		break;
1011 	case SO_BROADCAST:
1012 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1013 		break;
1014 	case SO_SNDBUF:
1015 		/* Don't return an error on this; BSD doesn't, and if you
1016 		 * think about it this is right. Otherwise apps have to
1017 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1018 		 * are treated in BSD as hints.
1019 		 */
1020 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1021 set_sndbuf:
1022 		/* Ensure val * 2 fits into an int, to prevent max_t()
1023 		 * from treating it as a negative value.
1024 		 */
1025 		val = min_t(int, val, INT_MAX / 2);
1026 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1027 		WRITE_ONCE(sk->sk_sndbuf,
1028 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1029 		/* Wake up sending tasks if we upped the value. */
1030 		sk->sk_write_space(sk);
1031 		break;
1032 
1033 	case SO_SNDBUFFORCE:
1034 		if (!capable(CAP_NET_ADMIN)) {
1035 			ret = -EPERM;
1036 			break;
1037 		}
1038 
1039 		/* No negative values (to prevent underflow, as val will be
1040 		 * multiplied by 2).
1041 		 */
1042 		if (val < 0)
1043 			val = 0;
1044 		goto set_sndbuf;
1045 
1046 	case SO_RCVBUF:
1047 		/* Don't return an error on this; BSD doesn't, and if you
1048 		 * think about it this is right. Otherwise apps have to
1049 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1050 		 * are treated in BSD as hints.
1051 		 */
1052 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1053 		break;
1054 
1055 	case SO_RCVBUFFORCE:
1056 		if (!capable(CAP_NET_ADMIN)) {
1057 			ret = -EPERM;
1058 			break;
1059 		}
1060 
1061 		/* No negative values (to prevent underflow, as val will be
1062 		 * multiplied by 2).
1063 		 */
1064 		__sock_set_rcvbuf(sk, max(val, 0));
1065 		break;
1066 
1067 	case SO_KEEPALIVE:
1068 		if (sk->sk_prot->keepalive)
1069 			sk->sk_prot->keepalive(sk, valbool);
1070 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1071 		break;
1072 
1073 	case SO_OOBINLINE:
1074 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1075 		break;
1076 
1077 	case SO_NO_CHECK:
1078 		sk->sk_no_check_tx = valbool;
1079 		break;
1080 
1081 	case SO_PRIORITY:
1082 		if ((val >= 0 && val <= 6) ||
1083 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1084 			sk->sk_priority = val;
1085 		else
1086 			ret = -EPERM;
1087 		break;
1088 
1089 	case SO_LINGER:
1090 		if (optlen < sizeof(ling)) {
1091 			ret = -EINVAL;	/* 1003.1g */
1092 			break;
1093 		}
1094 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1095 			ret = -EFAULT;
1096 			break;
1097 		}
1098 		if (!ling.l_onoff)
1099 			sock_reset_flag(sk, SOCK_LINGER);
1100 		else {
1101 #if (BITS_PER_LONG == 32)
1102 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1103 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1104 			else
1105 #endif
1106 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1107 			sock_set_flag(sk, SOCK_LINGER);
1108 		}
1109 		break;
1110 
1111 	case SO_BSDCOMPAT:
1112 		break;
1113 
1114 	case SO_PASSCRED:
1115 		if (valbool)
1116 			set_bit(SOCK_PASSCRED, &sock->flags);
1117 		else
1118 			clear_bit(SOCK_PASSCRED, &sock->flags);
1119 		break;
1120 
1121 	case SO_TIMESTAMP_OLD:
1122 	case SO_TIMESTAMP_NEW:
1123 	case SO_TIMESTAMPNS_OLD:
1124 	case SO_TIMESTAMPNS_NEW:
1125 		sock_set_timestamp(sk, optname, valbool);
1126 		break;
1127 
1128 	case SO_TIMESTAMPING_NEW:
1129 	case SO_TIMESTAMPING_OLD:
1130 		if (optlen == sizeof(timestamping)) {
1131 			if (copy_from_sockptr(&timestamping, optval,
1132 					      sizeof(timestamping))) {
1133 				ret = -EFAULT;
1134 				break;
1135 			}
1136 		} else {
1137 			memset(&timestamping, 0, sizeof(timestamping));
1138 			timestamping.flags = val;
1139 		}
1140 		ret = sock_set_timestamping(sk, optname, timestamping);
1141 		break;
1142 
1143 	case SO_RCVLOWAT:
1144 		if (val < 0)
1145 			val = INT_MAX;
1146 		if (sock->ops->set_rcvlowat)
1147 			ret = sock->ops->set_rcvlowat(sk, val);
1148 		else
1149 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1150 		break;
1151 
1152 	case SO_RCVTIMEO_OLD:
1153 	case SO_RCVTIMEO_NEW:
1154 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1155 				       optlen, optname == SO_RCVTIMEO_OLD);
1156 		break;
1157 
1158 	case SO_SNDTIMEO_OLD:
1159 	case SO_SNDTIMEO_NEW:
1160 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1161 				       optlen, optname == SO_SNDTIMEO_OLD);
1162 		break;
1163 
1164 	case SO_ATTACH_FILTER: {
1165 		struct sock_fprog fprog;
1166 
1167 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1168 		if (!ret)
1169 			ret = sk_attach_filter(&fprog, sk);
1170 		break;
1171 	}
1172 	case SO_ATTACH_BPF:
1173 		ret = -EINVAL;
1174 		if (optlen == sizeof(u32)) {
1175 			u32 ufd;
1176 
1177 			ret = -EFAULT;
1178 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1179 				break;
1180 
1181 			ret = sk_attach_bpf(ufd, sk);
1182 		}
1183 		break;
1184 
1185 	case SO_ATTACH_REUSEPORT_CBPF: {
1186 		struct sock_fprog fprog;
1187 
1188 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1189 		if (!ret)
1190 			ret = sk_reuseport_attach_filter(&fprog, sk);
1191 		break;
1192 	}
1193 	case SO_ATTACH_REUSEPORT_EBPF:
1194 		ret = -EINVAL;
1195 		if (optlen == sizeof(u32)) {
1196 			u32 ufd;
1197 
1198 			ret = -EFAULT;
1199 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1200 				break;
1201 
1202 			ret = sk_reuseport_attach_bpf(ufd, sk);
1203 		}
1204 		break;
1205 
1206 	case SO_DETACH_REUSEPORT_BPF:
1207 		ret = reuseport_detach_prog(sk);
1208 		break;
1209 
1210 	case SO_DETACH_FILTER:
1211 		ret = sk_detach_filter(sk);
1212 		break;
1213 
1214 	case SO_LOCK_FILTER:
1215 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1216 			ret = -EPERM;
1217 		else
1218 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1219 		break;
1220 
1221 	case SO_PASSSEC:
1222 		if (valbool)
1223 			set_bit(SOCK_PASSSEC, &sock->flags);
1224 		else
1225 			clear_bit(SOCK_PASSSEC, &sock->flags);
1226 		break;
1227 	case SO_MARK:
1228 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1229 			ret = -EPERM;
1230 			break;
1231 		}
1232 
1233 		__sock_set_mark(sk, val);
1234 		break;
1235 
1236 	case SO_RXQ_OVFL:
1237 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1238 		break;
1239 
1240 	case SO_WIFI_STATUS:
1241 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1242 		break;
1243 
1244 	case SO_PEEK_OFF:
1245 		if (sock->ops->set_peek_off)
1246 			ret = sock->ops->set_peek_off(sk, val);
1247 		else
1248 			ret = -EOPNOTSUPP;
1249 		break;
1250 
1251 	case SO_NOFCS:
1252 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1253 		break;
1254 
1255 	case SO_SELECT_ERR_QUEUE:
1256 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1257 		break;
1258 
1259 #ifdef CONFIG_NET_RX_BUSY_POLL
1260 	case SO_BUSY_POLL:
1261 		/* allow unprivileged users to decrease the value */
1262 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1263 			ret = -EPERM;
1264 		else {
1265 			if (val < 0)
1266 				ret = -EINVAL;
1267 			else
1268 				WRITE_ONCE(sk->sk_ll_usec, val);
1269 		}
1270 		break;
1271 	case SO_PREFER_BUSY_POLL:
1272 		if (valbool && !capable(CAP_NET_ADMIN))
1273 			ret = -EPERM;
1274 		else
1275 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1276 		break;
1277 	case SO_BUSY_POLL_BUDGET:
1278 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1279 			ret = -EPERM;
1280 		} else {
1281 			if (val < 0 || val > U16_MAX)
1282 				ret = -EINVAL;
1283 			else
1284 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1285 		}
1286 		break;
1287 #endif
1288 
1289 	case SO_MAX_PACING_RATE:
1290 		{
1291 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1292 
1293 		if (sizeof(ulval) != sizeof(val) &&
1294 		    optlen >= sizeof(ulval) &&
1295 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1296 			ret = -EFAULT;
1297 			break;
1298 		}
1299 		if (ulval != ~0UL)
1300 			cmpxchg(&sk->sk_pacing_status,
1301 				SK_PACING_NONE,
1302 				SK_PACING_NEEDED);
1303 		/* Pairs with READ_ONCE() from sock_getsockopt() */
1304 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1305 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1306 		break;
1307 		}
1308 	case SO_INCOMING_CPU:
1309 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1310 		break;
1311 
1312 	case SO_CNX_ADVICE:
1313 		if (val == 1)
1314 			dst_negative_advice(sk);
1315 		break;
1316 
1317 	case SO_ZEROCOPY:
1318 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1319 			if (!((sk->sk_type == SOCK_STREAM &&
1320 			       sk->sk_protocol == IPPROTO_TCP) ||
1321 			      (sk->sk_type == SOCK_DGRAM &&
1322 			       sk->sk_protocol == IPPROTO_UDP)))
1323 				ret = -ENOTSUPP;
1324 		} else if (sk->sk_family != PF_RDS) {
1325 			ret = -ENOTSUPP;
1326 		}
1327 		if (!ret) {
1328 			if (val < 0 || val > 1)
1329 				ret = -EINVAL;
1330 			else
1331 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1332 		}
1333 		break;
1334 
1335 	case SO_TXTIME:
1336 		if (optlen != sizeof(struct sock_txtime)) {
1337 			ret = -EINVAL;
1338 			break;
1339 		} else if (copy_from_sockptr(&sk_txtime, optval,
1340 			   sizeof(struct sock_txtime))) {
1341 			ret = -EFAULT;
1342 			break;
1343 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1344 			ret = -EINVAL;
1345 			break;
1346 		}
1347 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1348 		 * scheduler has enough safeguards.
1349 		 */
1350 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1351 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1352 			ret = -EPERM;
1353 			break;
1354 		}
1355 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1356 		sk->sk_clockid = sk_txtime.clockid;
1357 		sk->sk_txtime_deadline_mode =
1358 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1359 		sk->sk_txtime_report_errors =
1360 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1361 		break;
1362 
1363 	case SO_BINDTOIFINDEX:
1364 		ret = sock_bindtoindex_locked(sk, val);
1365 		break;
1366 
1367 	case SO_BUF_LOCK:
1368 		if (val & ~SOCK_BUF_LOCK_MASK) {
1369 			ret = -EINVAL;
1370 			break;
1371 		}
1372 		sk->sk_userlocks = val | (sk->sk_userlocks &
1373 					  ~SOCK_BUF_LOCK_MASK);
1374 		break;
1375 
1376 	default:
1377 		ret = -ENOPROTOOPT;
1378 		break;
1379 	}
1380 	release_sock(sk);
1381 	return ret;
1382 }
1383 EXPORT_SYMBOL(sock_setsockopt);
1384 
1385 static const struct cred *sk_get_peer_cred(struct sock *sk)
1386 {
1387 	const struct cred *cred;
1388 
1389 	spin_lock(&sk->sk_peer_lock);
1390 	cred = get_cred(sk->sk_peer_cred);
1391 	spin_unlock(&sk->sk_peer_lock);
1392 
1393 	return cred;
1394 }
1395 
1396 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1397 			  struct ucred *ucred)
1398 {
1399 	ucred->pid = pid_vnr(pid);
1400 	ucred->uid = ucred->gid = -1;
1401 	if (cred) {
1402 		struct user_namespace *current_ns = current_user_ns();
1403 
1404 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1405 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1406 	}
1407 }
1408 
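/* Copy the supplementary group list @src to the userspace array @dst,
 * translating each gid into the current user namespace.
 */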
1409 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1410 {
1411 	struct user_namespace *user_ns = current_user_ns();
1412 	int i;
1413 
1414 	for (i = 0; i < src->ngroups; i++)
1415 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1416 			return -EFAULT;
1417 
1418 	return 0;
1419 }
1420 
1421 int sock_getsockopt(struct socket *sock, int level, int optname,
1422 		    char __user *optval, int __user *optlen)
1423 {
1424 	struct sock *sk = sock->sk;
1425 
1426 	union {
1427 		int val;
1428 		u64 val64;
1429 		unsigned long ulval;
1430 		struct linger ling;
1431 		struct old_timeval32 tm32;
1432 		struct __kernel_old_timeval tm;
1433 		struct  __kernel_sock_timeval stm;
1434 		struct sock_txtime txtime;
1435 		struct so_timestamping timestamping;
1436 	} v;
1437 
1438 	int lv = sizeof(int);
1439 	int len;
1440 
1441 	if (get_user(len, optlen))
1442 		return -EFAULT;
1443 	if (len < 0)
1444 		return -EINVAL;
1445 
1446 	memset(&v, 0, sizeof(v));
1447 
1448 	switch (optname) {
1449 	case SO_DEBUG:
1450 		v.val = sock_flag(sk, SOCK_DBG);
1451 		break;
1452 
1453 	case SO_DONTROUTE:
1454 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1455 		break;
1456 
1457 	case SO_BROADCAST:
1458 		v.val = sock_flag(sk, SOCK_BROADCAST);
1459 		break;
1460 
1461 	case SO_SNDBUF:
1462 		v.val = READ_ONCE(sk->sk_sndbuf);
1463 		break;
1464 
1465 	case SO_RCVBUF:
1466 		v.val = READ_ONCE(sk->sk_rcvbuf);
1467 		break;
1468 
1469 	case SO_REUSEADDR:
1470 		v.val = sk->sk_reuse;
1471 		break;
1472 
1473 	case SO_REUSEPORT:
1474 		v.val = sk->sk_reuseport;
1475 		break;
1476 
1477 	case SO_KEEPALIVE:
1478 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1479 		break;
1480 
1481 	case SO_TYPE:
1482 		v.val = sk->sk_type;
1483 		break;
1484 
1485 	case SO_PROTOCOL:
1486 		v.val = sk->sk_protocol;
1487 		break;
1488 
1489 	case SO_DOMAIN:
1490 		v.val = sk->sk_family;
1491 		break;
1492 
1493 	case SO_ERROR:
1494 		v.val = -sock_error(sk);
1495 		if (v.val == 0)
1496 			v.val = xchg(&sk->sk_err_soft, 0);
1497 		break;
1498 
1499 	case SO_OOBINLINE:
1500 		v.val = sock_flag(sk, SOCK_URGINLINE);
1501 		break;
1502 
1503 	case SO_NO_CHECK:
1504 		v.val = sk->sk_no_check_tx;
1505 		break;
1506 
1507 	case SO_PRIORITY:
1508 		v.val = sk->sk_priority;
1509 		break;
1510 
1511 	case SO_LINGER:
1512 		lv		= sizeof(v.ling);
1513 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1514 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1515 		break;
1516 
1517 	case SO_BSDCOMPAT:
1518 		break;
1519 
1520 	case SO_TIMESTAMP_OLD:
1521 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1522 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1523 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1524 		break;
1525 
1526 	case SO_TIMESTAMPNS_OLD:
1527 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1528 		break;
1529 
1530 	case SO_TIMESTAMP_NEW:
1531 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1532 		break;
1533 
1534 	case SO_TIMESTAMPNS_NEW:
1535 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1536 		break;
1537 
1538 	case SO_TIMESTAMPING_OLD:
1539 	case SO_TIMESTAMPING_NEW:
1540 		lv = sizeof(v.timestamping);
1541 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1542 		 * returning the flags when they were set through the same option.
1543 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1544 		 */
1545 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1546 			v.timestamping.flags = sk->sk_tsflags;
1547 			v.timestamping.bind_phc = sk->sk_bind_phc;
1548 		}
1549 		break;
1550 
1551 	case SO_RCVTIMEO_OLD:
1552 	case SO_RCVTIMEO_NEW:
1553 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1554 		break;
1555 
1556 	case SO_SNDTIMEO_OLD:
1557 	case SO_SNDTIMEO_NEW:
1558 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1559 		break;
1560 
1561 	case SO_RCVLOWAT:
1562 		v.val = READ_ONCE(sk->sk_rcvlowat);
1563 		break;
1564 
1565 	case SO_SNDLOWAT:
1566 		v.val = 1;
1567 		break;
1568 
1569 	case SO_PASSCRED:
1570 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1571 		break;
1572 
1573 	case SO_PEERCRED:
1574 	{
1575 		struct ucred peercred;
1576 		if (len > sizeof(peercred))
1577 			len = sizeof(peercred);
1578 
1579 		spin_lock(&sk->sk_peer_lock);
1580 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1581 		spin_unlock(&sk->sk_peer_lock);
1582 
1583 		if (copy_to_user(optval, &peercred, len))
1584 			return -EFAULT;
1585 		goto lenout;
1586 	}
1587 
1588 	case SO_PEERGROUPS:
1589 	{
1590 		const struct cred *cred;
1591 		int ret, n;
1592 
1593 		cred = sk_get_peer_cred(sk);
1594 		if (!cred)
1595 			return -ENODATA;
1596 
1597 		n = cred->group_info->ngroups;
1598 		if (len < n * sizeof(gid_t)) {
1599 			len = n * sizeof(gid_t);
1600 			put_cred(cred);
1601 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1602 		}
1603 		len = n * sizeof(gid_t);
1604 
1605 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1606 		put_cred(cred);
1607 		if (ret)
1608 			return ret;
1609 		goto lenout;
1610 	}
1611 
1612 	case SO_PEERNAME:
1613 	{
1614 		char address[128];
1615 
1616 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1617 		if (lv < 0)
1618 			return -ENOTCONN;
1619 		if (lv < len)
1620 			return -EINVAL;
1621 		if (copy_to_user(optval, address, len))
1622 			return -EFAULT;
1623 		goto lenout;
1624 	}
1625 
1626 	/* Dubious BSD thing... Probably nobody even uses it, but
1627 	 * the UNIX standard wants it for whatever reason... -DaveM
1628 	 */
1629 	case SO_ACCEPTCONN:
1630 		v.val = sk->sk_state == TCP_LISTEN;
1631 		break;
1632 
1633 	case SO_PASSSEC:
1634 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1635 		break;
1636 
1637 	case SO_PEERSEC:
1638 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1639 
1640 	case SO_MARK:
1641 		v.val = sk->sk_mark;
1642 		break;
1643 
1644 	case SO_RXQ_OVFL:
1645 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1646 		break;
1647 
1648 	case SO_WIFI_STATUS:
1649 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1650 		break;
1651 
1652 	case SO_PEEK_OFF:
1653 		if (!sock->ops->set_peek_off)
1654 			return -EOPNOTSUPP;
1655 
1656 		v.val = READ_ONCE(sk->sk_peek_off);
1657 		break;
1658 	case SO_NOFCS:
1659 		v.val = sock_flag(sk, SOCK_NOFCS);
1660 		break;
1661 
1662 	case SO_BINDTODEVICE:
1663 		return sock_getbindtodevice(sk, optval, optlen, len);
1664 
1665 	case SO_GET_FILTER:
1666 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1667 		if (len < 0)
1668 			return len;
1669 
1670 		goto lenout;
1671 
1672 	case SO_LOCK_FILTER:
1673 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1674 		break;
1675 
1676 	case SO_BPF_EXTENSIONS:
1677 		v.val = bpf_tell_extensions();
1678 		break;
1679 
1680 	case SO_SELECT_ERR_QUEUE:
1681 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1682 		break;
1683 
1684 #ifdef CONFIG_NET_RX_BUSY_POLL
1685 	case SO_BUSY_POLL:
1686 		v.val = READ_ONCE(sk->sk_ll_usec);
1687 		break;
1688 	case SO_PREFER_BUSY_POLL:
1689 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1690 		break;
1691 #endif
1692 
1693 	case SO_MAX_PACING_RATE:
1694 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sock_setsockopt() */
1695 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1696 			lv = sizeof(v.ulval);
1697 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1698 		} else {
1699 			/* 32bit version */
1700 			v.val = min_t(unsigned long, ~0U,
1701 				      READ_ONCE(sk->sk_max_pacing_rate));
1702 		}
1703 		break;
1704 
1705 	case SO_INCOMING_CPU:
1706 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1707 		break;
1708 
1709 	case SO_MEMINFO:
1710 	{
1711 		u32 meminfo[SK_MEMINFO_VARS];
1712 
1713 		sk_get_meminfo(sk, meminfo);
1714 
1715 		len = min_t(unsigned int, len, sizeof(meminfo));
1716 		if (copy_to_user(optval, &meminfo, len))
1717 			return -EFAULT;
1718 
1719 		goto lenout;
1720 	}
1721 
1722 #ifdef CONFIG_NET_RX_BUSY_POLL
1723 	case SO_INCOMING_NAPI_ID:
1724 		v.val = READ_ONCE(sk->sk_napi_id);
1725 
1726 		/* aggregate non-NAPI IDs down to 0 */
1727 		if (v.val < MIN_NAPI_ID)
1728 			v.val = 0;
1729 
1730 		break;
1731 #endif
1732 
1733 	case SO_COOKIE:
1734 		lv = sizeof(u64);
1735 		if (len < lv)
1736 			return -EINVAL;
1737 		v.val64 = sock_gen_cookie(sk);
1738 		break;
1739 
1740 	case SO_ZEROCOPY:
1741 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1742 		break;
1743 
1744 	case SO_TXTIME:
1745 		lv = sizeof(v.txtime);
1746 		v.txtime.clockid = sk->sk_clockid;
1747 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1748 				  SOF_TXTIME_DEADLINE_MODE : 0;
1749 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1750 				  SOF_TXTIME_REPORT_ERRORS : 0;
1751 		break;
1752 
1753 	case SO_BINDTOIFINDEX:
1754 		v.val = sk->sk_bound_dev_if;
1755 		break;
1756 
1757 	case SO_NETNS_COOKIE:
1758 		lv = sizeof(u64);
1759 		if (len != lv)
1760 			return -EINVAL;
1761 		v.val64 = sock_net(sk)->net_cookie;
1762 		break;
1763 
1764 	case SO_BUF_LOCK:
1765 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1766 		break;
1767 
1768 	default:
1769 		/* We implement the SO_SNDLOWAT etc to not be settable
1770 		 * (1003.1g 7).
1771 		 */
1772 		return -ENOPROTOOPT;
1773 	}
1774 
1775 	if (len > lv)
1776 		len = lv;
1777 	if (copy_to_user(optval, &v, len))
1778 		return -EFAULT;
1779 lenout:
1780 	if (put_user(len, optlen))
1781 		return -EFAULT;
1782 	return 0;
1783 }
1784 
1785 /*
1786  * Initialize an sk_lock.
1787  *
1788  * (We also register the sk_lock with the lock validator.)
1789  */
1790 static inline void sock_lock_init(struct sock *sk)
1791 {
1792 	if (sk->sk_kern_sock)
1793 		sock_lock_init_class_and_name(
1794 			sk,
1795 			af_family_kern_slock_key_strings[sk->sk_family],
1796 			af_family_kern_slock_keys + sk->sk_family,
1797 			af_family_kern_key_strings[sk->sk_family],
1798 			af_family_kern_keys + sk->sk_family);
1799 	else
1800 		sock_lock_init_class_and_name(
1801 			sk,
1802 			af_family_slock_key_strings[sk->sk_family],
1803 			af_family_slock_keys + sk->sk_family,
1804 			af_family_key_strings[sk->sk_family],
1805 			af_family_keys + sk->sk_family);
1806 }
1807 
1808 /*
1809  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1810  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1811  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1812  */
1813 static void sock_copy(struct sock *nsk, const struct sock *osk)
1814 {
1815 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1816 #ifdef CONFIG_SECURITY_NETWORK
1817 	void *sptr = nsk->sk_security;
1818 #endif
1819 
1820 	/* If we move sk_tx_queue_mapping out of the private section,
1821 	 * we must check if sk_tx_queue_clear() is called after
1822 	 * sock_copy() in sk_clone_lock().
1823 	 */
1824 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1825 		     offsetof(struct sock, sk_dontcopy_begin) ||
1826 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1827 		     offsetof(struct sock, sk_dontcopy_end));
1828 
1829 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1830 
1831 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1832 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1833 
1834 #ifdef CONFIG_SECURITY_NETWORK
1835 	nsk->sk_security = sptr;
1836 	security_sk_clone(osk, nsk);
1837 #endif
1838 }
1839 
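/* Allocate a struct sock either from the protocol's dedicated slab cache or,
 * if the protocol has none, with kmalloc(). Runs the security_sk_alloc()
 * hook and takes a reference on the protocol module; on any failure the
 * allocation is undone and NULL is returned.
 */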
1840 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1841 		int family)
1842 {
1843 	struct sock *sk;
1844 	struct kmem_cache *slab;
1845 
1846 	slab = prot->slab;
1847 	if (slab != NULL) {
1848 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1849 		if (!sk)
1850 			return sk;
1851 		if (want_init_on_alloc(priority))
1852 			sk_prot_clear_nulls(sk, prot->obj_size);
1853 	} else
1854 		sk = kmalloc(prot->obj_size, priority);
1855 
1856 	if (sk != NULL) {
1857 		if (security_sk_alloc(sk, family, priority))
1858 			goto out_free;
1859 
1860 		trace_android_rvh_sk_alloc(sk);
1861 
1862 		if (!try_module_get(prot->owner))
1863 			goto out_free_sec;
1864 	}
1865 
1866 	return sk;
1867 
1868 out_free_sec:
1869 	security_sk_free(sk);
1870 	trace_android_rvh_sk_free(sk);
1871 out_free:
1872 	if (slab != NULL)
1873 		kmem_cache_free(slab, sk);
1874 	else
1875 		kfree(sk);
1876 	return NULL;
1877 }
1878 
1879 static void sk_prot_free(struct proto *prot, struct sock *sk)
1880 {
1881 	struct kmem_cache *slab;
1882 	struct module *owner;
1883 
1884 	owner = prot->owner;
1885 	slab = prot->slab;
1886 
1887 	cgroup_sk_free(&sk->sk_cgrp_data);
1888 	mem_cgroup_sk_free(sk);
1889 	security_sk_free(sk);
1890 	trace_android_rvh_sk_free(sk);
1891 	if (slab != NULL)
1892 		kmem_cache_free(slab, sk);
1893 	else
1894 		kfree(sk);
1895 	module_put(owner);
1896 }
1897 
1898 /**
1899  *	sk_alloc - All socket objects are allocated here
1900  *	@net: the applicable net namespace
1901  *	@family: protocol family
1902  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1903  *	@prot: struct proto associated with this new sock instance
1904  *	@kern: is this to be a kernel socket?
1905  */
1906 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1907 		      struct proto *prot, int kern)
1908 {
1909 	struct sock *sk;
1910 
1911 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1912 	if (sk) {
1913 		sk->sk_family = family;
1914 		/*
1915 		 * See comment in struct sock definition to understand
1916 		 * why we need sk_prot_creator -acme
1917 		 */
1918 		sk->sk_prot = sk->sk_prot_creator = prot;
1919 		sk->sk_kern_sock = kern;
1920 		sock_lock_init(sk);
1921 		sk->sk_net_refcnt = kern ? 0 : 1;
1922 		if (likely(sk->sk_net_refcnt)) {
1923 			get_net(net);
1924 			sock_inuse_add(net, 1);
1925 		}
1926 
1927 		sock_net_set(sk, net);
1928 		refcount_set(&sk->sk_wmem_alloc, 1);
1929 
1930 		mem_cgroup_sk_alloc(sk);
1931 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1932 		sock_update_classid(&sk->sk_cgrp_data);
1933 		sock_update_netprioidx(&sk->sk_cgrp_data);
1934 		sk_tx_queue_clear(sk);
1935 	}
1936 
1937 	return sk;
1938 }
1939 EXPORT_SYMBOL(sk_alloc);
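/*
 * Usage sketch (not part of this file): a protocol family's ->create()
 * handler typically pairs sk_alloc() with sock_init_data(); "foo_proto"
 * below is a hypothetical struct proto.
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &foo_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);	(attaches sk to @sock and installs the
 *					 default callbacks defined later in
 *					 this file)
 */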
1940 
1941 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1942  * grace period. This is the case for UDP sockets and TCP listeners.
1943  */
1944 static void __sk_destruct(struct rcu_head *head)
1945 {
1946 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1947 	struct sk_filter *filter;
1948 
1949 	if (sk->sk_destruct)
1950 		sk->sk_destruct(sk);
1951 
1952 	filter = rcu_dereference_check(sk->sk_filter,
1953 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1954 	if (filter) {
1955 		sk_filter_uncharge(sk, filter);
1956 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1957 	}
1958 
1959 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1960 
1961 #ifdef CONFIG_BPF_SYSCALL
1962 	bpf_sk_storage_free(sk);
1963 #endif
1964 
1965 	if (atomic_read(&sk->sk_omem_alloc))
1966 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1967 			 __func__, atomic_read(&sk->sk_omem_alloc));
1968 
1969 	if (sk->sk_frag.page) {
1970 		put_page(sk->sk_frag.page);
1971 		sk->sk_frag.page = NULL;
1972 	}
1973 
1974 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1975 	put_cred(sk->sk_peer_cred);
1976 	put_pid(sk->sk_peer_pid);
1977 
1978 	if (likely(sk->sk_net_refcnt))
1979 		put_net(sock_net(sk));
1980 	sk_prot_free(sk->sk_prot_creator, sk);
1981 }
1982 
1983 void sk_destruct(struct sock *sk)
1984 {
1985 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1986 
1987 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1988 		reuseport_detach_sock(sk);
1989 		use_call_rcu = true;
1990 	}
1991 
1992 	if (use_call_rcu)
1993 		call_rcu(&sk->sk_rcu, __sk_destruct);
1994 	else
1995 		__sk_destruct(&sk->sk_rcu);
1996 }
1997 
1998 static void __sk_free(struct sock *sk)
1999 {
2000 	if (likely(sk->sk_net_refcnt))
2001 		sock_inuse_add(sock_net(sk), -1);
2002 
2003 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2004 		sock_diag_broadcast_destroy(sk);
2005 	else
2006 		sk_destruct(sk);
2007 }
2008 
2009 void sk_free(struct sock *sk)
2010 {
2011 	/*
2012 	 * We subtract one from sk_wmem_alloc so we can tell whether
2013 	 * some packets are still in some tx queue.
2014 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
2015 	 */
2016 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2017 		__sk_free(sk);
2018 }
2019 EXPORT_SYMBOL(sk_free);
2020 
2021 static void sk_init_common(struct sock *sk)
2022 {
2023 	skb_queue_head_init(&sk->sk_receive_queue);
2024 	skb_queue_head_init(&sk->sk_write_queue);
2025 	skb_queue_head_init(&sk->sk_error_queue);
2026 
2027 	rwlock_init(&sk->sk_callback_lock);
2028 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2029 			af_rlock_keys + sk->sk_family,
2030 			af_family_rlock_key_strings[sk->sk_family]);
2031 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2032 			af_wlock_keys + sk->sk_family,
2033 			af_family_wlock_key_strings[sk->sk_family]);
2034 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2035 			af_elock_keys + sk->sk_family,
2036 			af_family_elock_key_strings[sk->sk_family]);
2037 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2038 			af_callback_keys + sk->sk_family,
2039 			af_family_clock_key_strings[sk->sk_family]);
2040 }
2041 
2042 /**
2043  *	sk_clone_lock - clone a socket, and lock its clone
2044  *	@sk: the socket to clone
2045  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2046  *
2047  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2048  */
2049 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2050 {
2051 	struct proto *prot = READ_ONCE(sk->sk_prot);
2052 	struct sk_filter *filter;
2053 	bool is_charged = true;
2054 	struct sock *newsk;
2055 
2056 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2057 	if (!newsk)
2058 		goto out;
2059 
2060 	sock_copy(newsk, sk);
2061 
2062 	newsk->sk_prot_creator = prot;
2063 
2064 	/* SANITY */
2065 	if (likely(newsk->sk_net_refcnt)) {
2066 		get_net(sock_net(newsk));
2067 		sock_inuse_add(sock_net(newsk), 1);
2068 	}
2069 	sk_node_init(&newsk->sk_node);
2070 	sock_lock_init(newsk);
2071 	bh_lock_sock(newsk);
2072 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2073 	newsk->sk_backlog.len = 0;
2074 
2075 	atomic_set(&newsk->sk_rmem_alloc, 0);
2076 
2077 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2078 	refcount_set(&newsk->sk_wmem_alloc, 1);
2079 
2080 	atomic_set(&newsk->sk_omem_alloc, 0);
2081 	sk_init_common(newsk);
2082 
2083 	newsk->sk_dst_cache	= NULL;
2084 	newsk->sk_dst_pending_confirm = 0;
2085 	newsk->sk_wmem_queued	= 0;
2086 	newsk->sk_forward_alloc = 0;
2087 	atomic_set(&newsk->sk_drops, 0);
2088 	newsk->sk_send_head	= NULL;
2089 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2090 	atomic_set(&newsk->sk_zckey, 0);
2091 
2092 	sock_reset_flag(newsk, SOCK_DONE);
2093 
2094 	/* sk->sk_memcg will be populated at accept() time */
2095 	newsk->sk_memcg = NULL;
2096 
2097 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2098 
2099 	rcu_read_lock();
2100 	filter = rcu_dereference(sk->sk_filter);
2101 	if (filter != NULL)
2102 		/* though it's an empty new sock, the charging may fail
2103 		 * if sysctl_optmem_max was changed between creation of
2104 		 * original socket and cloning
2105 		 */
2106 		is_charged = sk_filter_charge(newsk, filter);
2107 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2108 	rcu_read_unlock();
2109 
2110 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2111 		/* We need to make sure that we don't uncharge the new
2112 		 * socket if we couldn't charge it in the first place
2113 		 * as otherwise we uncharge the parent's filter.
2114 		 */
2115 		if (!is_charged)
2116 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2117 		sk_free_unlock_clone(newsk);
2118 		newsk = NULL;
2119 		goto out;
2120 	}
2121 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2122 
2123 	if (bpf_sk_storage_clone(sk, newsk)) {
2124 		sk_free_unlock_clone(newsk);
2125 		newsk = NULL;
2126 		goto out;
2127 	}
2128 
2129 	/* Clear sk_user_data if parent had the pointer tagged
2130 	 * as not suitable for copying when cloning.
2131 	 */
2132 	if (sk_user_data_is_nocopy(newsk))
2133 		newsk->sk_user_data = NULL;
2134 
2135 	newsk->sk_err	   = 0;
2136 	newsk->sk_err_soft = 0;
2137 	newsk->sk_priority = 0;
2138 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2139 
2140 	/* Before updating sk_refcnt, we must commit prior changes to memory
2141 	 * (Documentation/RCU/rculist_nulls.rst for details)
2142 	 */
2143 	smp_wmb();
2144 	refcount_set(&newsk->sk_refcnt, 2);
2145 
2146 	/* Increment the counter in the same struct proto as the master
2147 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2148 	 * is the same as sk->sk_prot->socks, as this field was copied
2149 	 * with memcpy).
2150 	 *
2151 	 * This _changes_ the previous behaviour, where
2152 	 * tcp_create_openreq_child always was incrementing the
2153 	 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
2154 	 * to be taken into account in all callers. -acme
2155 	 */
2156 	sk_refcnt_debug_inc(newsk);
2157 	sk_set_socket(newsk, NULL);
2158 	sk_tx_queue_clear(newsk);
2159 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2160 
2161 	if (newsk->sk_prot->sockets_allocated)
2162 		sk_sockets_allocated_inc(newsk);
2163 
2164 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2165 		net_enable_timestamp();
2166 out:
2167 	return newsk;
2168 }
2169 EXPORT_SYMBOL_GPL(sk_clone_lock);
2170 
2171 void sk_free_unlock_clone(struct sock *sk)
2172 {
2173 	/* It is still a raw copy of the parent, so invalidate
2174 	 * the destructor and do a plain sk_free() */
2175 	sk->sk_destruct = NULL;
2176 	bh_unlock_sock(sk);
2177 	sk_free(sk);
2178 }
2179 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2180 
2181 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2182 {
2183 	u32 max_segs = 1;
2184 
2185 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2186 	if (sk->sk_route_caps & NETIF_F_GSO)
2187 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2188 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2189 	if (sk_can_gso(sk)) {
2190 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2191 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2192 		} else {
2193 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2194 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2195 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2196 		}
2197 	}
2198 	sk->sk_gso_max_segs = max_segs;
2199 	sk_dst_set(sk, dst);
2200 }
2201 EXPORT_SYMBOL_GPL(sk_setup_caps);
2202 
2203 /*
2204  *	Simple resource managers for sockets.
2205  */
2206 
2207 
2208 /*
2209  * Write buffer destructor automatically called from kfree_skb.
2210  */
2211 void sock_wfree(struct sk_buff *skb)
2212 {
2213 	struct sock *sk = skb->sk;
2214 	unsigned int len = skb->truesize;
2215 
2216 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2217 		/*
2218 		 * Keep a reference on sk_wmem_alloc, this will be released
2219 		 * after sk_write_space() call
2220 		 */
2221 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2222 		sk->sk_write_space(sk);
2223 		len = 1;
2224 	}
2225 	/*
2226 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2227 	 * could not do because of in-flight packets
2228 	 */
2229 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2230 		__sk_free(sk);
2231 }
2232 EXPORT_SYMBOL(sock_wfree);
2233 
2234 /* This variant of sock_wfree() is used by TCP,
2235  * since it sets SOCK_USE_WRITE_QUEUE.
2236  */
2237 void __sock_wfree(struct sk_buff *skb)
2238 {
2239 	struct sock *sk = skb->sk;
2240 
2241 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2242 		__sk_free(sk);
2243 }
2244 
2245 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2246 {
2247 	skb_orphan(skb);
2248 	skb->sk = sk;
2249 #ifdef CONFIG_INET
2250 	if (unlikely(!sk_fullsock(sk))) {
2251 		skb->destructor = sock_edemux;
2252 		sock_hold(sk);
2253 		return;
2254 	}
2255 #endif
2256 	skb->destructor = sock_wfree;
2257 	skb_set_hash_from_sk(skb, sk);
2258 	/*
2259 	 * We used to take a refcount on sk, but the following operation
2260 	 * is enough to guarantee sk_free() won't free this sock until
2261 	 * all in-flight packets are completed
2262 	 */
2263 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2264 }
2265 EXPORT_SYMBOL(skb_set_owner_w);
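/*
 * Accounting sketch (not part of this file): once an skb is charged to a
 * socket's write budget, freeing it runs sock_wfree() and undoes the charge.
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);	(adds skb->truesize to
 *						 sk->sk_wmem_alloc)
 *	...
 *	kfree_skb(skb);				(destructor sock_wfree drops the
 *						 charge, possibly freeing sk)
 */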
2266 
2267 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2268 {
2269 #ifdef CONFIG_TLS_DEVICE
2270 	/* Drivers depend on in-order delivery for crypto offload,
2271 	 * partial orphan breaks out-of-order-OK logic.
2272 	 */
2273 	if (skb->decrypted)
2274 		return false;
2275 #endif
2276 	return (skb->destructor == sock_wfree ||
2277 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2278 }
2279 
2280 /* This helper is used by netem, as it can hold packets in its
2281  * delay queue. We want to allow the owner socket to send more
2282  * packets, as if they were already TX completed by a typical driver.
2283  * But we also want to keep skb->sk set because some packet schedulers
2284  * rely on it (sch_fq for example).
2285  */
2286 void skb_orphan_partial(struct sk_buff *skb)
2287 {
2288 	if (skb_is_tcp_pure_ack(skb))
2289 		return;
2290 
2291 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2292 		return;
2293 
2294 	skb_orphan(skb);
2295 }
2296 EXPORT_SYMBOL(skb_orphan_partial);
2297 
2298 /*
2299  * Read buffer destructor automatically called from kfree_skb.
2300  */
2301 void sock_rfree(struct sk_buff *skb)
2302 {
2303 	struct sock *sk = skb->sk;
2304 	unsigned int len = skb->truesize;
2305 
2306 	atomic_sub(len, &sk->sk_rmem_alloc);
2307 	sk_mem_uncharge(sk, len);
2308 }
2309 EXPORT_SYMBOL(sock_rfree);
2310 
2311 /*
2312  * Buffer destructor for skbs that are not used directly in read or write
2313  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2314  */
2315 void sock_efree(struct sk_buff *skb)
2316 {
2317 	sock_put(skb->sk);
2318 }
2319 EXPORT_SYMBOL(sock_efree);
2320 
2321 /* Buffer destructor for prefetch/receive path where reference count may
2322  * not be held, e.g. for listen sockets.
2323  */
2324 #ifdef CONFIG_INET
2325 void sock_pfree(struct sk_buff *skb)
2326 {
2327 	if (sk_is_refcounted(skb->sk))
2328 		sock_gen_put(skb->sk);
2329 }
2330 EXPORT_SYMBOL(sock_pfree);
2331 #endif /* CONFIG_INET */
2332 
2333 kuid_t sock_i_uid(struct sock *sk)
2334 {
2335 	kuid_t uid;
2336 
2337 	read_lock_bh(&sk->sk_callback_lock);
2338 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2339 	read_unlock_bh(&sk->sk_callback_lock);
2340 	return uid;
2341 }
2342 EXPORT_SYMBOL(sock_i_uid);
2343 
2344 unsigned long __sock_i_ino(struct sock *sk)
2345 {
2346 	unsigned long ino;
2347 
2348 	read_lock(&sk->sk_callback_lock);
2349 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2350 	read_unlock(&sk->sk_callback_lock);
2351 	return ino;
2352 }
2353 EXPORT_SYMBOL(__sock_i_ino);
2354 
2355 unsigned long sock_i_ino(struct sock *sk)
2356 {
2357 	unsigned long ino;
2358 
2359 	local_bh_disable();
2360 	ino = __sock_i_ino(sk);
2361 	local_bh_enable();
2362 	return ino;
2363 }
2364 EXPORT_SYMBOL(sock_i_ino);
2365 
2366 /*
2367  * Allocate a skb from the socket's send buffer.
2368  */
2369 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2370 			     gfp_t priority)
2371 {
2372 	if (force ||
2373 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2374 		struct sk_buff *skb = alloc_skb(size, priority);
2375 
2376 		if (skb) {
2377 			skb_set_owner_w(skb, sk);
2378 			return skb;
2379 		}
2380 	}
2381 	return NULL;
2382 }
2383 EXPORT_SYMBOL(sock_wmalloc);
2384 
2385 static void sock_ofree(struct sk_buff *skb)
2386 {
2387 	struct sock *sk = skb->sk;
2388 
2389 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2390 }
2391 
2392 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2393 			     gfp_t priority)
2394 {
2395 	struct sk_buff *skb;
2396 
2397 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2398 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2399 	    READ_ONCE(sysctl_optmem_max))
2400 		return NULL;
2401 
2402 	skb = alloc_skb(size, priority);
2403 	if (!skb)
2404 		return NULL;
2405 
2406 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2407 	skb->sk = sk;
2408 	skb->destructor = sock_ofree;
2409 	return skb;
2410 }
2411 
2412 /*
2413  * Allocate a memory block from the socket's option memory buffer.
2414  */
2415 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2416 {
2417 	int optmem_max = READ_ONCE(sysctl_optmem_max);
2418 
2419 	if ((unsigned int)size <= optmem_max &&
2420 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2421 		void *mem;
2422 		/* First do the add, to avoid the race if kmalloc
2423 		 * might sleep.
2424 		 */
2425 		atomic_add(size, &sk->sk_omem_alloc);
2426 		mem = kmalloc(size, priority);
2427 		if (mem)
2428 			return mem;
2429 		atomic_sub(size, &sk->sk_omem_alloc);
2430 	}
2431 	return NULL;
2432 }
2433 EXPORT_SYMBOL(sock_kmalloc);
2434 
2435 /* Free an option memory block. Note, we actually want the inline
2436  * here as this allows gcc to detect the nullify and fold away the
2437  * condition entirely.
2438  */
2439 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2440 				  const bool nullify)
2441 {
2442 	if (WARN_ON_ONCE(!mem))
2443 		return;
2444 	if (nullify)
2445 		kfree_sensitive(mem);
2446 	else
2447 		kfree(mem);
2448 	atomic_sub(size, &sk->sk_omem_alloc);
2449 }
2450 
2451 void sock_kfree_s(struct sock *sk, void *mem, int size)
2452 {
2453 	__sock_kfree_s(sk, mem, size, false);
2454 }
2455 EXPORT_SYMBOL(sock_kfree_s);
2456 
2457 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2458 {
2459 	__sock_kfree_s(sk, mem, size, true);
2460 }
2461 EXPORT_SYMBOL(sock_kzfree_s);
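/*
 * Typical pairing (a sketch, not part of this file): memory from
 * sock_kmalloc() is accounted in sk_omem_alloc and must be released with
 * sock_kfree_s()/sock_kzfree_s() using the same size.
 *
 *	key = sock_kmalloc(sk, keylen, GFP_KERNEL);
 *	if (!key)
 *		return -ENOBUFS;
 *	...
 *	sock_kzfree_s(sk, key, keylen);		(kzfree variant for secrets)
 */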
2462 
2463 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2464    I think these locks should be removed for datagram sockets.
2465  */
2466 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2467 {
2468 	DEFINE_WAIT(wait);
2469 
2470 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2471 	for (;;) {
2472 		if (!timeo)
2473 			break;
2474 		if (signal_pending(current))
2475 			break;
2476 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2477 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2478 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2479 			break;
2480 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2481 			break;
2482 		if (READ_ONCE(sk->sk_err))
2483 			break;
2484 		timeo = schedule_timeout(timeo);
2485 	}
2486 	finish_wait(sk_sleep(sk), &wait);
2487 	return timeo;
2488 }
2489 
2490 
2491 /*
2492  *	Generic send/receive buffer handlers
2493  */
2494 
2495 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2496 				     unsigned long data_len, int noblock,
2497 				     int *errcode, int max_page_order)
2498 {
2499 	struct sk_buff *skb;
2500 	long timeo;
2501 	int err;
2502 
2503 	timeo = sock_sndtimeo(sk, noblock);
2504 	for (;;) {
2505 		err = sock_error(sk);
2506 		if (err != 0)
2507 			goto failure;
2508 
2509 		err = -EPIPE;
2510 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2511 			goto failure;
2512 
2513 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2514 			break;
2515 
2516 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2517 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2518 		err = -EAGAIN;
2519 		if (!timeo)
2520 			goto failure;
2521 		if (signal_pending(current))
2522 			goto interrupted;
2523 		timeo = sock_wait_for_wmem(sk, timeo);
2524 	}
2525 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2526 				   errcode, sk->sk_allocation);
2527 	if (skb)
2528 		skb_set_owner_w(skb, sk);
2529 	return skb;
2530 
2531 interrupted:
2532 	err = sock_intr_errno(timeo);
2533 failure:
2534 	*errcode = err;
2535 	return NULL;
2536 }
2537 EXPORT_SYMBOL(sock_alloc_send_pskb);
2538 
2539 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2540 				    int noblock, int *errcode)
2541 {
2542 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2543 }
2544 EXPORT_SYMBOL(sock_alloc_send_skb);
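/*
 * Usage sketch (hypothetical datagram sendmsg path, not part of this file):
 * the helper blocks up to the send timeout unless MSG_DONTWAIT is set, and
 * returns the skb already charged via skb_set_owner_w().
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 */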
2545 
2546 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2547 		     struct sockcm_cookie *sockc)
2548 {
2549 	u32 tsflags;
2550 
2551 	switch (cmsg->cmsg_type) {
2552 	case SO_MARK:
2553 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2554 			return -EPERM;
2555 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2556 			return -EINVAL;
2557 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2558 		break;
2559 	case SO_TIMESTAMPING_OLD:
2560 	case SO_TIMESTAMPING_NEW:
2561 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2562 			return -EINVAL;
2563 
2564 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2565 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2566 			return -EINVAL;
2567 
2568 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2569 		sockc->tsflags |= tsflags;
2570 		break;
2571 	case SCM_TXTIME:
2572 		if (!sock_flag(sk, SOCK_TXTIME))
2573 			return -EINVAL;
2574 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2575 			return -EINVAL;
2576 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2577 		break;
2578 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2579 	case SCM_RIGHTS:
2580 	case SCM_CREDENTIALS:
2581 		break;
2582 	default:
2583 		return -EINVAL;
2584 	}
2585 	return 0;
2586 }
2587 EXPORT_SYMBOL(__sock_cmsg_send);
2588 
2589 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2590 		   struct sockcm_cookie *sockc)
2591 {
2592 	struct cmsghdr *cmsg;
2593 	int ret;
2594 
2595 	for_each_cmsghdr(cmsg, msg) {
2596 		if (!CMSG_OK(msg, cmsg))
2597 			return -EINVAL;
2598 		if (cmsg->cmsg_level != SOL_SOCKET)
2599 			continue;
2600 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2601 		if (ret)
2602 			return ret;
2603 	}
2604 	return 0;
2605 }
2606 EXPORT_SYMBOL(sock_cmsg_send);
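/*
 * Usage sketch (hypothetical sendmsg path, not part of this file): seed the
 * cookie from the socket defaults, then let sock_cmsg_send() fold in any
 * SOL_SOCKET control messages (SO_MARK, SO_TIMESTAMPING, SCM_TXTIME).
 *
 *	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */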
2607 
2608 static void sk_enter_memory_pressure(struct sock *sk)
2609 {
2610 	if (!sk->sk_prot->enter_memory_pressure)
2611 		return;
2612 
2613 	sk->sk_prot->enter_memory_pressure(sk);
2614 }
2615 
2616 static void sk_leave_memory_pressure(struct sock *sk)
2617 {
2618 	if (sk->sk_prot->leave_memory_pressure) {
2619 		sk->sk_prot->leave_memory_pressure(sk);
2620 	} else {
2621 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2622 
2623 		if (memory_pressure && READ_ONCE(*memory_pressure))
2624 			WRITE_ONCE(*memory_pressure, 0);
2625 	}
2626 }
2627 
2628 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2629 
2630 /**
2631  * skb_page_frag_refill - check that a page_frag contains enough room
2632  * @sz: minimum size of the fragment we want to get
2633  * @pfrag: pointer to page_frag
2634  * @gfp: priority for memory allocation
2635  *
2636  * Note: While this allocator tries to use high order pages, there is
2637  * no guarantee that allocations succeed. Therefore, @sz MUST be
2638  * less than or equal to PAGE_SIZE.
2639  */
2640 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2641 {
2642 	if (pfrag->page) {
2643 		if (page_ref_count(pfrag->page) == 1) {
2644 			pfrag->offset = 0;
2645 			return true;
2646 		}
2647 		if (pfrag->offset + sz <= pfrag->size)
2648 			return true;
2649 		put_page(pfrag->page);
2650 	}
2651 
2652 	pfrag->offset = 0;
2653 	if (SKB_FRAG_PAGE_ORDER &&
2654 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2655 		/* Avoid direct reclaim but allow kswapd to wake */
2656 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2657 					  __GFP_COMP | __GFP_NOWARN |
2658 					  __GFP_NORETRY,
2659 					  SKB_FRAG_PAGE_ORDER);
2660 		if (likely(pfrag->page)) {
2661 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2662 			return true;
2663 		}
2664 	}
2665 	pfrag->page = alloc_page(gfp);
2666 	if (likely(pfrag->page)) {
2667 		pfrag->size = PAGE_SIZE;
2668 		return true;
2669 	}
2670 	return false;
2671 }
2672 EXPORT_SYMBOL(skb_page_frag_refill);
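/*
 * Usage sketch (not part of this file): callers make sure @copy bytes fit,
 * copy into the current page, then advance the offset; @sz must not exceed
 * PAGE_SIZE as noted above.
 *
 *	struct page_frag *pfrag = &sk->sk_frag;
 *
 *	if (!skb_page_frag_refill(copy, pfrag, sk->sk_allocation))
 *		return -ENOMEM;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
 *	pfrag->offset += copy;
 */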
2673 
2674 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2675 {
2676 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2677 		return true;
2678 
2679 	sk_enter_memory_pressure(sk);
2680 	sk_stream_moderate_sndbuf(sk);
2681 	return false;
2682 }
2683 EXPORT_SYMBOL(sk_page_frag_refill);
2684 
2685 void __lock_sock(struct sock *sk)
2686 	__releases(&sk->sk_lock.slock)
2687 	__acquires(&sk->sk_lock.slock)
2688 {
2689 	DEFINE_WAIT(wait);
2690 
2691 	for (;;) {
2692 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2693 					TASK_UNINTERRUPTIBLE);
2694 		spin_unlock_bh(&sk->sk_lock.slock);
2695 		schedule();
2696 		spin_lock_bh(&sk->sk_lock.slock);
2697 		if (!sock_owned_by_user(sk))
2698 			break;
2699 	}
2700 	finish_wait(&sk->sk_lock.wq, &wait);
2701 }
2702 
2703 void __release_sock(struct sock *sk)
2704 	__releases(&sk->sk_lock.slock)
2705 	__acquires(&sk->sk_lock.slock)
2706 {
2707 	struct sk_buff *skb, *next;
2708 
2709 	while ((skb = sk->sk_backlog.head) != NULL) {
2710 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2711 
2712 		spin_unlock_bh(&sk->sk_lock.slock);
2713 
2714 		do {
2715 			next = skb->next;
2716 			prefetch(next);
2717 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2718 			skb_mark_not_on_list(skb);
2719 			sk_backlog_rcv(sk, skb);
2720 
2721 			cond_resched();
2722 
2723 			skb = next;
2724 		} while (skb != NULL);
2725 
2726 		spin_lock_bh(&sk->sk_lock.slock);
2727 	}
2728 
2729 	/*
2730 	 * Doing the zeroing here guarantees we cannot loop forever
2731 	 * while a wild producer attempts to flood us.
2732 	 */
2733 	sk->sk_backlog.len = 0;
2734 }
2735 
2736 void __sk_flush_backlog(struct sock *sk)
2737 {
2738 	spin_lock_bh(&sk->sk_lock.slock);
2739 	__release_sock(sk);
2740 	spin_unlock_bh(&sk->sk_lock.slock);
2741 }
2742 
2743 /**
2744  * sk_wait_data - wait for data to arrive at sk_receive_queue
2745  * @sk:    sock to wait on
2746  * @timeo: for how long
2747  * @skb:   last skb seen on sk_receive_queue
2748  *
2749  * Now socket state including sk->sk_err is changed only under lock,
2750  * hence we may omit checks after joining the wait queue.
2751  * We check the receive queue before schedule() only as an optimization;
2752  * it is very likely that release_sock() added new data.
2753  */
2754 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2755 {
2756 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2757 	int rc;
2758 
2759 	add_wait_queue(sk_sleep(sk), &wait);
2760 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2761 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2762 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2763 	remove_wait_queue(sk_sleep(sk), &wait);
2764 	return rc;
2765 }
2766 EXPORT_SYMBOL(sk_wait_data);
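/*
 * Usage sketch (hypothetical recvmsg path, not part of this file): called
 * with the socket lock held; sk_wait_data() drops it while sleeping inside
 * sk_wait_event() and re-acquires it before returning.  A real loop would
 * also check sock_error() and signal_pending().
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */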
2767 
2768 /**
2769  *	__sk_mem_raise_allocated - increase memory_allocated
2770  *	@sk: socket
2771  *	@size: memory size to allocate
2772  *	@amt: pages to allocate
2773  *	@kind: allocation type
2774  *
2775  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2776  */
2777 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2778 {
2779 	struct proto *prot = sk->sk_prot;
2780 	long allocated = sk_memory_allocated_add(sk, amt);
2781 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2782 	bool charged = true;
2783 
2784 	if (memcg_charge &&
2785 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2786 						gfp_memcg_charge())))
2787 		goto suppress_allocation;
2788 
2789 	/* Under limit. */
2790 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2791 		sk_leave_memory_pressure(sk);
2792 		return 1;
2793 	}
2794 
2795 	/* Under pressure. */
2796 	if (allocated > sk_prot_mem_limits(sk, 1))
2797 		sk_enter_memory_pressure(sk);
2798 
2799 	/* Over hard limit. */
2800 	if (allocated > sk_prot_mem_limits(sk, 2))
2801 		goto suppress_allocation;
2802 
2803 	/* guarantee minimum buffer size under pressure */
2804 	if (kind == SK_MEM_RECV) {
2805 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2806 			return 1;
2807 
2808 	} else { /* SK_MEM_SEND */
2809 		int wmem0 = sk_get_wmem0(sk, prot);
2810 
2811 		if (sk->sk_type == SOCK_STREAM) {
2812 			if (sk->sk_wmem_queued < wmem0)
2813 				return 1;
2814 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2815 				return 1;
2816 		}
2817 	}
2818 
2819 	if (sk_has_memory_pressure(sk)) {
2820 		u64 alloc;
2821 
2822 		if (!sk_under_memory_pressure(sk))
2823 			return 1;
2824 		alloc = sk_sockets_allocated_read_positive(sk);
2825 		if (sk_prot_mem_limits(sk, 2) > alloc *
2826 		    sk_mem_pages(sk->sk_wmem_queued +
2827 				 atomic_read(&sk->sk_rmem_alloc) +
2828 				 sk->sk_forward_alloc))
2829 			return 1;
2830 	}
2831 
2832 suppress_allocation:
2833 
2834 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2835 		sk_stream_moderate_sndbuf(sk);
2836 
2837 		/* Fail only if socket is _under_ its sndbuf.
2838 		 * In this case we cannot block, so we have to fail.
2839 		 */
2840 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2841 			/* Force charge with __GFP_NOFAIL */
2842 			if (memcg_charge && !charged) {
2843 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2844 					gfp_memcg_charge() | __GFP_NOFAIL);
2845 			}
2846 			return 1;
2847 		}
2848 	}
2849 
2850 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2851 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2852 
2853 	sk_memory_allocated_sub(sk, amt);
2854 
2855 	if (memcg_charge && charged)
2856 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2857 
2858 	return 0;
2859 }
2860 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2861 
2862 /**
2863  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2864  *	@sk: socket
2865  *	@size: memory size to allocate
2866  *	@kind: allocation type
2867  *
2868  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2869  *	rmem allocation. This function assumes that protocols which have
2870  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2871  */
2872 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2873 {
2874 	int ret, amt = sk_mem_pages(size);
2875 
2876 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2877 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2878 	if (!ret)
2879 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2880 	return ret;
2881 }
2882 EXPORT_SYMBOL(__sk_mem_schedule);
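/*
 * Sketch of the usual entry points (not part of this file): protocols rarely
 * call __sk_mem_schedule() directly; the receive path goes through
 * sk_rmem_schedule() and the send path through sk_wmem_schedule(), which only
 * fall back to this function when sk_forward_alloc runs short.
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		return -ENOBUFS;		(charge refused, drop or defer)
 *	skb_set_owner_r(skb, sk);		(accounts truesize to sk_rmem_alloc)
 */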
2883 
2884 /**
2885  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2886  *	@sk: socket
2887  *	@amount: number of quanta
2888  *
2889  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2890  */
2891 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2892 {
2893 	sk_memory_allocated_sub(sk, amount);
2894 
2895 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2896 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2897 
2898 	if (sk_under_global_memory_pressure(sk) &&
2899 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2900 		sk_leave_memory_pressure(sk);
2901 }
2902 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2903 
2904 /**
2905  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2906  *	@sk: socket
2907  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2908  */
2909 void __sk_mem_reclaim(struct sock *sk, int amount)
2910 {
2911 	amount >>= SK_MEM_QUANTUM_SHIFT;
2912 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2913 	__sk_mem_reduce_allocated(sk, amount);
2914 }
2915 EXPORT_SYMBOL(__sk_mem_reclaim);
2916 
2917 int sk_set_peek_off(struct sock *sk, int val)
2918 {
2919 	WRITE_ONCE(sk->sk_peek_off, val);
2920 	return 0;
2921 }
2922 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2923 
2924 /*
2925  * Set of default routines for initialising struct proto_ops when
2926  * the protocol does not support a particular function. In certain
2927  * cases where it makes no sense for a protocol to have a "do nothing"
2928  * function, some default processing is provided.
2929  */
2930 
2931 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2932 {
2933 	return -EOPNOTSUPP;
2934 }
2935 EXPORT_SYMBOL(sock_no_bind);
2936 
2937 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2938 		    int len, int flags)
2939 {
2940 	return -EOPNOTSUPP;
2941 }
2942 EXPORT_SYMBOL(sock_no_connect);
2943 
2944 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2945 {
2946 	return -EOPNOTSUPP;
2947 }
2948 EXPORT_SYMBOL(sock_no_socketpair);
2949 
2950 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2951 		   bool kern)
2952 {
2953 	return -EOPNOTSUPP;
2954 }
2955 EXPORT_SYMBOL(sock_no_accept);
2956 
2957 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2958 		    int peer)
2959 {
2960 	return -EOPNOTSUPP;
2961 }
2962 EXPORT_SYMBOL(sock_no_getname);
2963 
2964 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2965 {
2966 	return -EOPNOTSUPP;
2967 }
2968 EXPORT_SYMBOL(sock_no_ioctl);
2969 
2970 int sock_no_listen(struct socket *sock, int backlog)
2971 {
2972 	return -EOPNOTSUPP;
2973 }
2974 EXPORT_SYMBOL(sock_no_listen);
2975 
2976 int sock_no_shutdown(struct socket *sock, int how)
2977 {
2978 	return -EOPNOTSUPP;
2979 }
2980 EXPORT_SYMBOL(sock_no_shutdown);
2981 
2982 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2983 {
2984 	return -EOPNOTSUPP;
2985 }
2986 EXPORT_SYMBOL(sock_no_sendmsg);
2987 
2988 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2989 {
2990 	return -EOPNOTSUPP;
2991 }
2992 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2993 
2994 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2995 		    int flags)
2996 {
2997 	return -EOPNOTSUPP;
2998 }
2999 EXPORT_SYMBOL(sock_no_recvmsg);
3000 
3001 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3002 {
3003 	/* Mirror missing mmap method error code */
3004 	return -ENODEV;
3005 }
3006 EXPORT_SYMBOL(sock_no_mmap);
3007 
3008 /*
3009  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3010  * various sock-based usage counts.
3011  */
3012 void __receive_sock(struct file *file)
3013 {
3014 	struct socket *sock;
3015 
3016 	sock = sock_from_file(file);
3017 	if (sock) {
3018 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3019 		sock_update_classid(&sock->sk->sk_cgrp_data);
3020 	}
3021 }
3022 
3023 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3024 {
3025 	ssize_t res;
3026 	struct msghdr msg = {.msg_flags = flags};
3027 	struct kvec iov;
3028 	char *kaddr = kmap(page);
3029 	iov.iov_base = kaddr + offset;
3030 	iov.iov_len = size;
3031 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3032 	kunmap(page);
3033 	return res;
3034 }
3035 EXPORT_SYMBOL(sock_no_sendpage);
3036 
3037 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3038 				int offset, size_t size, int flags)
3039 {
3040 	ssize_t res;
3041 	struct msghdr msg = {.msg_flags = flags};
3042 	struct kvec iov;
3043 	char *kaddr = kmap(page);
3044 
3045 	iov.iov_base = kaddr + offset;
3046 	iov.iov_len = size;
3047 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3048 	kunmap(page);
3049 	return res;
3050 }
3051 EXPORT_SYMBOL(sock_no_sendpage_locked);
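/*
 * Sketch (hypothetical protocol "foo", not part of this file): the sock_no_*
 * stubs fill the proto_ops slots a protocol does not implement, so every
 * entry stays non-NULL.
 *
 *	static const struct proto_ops foo_ops = {
 *		.family		= PF_FOO,	(hypothetical family)
 *		.owner		= THIS_MODULE,
 *		.release	= foo_release,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= sock_no_getname,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.sendmsg	= foo_sendmsg,
 *		.recvmsg	= foo_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */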
3052 
3053 /*
3054  *	Default Socket Callbacks
3055  */
3056 
3057 static void sock_def_wakeup(struct sock *sk)
3058 {
3059 	struct socket_wq *wq;
3060 
3061 	rcu_read_lock();
3062 	wq = rcu_dereference(sk->sk_wq);
3063 	if (skwq_has_sleeper(wq))
3064 		wake_up_interruptible_all(&wq->wait);
3065 	rcu_read_unlock();
3066 }
3067 
3068 static void sock_def_error_report(struct sock *sk)
3069 {
3070 	struct socket_wq *wq;
3071 
3072 	rcu_read_lock();
3073 	wq = rcu_dereference(sk->sk_wq);
3074 	if (skwq_has_sleeper(wq))
3075 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3076 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3077 	rcu_read_unlock();
3078 }
3079 
3080 void sock_def_readable(struct sock *sk)
3081 {
3082 	struct socket_wq *wq;
3083 
3084 	rcu_read_lock();
3085 	wq = rcu_dereference(sk->sk_wq);
3086 
3087 	if (skwq_has_sleeper(wq)) {
3088 		int done = 0;
3089 
3090 		trace_android_vh_do_wake_up_sync(&wq->wait, &done);
3091 		if (done)
3092 			goto out;
3093 
3094 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3095 						EPOLLRDNORM | EPOLLRDBAND);
3096 	}
3097 
3098 out:
3099 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3100 	rcu_read_unlock();
3101 }
3102 
3103 static void sock_def_write_space(struct sock *sk)
3104 {
3105 	struct socket_wq *wq;
3106 
3107 	rcu_read_lock();
3108 
3109 	/* Do not wake up a writer until he can make "significant"
3110 	 * progress.  --DaveM
3111 	 */
3112 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3113 		wq = rcu_dereference(sk->sk_wq);
3114 		if (skwq_has_sleeper(wq))
3115 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3116 						EPOLLWRNORM | EPOLLWRBAND);
3117 
3118 		/* Should agree with poll, otherwise some programs break */
3119 		if (sock_writeable(sk))
3120 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3121 	}
3122 
3123 	rcu_read_unlock();
3124 }
3125 
3126 static void sock_def_destruct(struct sock *sk)
3127 {
3128 }
3129 
3130 void sk_send_sigurg(struct sock *sk)
3131 {
3132 	if (sk->sk_socket && sk->sk_socket->file)
3133 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3134 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3135 }
3136 EXPORT_SYMBOL(sk_send_sigurg);
3137 
3138 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3139 		    unsigned long expires)
3140 {
3141 	if (!mod_timer(timer, expires))
3142 		sock_hold(sk);
3143 }
3144 EXPORT_SYMBOL(sk_reset_timer);
3145 
3146 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3147 {
3148 	if (del_timer(timer))
3149 		__sock_put(sk);
3150 }
3151 EXPORT_SYMBOL(sk_stop_timer);
3152 
3153 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3154 {
3155 	if (del_timer_sync(timer))
3156 		__sock_put(sk);
3157 }
3158 EXPORT_SYMBOL(sk_stop_timer_sync);
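/*
 * Reference-counting contract (a sketch, not part of this file): arming a
 * socket timer takes a reference unless it was already pending, and stopping
 * it drops that reference only if the timer had not fired yet.
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);  (sock_hold() if newly armed)
 *	...
 *	sk_stop_timer_sync(sk, &sk->sk_timer);            (__sock_put() if still pending)
 */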
3159 
3160 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3161 {
3162 	sk_init_common(sk);
3163 	sk->sk_send_head	=	NULL;
3164 
3165 	timer_setup(&sk->sk_timer, NULL, 0);
3166 
3167 	sk->sk_allocation	=	GFP_KERNEL;
3168 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3169 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3170 	sk->sk_state		=	TCP_CLOSE;
3171 	sk_set_socket(sk, sock);
3172 
3173 	sock_set_flag(sk, SOCK_ZAPPED);
3174 
3175 	if (sock) {
3176 		sk->sk_type	=	sock->type;
3177 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3178 		sock->sk	=	sk;
3179 	} else {
3180 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3181 	}
3182 	sk->sk_uid	=	uid;
3183 
3184 	rwlock_init(&sk->sk_callback_lock);
3185 	if (sk->sk_kern_sock)
3186 		lockdep_set_class_and_name(
3187 			&sk->sk_callback_lock,
3188 			af_kern_callback_keys + sk->sk_family,
3189 			af_family_kern_clock_key_strings[sk->sk_family]);
3190 	else
3191 		lockdep_set_class_and_name(
3192 			&sk->sk_callback_lock,
3193 			af_callback_keys + sk->sk_family,
3194 			af_family_clock_key_strings[sk->sk_family]);
3195 
3196 	sk->sk_state_change	=	sock_def_wakeup;
3197 	sk->sk_data_ready	=	sock_def_readable;
3198 	sk->sk_write_space	=	sock_def_write_space;
3199 	sk->sk_error_report	=	sock_def_error_report;
3200 	sk->sk_destruct		=	sock_def_destruct;
3201 
3202 	sk->sk_frag.page	=	NULL;
3203 	sk->sk_frag.offset	=	0;
3204 	sk->sk_peek_off		=	-1;
3205 
3206 	sk->sk_peer_pid 	=	NULL;
3207 	sk->sk_peer_cred	=	NULL;
3208 	spin_lock_init(&sk->sk_peer_lock);
3209 
3210 	sk->sk_write_pending	=	0;
3211 	sk->sk_rcvlowat		=	1;
3212 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3213 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3214 
3215 	sk->sk_stamp = SK_DEFAULT_STAMP;
3216 #if BITS_PER_LONG==32
3217 	seqlock_init(&sk->sk_stamp_seq);
3218 #endif
3219 	atomic_set(&sk->sk_zckey, 0);
3220 
3221 #ifdef CONFIG_NET_RX_BUSY_POLL
3222 	sk->sk_napi_id		=	0;
3223 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3224 #endif
3225 
3226 	sk->sk_max_pacing_rate = ~0UL;
3227 	sk->sk_pacing_rate = ~0UL;
3228 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3229 	sk->sk_incoming_cpu = -1;
3230 
3231 	sk_rx_queue_clear(sk);
3232 	/*
3233 	 * Before updating sk_refcnt, we must commit prior changes to memory
3234 	 * (Documentation/RCU/rculist_nulls.rst for details)
3235 	 */
3236 	smp_wmb();
3237 	refcount_set(&sk->sk_refcnt, 1);
3238 	atomic_set(&sk->sk_drops, 0);
3239 }
3240 EXPORT_SYMBOL(sock_init_data_uid);
3241 
3242 void sock_init_data(struct socket *sock, struct sock *sk)
3243 {
3244 	kuid_t uid = sock ?
3245 		SOCK_INODE(sock)->i_uid :
3246 		make_kuid(sock_net(sk)->user_ns, 0);
3247 
3248 	sock_init_data_uid(sock, sk, uid);
3249 }
3250 EXPORT_SYMBOL(sock_init_data);
3251 
3252 void lock_sock_nested(struct sock *sk, int subclass)
3253 {
3254 	/* The sk_lock has mutex_lock() semantics here. */
3255 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3256 
3257 	might_sleep();
3258 	spin_lock_bh(&sk->sk_lock.slock);
3259 	if (sk->sk_lock.owned)
3260 		__lock_sock(sk);
3261 	sk->sk_lock.owned = 1;
3262 	spin_unlock_bh(&sk->sk_lock.slock);
3263 }
3264 EXPORT_SYMBOL(lock_sock_nested);
3265 
3266 void release_sock(struct sock *sk)
3267 {
3268 	spin_lock_bh(&sk->sk_lock.slock);
3269 	if (sk->sk_backlog.tail)
3270 		__release_sock(sk);
3271 
3272 	/* Warning : release_cb() might need to release sk ownership,
3273 	 * ie call sock_release_ownership(sk) before us.
3274 	 */
3275 	if (sk->sk_prot->release_cb)
3276 		sk->sk_prot->release_cb(sk);
3277 
3278 	sock_release_ownership(sk);
3279 	if (waitqueue_active(&sk->sk_lock.wq))
3280 		wake_up(&sk->sk_lock.wq);
3281 	spin_unlock_bh(&sk->sk_lock.slock);
3282 }
3283 EXPORT_SYMBOL(release_sock);
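/*
 * Usage sketch (process context, not part of this file): lock_sock() and
 * release_sock() provide the "owned" mutex-like semantics; packets delivered
 * from softirq while the socket is owned land on sk_backlog and are replayed
 * by __release_sock() above.
 *
 *	lock_sock(sk);
 *	... modify protocol state ...
 *	release_sock(sk);
 */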
3284 
3285 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3286 {
3287 	might_sleep();
3288 	spin_lock_bh(&sk->sk_lock.slock);
3289 
3290 	if (!sk->sk_lock.owned) {
3291 		/*
3292 		 * Fast path return with bottom halves disabled and
3293 		 * sock::sk_lock.slock held.
3294 		 *
3295 		 * The 'mutex' is not contended and holding
3296 		 * sock::sk_lock.slock prevents all other lockers to
3297 		 * proceed so the corresponding unlock_sock_fast() can
3298 		 * avoid the slow path of release_sock() completely and
3299 		 * just release slock.
3300 		 *
3301 		 * From a semantical POV this is equivalent to 'acquiring'
3302 		 * the 'mutex', hence the corresponding lockdep
3303 		 * mutex_release() has to happen in the fast path of
3304 		 * unlock_sock_fast().
3305 		 */
3306 		return false;
3307 	}
3308 
3309 	__lock_sock(sk);
3310 	sk->sk_lock.owned = 1;
3311 	__acquire(&sk->sk_lock.slock);
3312 	spin_unlock_bh(&sk->sk_lock.slock);
3313 	return true;
3314 }
3315 EXPORT_SYMBOL(__lock_sock_fast);
3316 
3317 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3318 		   bool timeval, bool time32)
3319 {
3320 	struct sock *sk = sock->sk;
3321 	struct timespec64 ts;
3322 
3323 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3324 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3325 	if (ts.tv_sec == -1)
3326 		return -ENOENT;
3327 	if (ts.tv_sec == 0) {
3328 		ktime_t kt = ktime_get_real();
3329 		sock_write_timestamp(sk, kt);
3330 		ts = ktime_to_timespec64(kt);
3331 	}
3332 
3333 	if (timeval)
3334 		ts.tv_nsec /= 1000;
3335 
3336 #ifdef CONFIG_COMPAT_32BIT_TIME
3337 	if (time32)
3338 		return put_old_timespec32(&ts, userstamp);
3339 #endif
3340 #ifdef CONFIG_SPARC64
3341 	/* beware of padding in sparc64 timeval */
3342 	if (timeval && !in_compat_syscall()) {
3343 		struct __kernel_old_timeval __user tv = {
3344 			.tv_sec = ts.tv_sec,
3345 			.tv_usec = ts.tv_nsec,
3346 		};
3347 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3348 			return -EFAULT;
3349 		return 0;
3350 	}
3351 #endif
3352 	return put_timespec64(&ts, userstamp);
3353 }
3354 EXPORT_SYMBOL(sock_gettstamp);
3355 
3356 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3357 {
3358 	if (!sock_flag(sk, flag)) {
3359 		unsigned long previous_flags = sk->sk_flags;
3360 
3361 		sock_set_flag(sk, flag);
3362 		/*
3363 		 * we just set one of the two flags which require net
3364 		 * time stamping, but time stamping might have been on
3365 		 * already because of the other one
3366 		 */
3367 		if (sock_needs_netstamp(sk) &&
3368 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3369 			net_enable_timestamp();
3370 	}
3371 }
3372 
3373 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3374 		       int level, int type)
3375 {
3376 	struct sock_exterr_skb *serr;
3377 	struct sk_buff *skb;
3378 	int copied, err;
3379 
3380 	err = -EAGAIN;
3381 	skb = sock_dequeue_err_skb(sk);
3382 	if (skb == NULL)
3383 		goto out;
3384 
3385 	copied = skb->len;
3386 	if (copied > len) {
3387 		msg->msg_flags |= MSG_TRUNC;
3388 		copied = len;
3389 	}
3390 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3391 	if (err)
3392 		goto out_free_skb;
3393 
3394 	sock_recv_timestamp(msg, sk, skb);
3395 
3396 	serr = SKB_EXT_ERR(skb);
3397 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3398 
3399 	msg->msg_flags |= MSG_ERRQUEUE;
3400 	err = copied;
3401 
3402 out_free_skb:
3403 	kfree_skb(skb);
3404 out:
3405 	return err;
3406 }
3407 EXPORT_SYMBOL(sock_recv_errqueue);
3408 
3409 /*
3410  *	Get a socket option on a socket.
3411  *
3412  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3413  *	asynchronous errors should be reported by getsockopt. We assume
3414  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3415  */
3416 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3417 			   char __user *optval, int __user *optlen)
3418 {
3419 	struct sock *sk = sock->sk;
3420 
3421 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3422 }
3423 EXPORT_SYMBOL(sock_common_getsockopt);
3424 
3425 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3426 			int flags)
3427 {
3428 	struct sock *sk = sock->sk;
3429 	int addr_len = 0;
3430 	int err;
3431 
3432 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3433 				   flags & ~MSG_DONTWAIT, &addr_len);
3434 	if (err >= 0)
3435 		msg->msg_namelen = addr_len;
3436 	return err;
3437 }
3438 EXPORT_SYMBOL(sock_common_recvmsg);
3439 
3440 /*
3441  *	Set socket options on an inet socket.
3442  */
3443 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3444 			   sockptr_t optval, unsigned int optlen)
3445 {
3446 	struct sock *sk = sock->sk;
3447 
3448 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3449 }
3450 EXPORT_SYMBOL(sock_common_setsockopt);
3451 
3452 void sk_common_release(struct sock *sk)
3453 {
3454 	if (sk->sk_prot->destroy)
3455 		sk->sk_prot->destroy(sk);
3456 
3457 	/*
3458 	 * Observation: when sk_common_release is called, processes have
3459 	 * no access to the socket. But the net still does.
3460 	 * Step one, detach it from networking:
3461 	 *
3462 	 * A. Remove from hash tables.
3463 	 */
3464 
3465 	sk->sk_prot->unhash(sk);
3466 
3467 	/*
3468 	 * At this point the socket cannot receive new packets, but it is possible
3469 	 * that some packets are in flight because some CPU runs the receiver and
3470 	 * did the hash table lookup before we unhashed the socket. They will reach
3471 	 * the receive queue and will be purged by the socket destructor.
3472 	 *
3473 	 * Also we still have packets pending on the receive queue and probably
3474 	 * our own packets waiting in device queues. sock_destroy will drain the
3475 	 * receive queue, but transmitted packets will delay socket destruction
3476 	 * until the last reference is released.
3477 	 */
3478 
3479 	sock_orphan(sk);
3480 
3481 	xfrm_sk_free_policy(sk);
3482 
3483 	sk_refcnt_debug_release(sk);
3484 
3485 	sock_put(sk);
3486 }
3487 EXPORT_SYMBOL(sk_common_release);
3488 
3489 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3490 {
3491 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3492 
3493 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3494 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3495 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3496 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3497 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3498 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3499 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3500 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3501 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3502 }
3503 
3504 #ifdef CONFIG_PROC_FS
3505 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3506 struct prot_inuse {
3507 	int val[PROTO_INUSE_NR];
3508 };
3509 
3510 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3511 
3512 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3513 {
3514 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3515 }
3516 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3517 
3518 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3519 {
3520 	int cpu, idx = prot->inuse_idx;
3521 	int res = 0;
3522 
3523 	for_each_possible_cpu(cpu)
3524 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3525 
3526 	return res >= 0 ? res : 0;
3527 }
3528 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3529 
3530 static void sock_inuse_add(struct net *net, int val)
3531 {
3532 	this_cpu_add(*net->core.sock_inuse, val);
3533 }
3534 
3535 int sock_inuse_get(struct net *net)
3536 {
3537 	int cpu, res = 0;
3538 
3539 	for_each_possible_cpu(cpu)
3540 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3541 
3542 	return res;
3543 }
3544 
3545 EXPORT_SYMBOL_GPL(sock_inuse_get);
3546 
3547 static int __net_init sock_inuse_init_net(struct net *net)
3548 {
3549 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3550 	if (net->core.prot_inuse == NULL)
3551 		return -ENOMEM;
3552 
3553 	net->core.sock_inuse = alloc_percpu(int);
3554 	if (net->core.sock_inuse == NULL)
3555 		goto out;
3556 
3557 	return 0;
3558 
3559 out:
3560 	free_percpu(net->core.prot_inuse);
3561 	return -ENOMEM;
3562 }
3563 
3564 static void __net_exit sock_inuse_exit_net(struct net *net)
3565 {
3566 	free_percpu(net->core.prot_inuse);
3567 	free_percpu(net->core.sock_inuse);
3568 }
3569 
3570 static struct pernet_operations net_inuse_ops = {
3571 	.init = sock_inuse_init_net,
3572 	.exit = sock_inuse_exit_net,
3573 };
3574 
3575 static __init int net_inuse_init(void)
3576 {
3577 	if (register_pernet_subsys(&net_inuse_ops))
3578 		panic("Cannot initialize net inuse counters");
3579 
3580 	return 0;
3581 }
3582 
3583 core_initcall(net_inuse_init);
3584 
3585 static int assign_proto_idx(struct proto *prot)
3586 {
3587 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3588 
3589 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3590 		pr_err("PROTO_INUSE_NR exhausted\n");
3591 		return -ENOSPC;
3592 	}
3593 
3594 	set_bit(prot->inuse_idx, proto_inuse_idx);
3595 	return 0;
3596 }
3597 
3598 static void release_proto_idx(struct proto *prot)
3599 {
3600 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3601 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3602 }
3603 #else
3604 static inline int assign_proto_idx(struct proto *prot)
3605 {
3606 	return 0;
3607 }
3608 
3609 static inline void release_proto_idx(struct proto *prot)
3610 {
3611 }
3612 
3613 static void sock_inuse_add(struct net *net, int val)
3614 {
3615 }
3616 #endif
3617 
3618 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3619 {
3620 	if (!twsk_prot)
3621 		return;
3622 	kfree(twsk_prot->twsk_slab_name);
3623 	twsk_prot->twsk_slab_name = NULL;
3624 	kmem_cache_destroy(twsk_prot->twsk_slab);
3625 	twsk_prot->twsk_slab = NULL;
3626 }
3627 
3628 static int tw_prot_init(const struct proto *prot)
3629 {
3630 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3631 
3632 	if (!twsk_prot)
3633 		return 0;
3634 
3635 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3636 					      prot->name);
3637 	if (!twsk_prot->twsk_slab_name)
3638 		return -ENOMEM;
3639 
3640 	twsk_prot->twsk_slab =
3641 		kmem_cache_create(twsk_prot->twsk_slab_name,
3642 				  twsk_prot->twsk_obj_size, 0,
3643 				  SLAB_ACCOUNT | prot->slab_flags,
3644 				  NULL);
3645 	if (!twsk_prot->twsk_slab) {
3646 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3647 			prot->name);
3648 		return -ENOMEM;
3649 	}
3650 
3651 	return 0;
3652 }
3653 
3654 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3655 {
3656 	if (!rsk_prot)
3657 		return;
3658 	kfree(rsk_prot->slab_name);
3659 	rsk_prot->slab_name = NULL;
3660 	kmem_cache_destroy(rsk_prot->slab);
3661 	rsk_prot->slab = NULL;
3662 }
3663 
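/*
 * Create the "request_sock_<proto>" slab cache for this protocol's
 * connection-request minisockets, mirroring tw_prot_init() above.
 */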
3664 static int req_prot_init(const struct proto *prot)
3665 {
3666 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3667 
3668 	if (!rsk_prot)
3669 		return 0;
3670 
3671 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3672 					prot->name);
3673 	if (!rsk_prot->slab_name)
3674 		return -ENOMEM;
3675 
3676 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3677 					   rsk_prot->obj_size, 0,
3678 					   SLAB_ACCOUNT | prot->slab_flags,
3679 					   NULL);
3680 
3681 	if (!rsk_prot->slab) {
3682 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3683 			prot->name);
3684 		return -ENOMEM;
3685 	}
3686 	return 0;
3687 }
3688 
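/*
 * Register a protocol with the networking core: optionally create its
 * sock, request_sock and timewait_sock slab caches, reserve an "inuse"
 * counter slot and link it onto proto_list.  Every error path unwinds
 * whatever was set up before the failure.
 */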
3689 int proto_register(struct proto *prot, int alloc_slab)
3690 {
3691 	int ret = -ENOBUFS;
3692 
3693 	if (alloc_slab) {
3694 		prot->slab = kmem_cache_create_usercopy(prot->name,
3695 					prot->obj_size, 0,
3696 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3697 					prot->slab_flags,
3698 					prot->useroffset, prot->usersize,
3699 					NULL);
3700 
3701 		if (prot->slab == NULL) {
3702 			pr_crit("%s: Can't create sock SLAB cache!\n",
3703 				prot->name);
3704 			goto out;
3705 		}
3706 
3707 		if (req_prot_init(prot))
3708 			goto out_free_request_sock_slab;
3709 
3710 		if (tw_prot_init(prot))
3711 			goto out_free_timewait_sock_slab;
3712 	}
3713 
3714 	mutex_lock(&proto_list_mutex);
3715 	ret = assign_proto_idx(prot);
3716 	if (ret) {
3717 		mutex_unlock(&proto_list_mutex);
3718 		goto out_free_timewait_sock_slab;
3719 	}
3720 	list_add(&prot->node, &proto_list);
3721 	mutex_unlock(&proto_list_mutex);
3722 	return ret;
3723 
3724 out_free_timewait_sock_slab:
3725 	if (alloc_slab)
3726 		tw_prot_cleanup(prot->twsk_prot);
3727 out_free_request_sock_slab:
3728 	if (alloc_slab) {
3729 		req_prot_cleanup(prot->rsk_prot);
3730 
3731 		kmem_cache_destroy(prot->slab);
3732 		prot->slab = NULL;
3733 	}
3734 out:
3735 	return ret;
3736 }
3737 EXPORT_SYMBOL(proto_register);
3738 
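/*
 * Reverse of proto_register(): unlink the protocol from proto_list,
 * release its "inuse" slot and destroy the slab caches created at
 * registration time.
 */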
3739 void proto_unregister(struct proto *prot)
3740 {
3741 	mutex_lock(&proto_list_mutex);
3742 	release_proto_idx(prot);
3743 	list_del(&prot->node);
3744 	mutex_unlock(&proto_list_mutex);
3745 
3746 	kmem_cache_destroy(prot->slab);
3747 	prot->slab = NULL;
3748 
3749 	req_prot_cleanup(prot->rsk_prot);
3750 	tw_prot_cleanup(prot->twsk_prot);
3751 }
3752 EXPORT_SYMBOL(proto_unregister);
3753 
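/*
 * Request the NETLINK_SOCK_DIAG handler module for a family/protocol
 * pair.  With protocol == 0 the family itself must be registered; for
 * AF_INET a nonzero protocol (other than IPPROTO_RAW) must have an
 * inet protocol handler before the module is requested.
 */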
3754 int sock_load_diag_module(int family, int protocol)
3755 {
3756 	if (!protocol) {
3757 		if (!sock_is_registered(family))
3758 			return -ENOENT;
3759 
3760 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3761 				      NETLINK_SOCK_DIAG, family);
3762 	}
3763 
3764 #ifdef CONFIG_INET
3765 	if (family == AF_INET &&
3766 	    protocol != IPPROTO_RAW &&
3767 	    protocol < MAX_INET_PROTOS &&
3768 	    !rcu_access_pointer(inet_protos[protocol]))
3769 		return -ENOENT;
3770 #endif
3771 
3772 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3773 			      NETLINK_SOCK_DIAG, family, protocol);
3774 }
3775 EXPORT_SYMBOL(sock_load_diag_module);
3776 
3777 #ifdef CONFIG_PROC_FS
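/*
 * /proc/net/protocols walks proto_list under proto_list_mutex: the
 * lock is taken in ->start() and dropped in ->stop(), serializing the
 * dump against protocol (un)registration.
 */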
3778 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3779 	__acquires(proto_list_mutex)
3780 {
3781 	mutex_lock(&proto_list_mutex);
3782 	return seq_list_start_head(&proto_list, *pos);
3783 }
3784 
3785 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3786 {
3787 	return seq_list_next(v, &proto_list, pos);
3788 }
3789 
3790 static void proto_seq_stop(struct seq_file *seq, void *v)
3791 	__releases(proto_list_mutex)
3792 {
3793 	mutex_unlock(&proto_list_mutex);
3794 }
3795 
3796 static char proto_method_implemented(const void *method)
3797 {
3798 	return method == NULL ? 'n' : 'y';
3799 }
3800 static long sock_prot_memory_allocated(struct proto *proto)
3801 {
3802 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3803 }
3804 
3805 static const char *sock_prot_memory_pressure(struct proto *proto)
3806 {
3807 	return proto->memory_pressure != NULL ?
3808 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3809 }
3810 
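/*
 * Emit one /proc/net/protocols row: name, object size, per-netns
 * socket count, memory and pressure state, max header size, slab and
 * owning module info, plus a y/n flag for each optional proto method.
 */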
3811 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3812 {
3813 
3814 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3815 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3816 		   proto->name,
3817 		   proto->obj_size,
3818 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3819 		   sock_prot_memory_allocated(proto),
3820 		   sock_prot_memory_pressure(proto),
3821 		   proto->max_header,
3822 		   proto->slab == NULL ? "no" : "yes",
3823 		   module_name(proto->owner),
3824 		   proto_method_implemented(proto->close),
3825 		   proto_method_implemented(proto->connect),
3826 		   proto_method_implemented(proto->disconnect),
3827 		   proto_method_implemented(proto->accept),
3828 		   proto_method_implemented(proto->ioctl),
3829 		   proto_method_implemented(proto->init),
3830 		   proto_method_implemented(proto->destroy),
3831 		   proto_method_implemented(proto->shutdown),
3832 		   proto_method_implemented(proto->setsockopt),
3833 		   proto_method_implemented(proto->getsockopt),
3834 		   proto_method_implemented(proto->sendmsg),
3835 		   proto_method_implemented(proto->recvmsg),
3836 		   proto_method_implemented(proto->sendpage),
3837 		   proto_method_implemented(proto->bind),
3838 		   proto_method_implemented(proto->backlog_rcv),
3839 		   proto_method_implemented(proto->hash),
3840 		   proto_method_implemented(proto->unhash),
3841 		   proto_method_implemented(proto->get_port),
3842 		   proto_method_implemented(proto->enter_memory_pressure));
3843 }
3844 
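/*
 * Print either the column header (when positioned on the list head)
 * or a single protocol row.
 */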
3845 static int proto_seq_show(struct seq_file *seq, void *v)
3846 {
3847 	if (v == &proto_list)
3848 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3849 			   "protocol",
3850 			   "size",
3851 			   "sockets",
3852 			   "memory",
3853 			   "press",
3854 			   "maxhdr",
3855 			   "slab",
3856 			   "module",
3857 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3858 	else
3859 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3860 	return 0;
3861 }
3862 
3863 static const struct seq_operations proto_seq_ops = {
3864 	.start  = proto_seq_start,
3865 	.next   = proto_seq_next,
3866 	.stop   = proto_seq_stop,
3867 	.show   = proto_seq_show,
3868 };
3869 
3870 static __net_init int proto_init_net(struct net *net)
3871 {
3872 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3873 			sizeof(struct seq_net_private)))
3874 		return -ENOMEM;
3875 
3876 	return 0;
3877 }
3878 
3879 static __net_exit void proto_exit_net(struct net *net)
3880 {
3881 	remove_proc_entry("protocols", net->proc_net);
3882 }
3883 
3884 
3885 static __net_initdata struct pernet_operations proto_net_ops = {
3886 	.init = proto_init_net,
3887 	.exit = proto_exit_net,
3888 };
3889 
3890 static int __init proto_init(void)
3891 {
3892 	return register_pernet_subsys(&proto_net_ops);
3893 }
3894 
3895 subsys_initcall(proto_init);
3896 
3897 #endif /* PROC_FS */
3898 
3899 #ifdef CONFIG_NET_RX_BUSY_POLL
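/*
 * Busy-poll termination check: stop once the receive queue has data
 * or the busy-poll time budget for this socket has expired.
 */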
3900 bool sk_busy_loop_end(void *p, unsigned long start_time)
3901 {
3902 	struct sock *sk = p;
3903 
3904 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3905 	       sk_busy_loop_timeout(sk, start_time);
3906 }
3907 EXPORT_SYMBOL(sk_busy_loop_end);
3908 #endif /* CONFIG_NET_RX_BUSY_POLL */
3909 
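/*
 * Bind an additional address to a socket for protocols that provide a
 * ->bind_add() handler; all others get -EOPNOTSUPP.
 */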
3910 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3911 {
3912 	if (!sk->sk_prot->bind_add)
3913 		return -EOPNOTSUPP;
3914 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3915 }
3916 EXPORT_SYMBOL(sock_bind_add);
3917